Learn how to add automated quality assessment to your AI applications using LLM-based evaluation. The judge works in two phases:
1. Define your evaluation structure and criteria
2. Use the judge to assess AI responses
1. Install Dependencies
```bash
pip install pixeltable openai
```
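Both the response generation and the judge call OpenAI, so the client needs an API key. A minimal setup sketch, assuming the key is read from the standard `OPENAI_API_KEY` environment variable; the placeholder value is illustrative:

```python
import os

# Assumption: the OpenAI client used by Pixeltable reads the key from the
# standard OPENAI_API_KEY environment variable. Replace the placeholder.
os.environ["OPENAI_API_KEY"] = "your-api-key-here"
```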
2. Define Your Evaluation Structure
Create table.py:
```python
import pixeltable as pxt
from pixeltable.functions import openai

# Initialize app structure
pxt.drop_dir("evaluations", force=True)
pxt.create_dir("evaluations")

# Define data schema with evaluation criteria
conversations = pxt.create_table(
    "evaluations.conversations",
    {
        "prompt": pxt.String,
        "expected_criteria": pxt.String,
    },
    if_exists="ignore",
)

# Configure processing workflow
conversations.add_computed_column(
    messages=[{"role": "user", "content": conversations.prompt}]
)
conversations.add_computed_column(
    response=openai.chat_completions(
        messages=conversations.messages,
        model="gpt-4o-mini",
    )
)
conversations.add_computed_column(
    answer=conversations.response.choices[0].message.content
)

# Add judge evaluation workflow
judge_prompt_template = """You are an expert judge evaluating AI responses. Your task is to evaluate the following response based on the given criteria.

Original Prompt: {prompt}

Expected Criteria: {criteria}

AI Response: {response}

Please evaluate the response on a scale of 1-10 and provide a brief explanation.
Format your response as:
Score: [1-10]
Explanation: [Your explanation]"""

# Build the judge prompt with a UDF so each row's values are substituted
@pxt.udf
def make_judge_prompt(prompt: str, criteria: str, response: str) -> str:
    return judge_prompt_template.format(
        prompt=prompt, criteria=criteria, response=response
    )

conversations.add_computed_column(
    judge_prompt=make_judge_prompt(
        conversations.prompt,
        conversations.expected_criteria,
        conversations.answer,
    )
)
conversations.add_computed_column(
    judge_response=openai.chat_completions(
        messages=[
            {
                "role": "system",
                "content": "You are an expert judge evaluating AI responses.",
            },
            {"role": "user", "content": conversations.judge_prompt},
        ],
        model="gpt-4o-mini",
    )
)
conversations.add_computed_column(
    evaluation=conversations.judge_response.choices[0].message.content
)

# Add score extraction
@pxt.udf
def extract_score(evaluation: str) -> float:
    try:
        score_line = [
            line for line in evaluation.split('\n') if line.startswith('Score:')
        ][0]
        return float(score_line.split(':')[1].strip())
    except (IndexError, ValueError):
        return 0.0

conversations.add_computed_column(
    score=extract_score(conversations.evaluation)
)
```
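Before wiring up the full test harness, it can help to confirm that the computed-column chain (response, answer, judge response, evaluation, score) populates end to end. A minimal sanity check, assuming `table.py` has been run; the prompt and criteria below are illustrative only:

```python
import pixeltable as pxt

conversations = pxt.get_table("evaluations.conversations")

# Inserting a row triggers every computed column defined in table.py:
# the model answer, the judge prompt, the judge's evaluation, and the score.
conversations.insert([{
    "prompt": "Summarize photosynthesis in one sentence.",
    "expected_criteria": "One sentence, factually accurate, plain language.",
}])

# Inspect the judge's verdict for the row we just added.
print(conversations.select(conversations.answer, conversations.score).collect())
```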
3. Use Your Judge
Create app.py:
```python
import pixeltable as pxt

def run_evaluation():
    # Connect to your app
    conversations = pxt.get_table("evaluations.conversations")

    # Example prompts with evaluation criteria
    test_cases = [
        {
            "prompt": "Write a haiku about dogs.",
            "expected_criteria": """
                The response should:
                1) Follow 5-7-5 syllable pattern
                2) Be about dogs
                3) Use vivid imagery
            """
        },
        {
            "prompt": "Explain quantum computing to a 10-year-old.",
            "expected_criteria": """
                The response should:
                1) Use age-appropriate language
                2) Use relevant analogies
                3) Be engaging and clear
            """
        }
    ]

    # Insert test cases
    conversations.insert(test_cases)

    # Get results with evaluations
    results = conversations.select(
        conversations.prompt,
        conversations.answer,
        conversations.evaluation,
        conversations.score
    ).collect().to_pandas()

    # Print results
    for idx, row in results.iterrows():
        print(f"\nTest Case {idx + 1}")
        print("=" * 50)
        print(f"Prompt: {row['prompt']}")
        print(f"\nResponse: {row['answer']}")
        print(f"\nEvaluation:\n{row['evaluation']}")
        print(f"Score: {row['score']}")
        print("=" * 50)

if __name__ == "__main__":
    run_evaluation()
```
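Because every evaluation lands in the same table, the judge's scores can also be rolled up into a single quality metric. A small follow-on sketch, assuming `app.py` has already inserted and scored the test cases; it reuses the `to_pandas()` conversion shown above:

```python
import pixeltable as pxt

def summarize_scores() -> None:
    conversations = pxt.get_table("evaluations.conversations")

    # Pull prompts and scores into pandas for quick aggregation.
    df = conversations.select(
        conversations.prompt,
        conversations.score,
    ).collect().to_pandas()

    if df.empty:
        print("No evaluations found; run app.py first.")
        return

    print(f"Evaluated {len(df)} prompts")
    print(f"Average judge score: {df['score'].mean():.2f} / 10")
    print(f"Lowest-scoring prompt: {df.loc[df['score'].idxmin(), 'prompt']}")

if __name__ == "__main__":
    summarize_scores()
```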