Evaluation & Testing
Agent Benchmarks
Measure agent performance systematically.
Creating Test Suites
from dataclasses import dataclass
from typing import List, Optional
import time
@dataclass
class TestCase:
"""Single test case"""
name: str
input: str
expected_output: Optional[str] = None
expected_behavior: Optional[str] = None
timeout: int = 30
@dataclass
class TestResult:
"""Test result"""
test_name: str
passed: bool
actual_output: str
expected_output: str
execution_time: float
error: Optional[str] = None
class AgentTestSuite:
"""Test suite for agents"""
def __init__(self, agent):
self.agent = agent
self.test_cases = []
self.results = []
def add_test(self, test_case: TestCase):
"""Add test case"""
self.test_cases.append(test_case)
def run_tests(self) -> dict:
"""Run all tests"""
self.results = []
for test in self.test_cases:
print(f"Running: {test.name}...")
result = self.run_single_test(test)
self.results.append(result)
return self.generate_report()
def run_single_test(self, test: TestCase) -> TestResult:
"""Run single test"""
start_time = time.time()
try:
# Execute agent
actual_output = self.agent.process(test.input)
execution_time = time.time() - start_time
# Check result
if test.expected_output:
passed = self.check_output_match(actual_output, test.expected_output)
elif test.expected_behavior:
passed = self.check_behavior(actual_output, test.expected_behavior)
else:
passed = True # Just check it doesn't crash
return TestResult(
test_name=test.name,
passed=passed,
actual_output=actual_output,
expected_output=test.expected_output or test.expected_behavior,
execution_time=execution_time
)
except Exception as e:
execution_time = time.time() - start_time
return TestResult(
test_name=test.name,
passed=False,
actual_output="",
expected_output=test.expected_output or test.expected_behavior,
execution_time=execution_time,
error=str(e)
)
def check_output_match(self, actual: str, expected: str) -> bool:
"""Check if output matches expected"""
# Exact match
if actual.strip() == expected.strip():
return True
# Contains expected
if expected.lower() in actual.lower():
return True
return False
def check_behavior(self, output: str, behavior: str) -> bool:
"""Check if output exhibits expected behavior"""
# Use an LLM as judge; `llm` is assumed to be an LLM client available
# in scope (any text-generation API works here).
prompt = f"""Does this output exhibit the expected behavior?
Output: {output}
Expected behavior: {behavior}
Answer with just 'yes' or 'no':"""
response = llm.generate(prompt).strip().lower()
# Accept "yes", "yes.", and similar phrasings
return response.startswith('yes')
def generate_report(self) -> dict:
"""Generate test report"""
total = len(self.results)
passed = sum(1 for r in self.results if r.passed)
failed = total - passed
avg_time = sum(r.execution_time for r in self.results) / total if total > 0 else 0
return {
"total": total,
"passed": passed,
"failed": failed,
"pass_rate": passed / total if total > 0 else 0,
"avg_execution_time": avg_time,
"results": self.results
}
# Usage
suite = AgentTestSuite(agent)
suite.add_test(TestCase(
name="Basic math",
input="What is 2 + 2?",
expected_output="4"
))
suite.add_test(TestCase(
name="Tool usage",
input="Search for information about Python",
expected_behavior="Uses search tool and provides relevant information"
))
report = suite.run_tests()
print(f"Pass rate: {report['pass_rate']:.1%}")
Standard Benchmarks
class StandardBenchmarks:
"""Common agent benchmarks"""
@staticmethod
def get_math_benchmark() -> List[TestCase]:
"""Math reasoning tests"""
return [
TestCase("Addition", "What is 123 + 456?", "579"),
TestCase("Multiplication", "What is 25 * 17?", "425"),
TestCase("Word problem", "If I have 3 apples and buy 2 more, how many do I have?", "5"),
TestCase("Percentage", "What is 15% of 200?", "30"),
]
@staticmethod
def get_reasoning_benchmark() -> List[TestCase]:
"""Logical reasoning tests"""
return [
TestCase(
"Deduction",
"All cats are animals. Fluffy is a cat. Is Fluffy an animal?",
expected_behavior="Correctly deduces that Fluffy is an animal"
),
TestCase(
"Planning",
"I need to make dinner. What steps should I take?",
expected_behavior="Provides logical sequence of steps"
),
]
@staticmethod
def get_tool_usage_benchmark() -> List[TestCase]:
"""Tool usage tests"""
return [
TestCase(
"Search",
"Find information about the Eiffel Tower",
expected_behavior="Uses search tool and provides facts"
),
TestCase(
"Calculation",
"Calculate the compound interest on $1000 at 5% for 3 years",
expected_behavior="Uses calculator tool"
),
]
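These benchmark sets plug straight into the test suite defined earlier. A minimal sketch, reusing the `agent` instance from the usage example above:
# Run the math benchmark through the generic test suite
math_suite = AgentTestSuite(agent)
for test in StandardBenchmarks.get_math_benchmark():
    math_suite.add_test(test)
math_report = math_suite.run_tests()
print(f"Math benchmark pass rate: {math_report['pass_rate']:.1%}")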
Success Metrics
Define what success means for your agent.
Quantitative Metrics
class AgentMetrics:
"""Track agent performance metrics"""
def __init__(self):
self.metrics = {
"total_requests": 0,
"successful_requests": 0,
"failed_requests": 0,
"total_execution_time": 0,
"tool_calls": 0,
"tokens_used": 0,
"cost": 0.0
}
def record_request(self,
success: bool,
execution_time: float,
tool_calls: int = 0,
tokens: int = 0,
cost: float = 0.0):
"""Record request metrics"""
self.metrics["total_requests"] += 1
if success:
self.metrics["successful_requests"] += 1
else:
self.metrics["failed_requests"] += 1
self.metrics["total_execution_time"] += execution_time
self.metrics["tool_calls"] += tool_calls
self.metrics["tokens_used"] += tokens
self.metrics["cost"] += cost
def get_summary(self) -> dict:
"""Get metrics summary"""
total = self.metrics["total_requests"]
if total == 0:
return self.metrics
return {
**self.metrics,
"success_rate": self.metrics["successful_requests"] / total,
"avg_execution_time": self.metrics["total_execution_time"] / total,
"avg_tool_calls": self.metrics["tool_calls"] / total,
"avg_tokens": self.metrics["tokens_used"] / total,
"avg_cost": self.metrics["cost"] / total
}
def print_summary(self):
"""Print formatted summary"""
summary = self.get_summary()
print("Agent Performance Metrics")
print("=" * 40)
print(f"Total Requests: {summary['total_requests']}")
print(f"Success Rate: {summary['success_rate']:.1%}")
print(f"Avg Execution Time: {summary['avg_execution_time']:.2f}s")
print(f"Avg Tool Calls: {summary['avg_tool_calls']:.1f}")
print(f"Avg Tokens: {summary['avg_tokens']:.0f}")
print(f"Avg Cost: ${summary['avg_cost']:.4f}")
print(f"Total Cost: ${summary['cost']:.2f}")
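A short usage sketch; the request numbers below are made up purely for illustration:
metrics = AgentMetrics()
metrics.record_request(success=True, execution_time=1.2, tool_calls=2, tokens=850, cost=0.012)
metrics.record_request(success=False, execution_time=4.7, tool_calls=1, tokens=400, cost=0.006)
metrics.print_summary()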
Qualitative Metrics
import openai

class QualityEvaluator:
"""Evaluate response quality"""
def __init__(self):
self.client = openai.OpenAI()
def evaluate_response(self,
question: str,
response: str,
criteria: List[str] = None) -> dict:
"""Evaluate response quality"""
if criteria is None:
criteria = [
"Accuracy: Is the information correct?",
"Completeness: Does it fully answer the question?",
"Clarity: Is it easy to understand?",
"Relevance: Does it stay on topic?"
]
scores = {}
for criterion in criteria:
score = self.score_criterion(question, response, criterion)
criterion_name = criterion.split(':')[0]
scores[criterion_name] = score
return {
"scores": scores,
"average": sum(scores.values()) / len(scores),
"passed": all(score >= 3 for score in scores.values())
}
def score_criterion(self, question: str, response: str, criterion: str) -> int:
"""Score response on single criterion (1-5)"""
prompt = f"""Rate this response on the following criterion (1-5):
Question: {question}
Response: {response}
Criterion: {criterion}
Provide only a number from 1 (poor) to 5 (excellent):"""
result = self.client.chat.completions.create(
model="gpt-4",
messages=[{"role": "user", "content": prompt}],
temperature=0.1
)
try:
score = int(result.choices[0].message.content.strip())
return max(1, min(5, score)) # Clamp to 1-5
except (ValueError, IndexError):
return 3 # Default to middle score if the judge's reply isn't a clean number
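Usage is straightforward; this sketch assumes an OpenAI API key is configured in the environment:
evaluator = QualityEvaluator()
result = evaluator.evaluate_response(
    question="What is the capital of France?",
    response="The capital of France is Paris.",
)
print(result["scores"])
print(f"Average: {result['average']:.1f}, passed: {result['passed']}")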
Unit and Integration Testing
Unit Tests for Components
import unittest
class TestAgentComponents(unittest.TestCase):
"""Unit tests for agent components"""
def setUp(self):
"""Set up test fixtures"""
self.agent = MyAgent()
def test_input_validation(self):
"""Test input validation"""
validator = InputValidator()
# Valid input
result = validator.validate_text_input("Hello world")
self.assertTrue(result['valid'])
# Invalid input (too long)
long_text = "x" * 20000
result = validator.validate_text_input(long_text)
self.assertFalse(result['valid'])
def test_tool_execution(self):
"""Test tool execution"""
result = self.agent.execute_tool("calculate", {"expression": "2 + 2"})
self.assertEqual(result, "4")
def test_memory_storage(self):
"""Test memory system"""
self.agent.memory.add("user_name", "Alice")
retrieved = self.agent.memory.get("user_name")
self.assertEqual(retrieved, "Alice")
def test_error_handling(self):
"""Test error handling"""
# Should not crash on invalid tool
result = self.agent.execute_tool("nonexistent_tool", {})
self.assertIn("error", result.lower())
def tearDown(self):
"""Clean up"""
pass
# Run tests
if __name__ == '__main__':
unittest.main()
Integration Tests
class TestAgentIntegration(unittest.TestCase):
"""Integration tests for full agent"""
def test_end_to_end_query(self):
"""Test complete query flow"""
agent = MyAgent()
response = agent.process("What is 2 + 2?")
self.assertIsNotNone(response)
self.assertIn("4", response)
def test_multi_step_task(self):
"""Test multi-step task execution"""
agent = MyAgent()
response = agent.process("Search for Python tutorials and summarize the top result")
# Should use search tool
self.assertTrue(agent.tool_used("search"))
# Should provide summary
self.assertGreater(len(response), 50)
def test_error_recovery(self):
"""Test error recovery"""
agent = MyAgent()
# Simulate tool failure
def failing_search(query):
    raise RuntimeError("Simulated search failure")
agent.tools["search"] = failing_search
response = agent.process("Search for something")
# Should handle gracefully
self.assertIsNotNone(response)
self.assertNotIn("Traceback", response)
def test_rate_limiting(self):
"""Test rate limiting"""
agent = MyAgent()
# Make many requests
for i in range(150):
response = agent.process(f"Request {i}")
# Should be rate limited
self.assertTrue(agent.was_rate_limited())
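Integration tests that hit a live model are slow, cost money, and can be nondeterministic. One common workaround is to stub the model call with unittest.mock. The sketch below assumes a hypothetical MyAgent.call_llm method; patch whatever entry point your agent actually uses:
from unittest.mock import patch

class TestAgentIntegrationOffline(unittest.TestCase):
    """Same flow as above, but with the model call stubbed out so tests run offline"""
    def test_query_with_mocked_llm(self):
        agent = MyAgent()
        # `call_llm` is a hypothetical method name; patch your agent's real LLM entry point
        with patch.object(MyAgent, "call_llm", return_value="The answer is 4"):
            response = agent.process("What is 2 + 2?")
        self.assertIn("4", response)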
Property-Based Testing
from hypothesis import given, strategies as st
class TestAgentProperties(unittest.TestCase):
"""Property-based tests"""
@given(st.text(min_size=1, max_size=1000))
def test_agent_handles_any_text(self, text):
"""Agent should handle any text input without crashing"""
agent = MyAgent()
try:
response = agent.process(text)
# Should return something
self.assertIsNotNone(response)
except Exception as e:
# Should not crash
self.fail(f"Agent crashed on input: {text[:50]}... Error: {e}")
@given(st.integers(min_value=-1000, max_value=1000))
def test_calculator_tool(self, number):
"""Calculator should handle any integer"""
agent = MyAgent()
result = agent.execute_tool("calculate", {"expression": f"{number} + 1"})
expected = str(number + 1)
self.assertEqual(result, expected)
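When each generated example triggers a real model call, Hypothesis's default settings can make these tests slow and expensive. A sketch with tighter, illustrative bounds:
from hypothesis import given, settings, strategies as st

class TestAgentPropertiesBounded(unittest.TestCase):
    """Property-based tests with a smaller example budget"""
    @settings(max_examples=20, deadline=None)  # fewer examples, no per-example time limit
    @given(st.text(min_size=1, max_size=200))
    def test_agent_handles_short_text(self, text):
        agent = MyAgent()
        self.assertIsNotNone(agent.process(text))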
Human Evaluation Frameworks
Collecting Human Feedback
class HumanEvaluator:
"""Collect human evaluations"""
def __init__(self):
self.evaluations = []
def request_evaluation(self,
question: str,
response: str,
evaluator_id: str) -> dict:
"""Request human evaluation"""
print(f"\n{'='*60}")
print(f"Question: {question}")
print(f"\nResponse: {response}")
print(f"\n{'='*60}")
# Collect ratings
ratings = {}
criteria = [
("accuracy", "Is the response accurate? (1-5)"),
("helpfulness", "Is the response helpful? (1-5)"),
("clarity", "Is the response clear? (1-5)"),
]
for key, prompt in criteria:
while True:
try:
score = int(input(f"{prompt}: "))
if 1 <= score <= 5:
ratings[key] = score
break
except ValueError:
pass
# Collect feedback
feedback = input("\nAdditional feedback (optional): ")
evaluation = {
"question": question,
"response": response,
"evaluator_id": evaluator_id,
"ratings": ratings,
"feedback": feedback,
"timestamp": time.time()
}
self.evaluations.append(evaluation)
return evaluation
def get_summary(self) -> dict:
"""Get evaluation summary"""
if not self.evaluations:
return {}
# Average ratings
avg_ratings = {}
for criterion in ["accuracy", "helpfulness", "clarity"]:
scores = [e["ratings"][criterion] for e in self.evaluations]
avg_ratings[criterion] = sum(scores) / len(scores)
return {
"total_evaluations": len(self.evaluations),
"average_ratings": avg_ratings,
"overall_score": sum(avg_ratings.values()) / len(avg_ratings)
}
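A quick sketch of collecting one evaluation at the console and summarizing it (assumes the same `agent` instance as before):
human_eval = HumanEvaluator()
human_eval.request_evaluation(
    question="What is 2 + 2?",
    response=agent.process("What is 2 + 2?"),
    evaluator_id="reviewer-1",
)
print(human_eval.get_summary())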
A/B Testing
class ABTest:
"""A/B test different agent versions"""
def __init__(self, agent_a, agent_b):
self.agent_a = agent_a
self.agent_b = agent_b
self.results = {"a": [], "b": []}
def run_test(self, test_cases: List[str], evaluator) -> dict:
"""Run A/B test"""
for i, test_case in enumerate(test_cases):
# Alternate between agents
if i % 2 == 0:
agent = self.agent_a
variant = "a"
else:
agent = self.agent_b
variant = "b"
# Get response
response = agent.process(test_case)
# Evaluate
evaluation = evaluator.evaluate_response(test_case, response)
self.results[variant].append(evaluation)
return self.compare_results()
def compare_results(self) -> dict:
"""Compare A vs B"""
avg_a = sum(r["average"] for r in self.results["a"]) / len(self.results["a"])
avg_b = sum(r["average"] for r in self.results["b"]) / len(self.results["b"])
return {
"agent_a_score": avg_a,
"agent_b_score": avg_b,
"winner": "a" if avg_a > avg_b else "b",
"difference": abs(avg_a - avg_b)
}
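Usage sketch; `agent_v1` and `agent_v2` are assumed to be two versions of your agent. With only a handful of prompts the score difference is noisy, so treat the "winner" as a hint rather than a verdict until you have enough samples:
ab_test = ABTest(agent_v1, agent_v2)
outcome = ab_test.run_test(
    test_cases=[
        "What is Python?",
        "Summarize the benefits of unit testing",
        "What is 15% of 200?",
        "Explain recursion in one paragraph",
    ],
    evaluator=QualityEvaluator(),
)
print(outcome)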
Automated Testing Pipeline
class TestPipeline:
"""Automated testing pipeline"""
def __init__(self, agent):
self.agent = agent
self.test_suite = AgentTestSuite(agent)
self.metrics = AgentMetrics()
self.evaluator = QualityEvaluator()
def run_full_pipeline(self) -> dict:
"""Run complete test pipeline"""
results = {}
# 1. Unit tests
print("Running unit tests...")
results["unit_tests"] = self.run_unit_tests()
# 2. Integration tests
print("Running integration tests...")
results["integration_tests"] = self.run_integration_tests()
# 3. Benchmark tests
print("Running benchmarks...")
results["benchmarks"] = self.run_benchmarks()
# 4. Quality evaluation
print("Running quality evaluation...")
results["quality"] = self.run_quality_evaluation()
# 5. Performance metrics
print("Collecting performance metrics...")
results["performance"] = self.metrics.get_summary()
# 6. Generate report
report = self.generate_report(results)
return report
def run_unit_tests(self) -> dict:
"""Run unit tests"""
loader = unittest.TestLoader()
suite = loader.loadTestsFromTestCase(TestAgentComponents)
runner = unittest.TextTestRunner(verbosity=0)
result = runner.run(suite)
return {
"total": result.testsRun,
"passed": result.testsRun - len(result.failures) - len(result.errors),
"failed": len(result.failures) + len(result.errors)
}
def run_integration_tests(self) -> dict:
"""Run integration tests"""
loader = unittest.TestLoader()
suite = loader.loadTestsFromTestCase(TestAgentIntegration)
runner = unittest.TextTestRunner(verbosity=0)
result = runner.run(suite)
return {
"total": result.testsRun,
"passed": result.testsRun - len(result.failures) - len(result.errors),
"failed": len(result.failures) + len(result.errors)
}
def run_benchmarks(self) -> dict:
"""Run benchmark tests"""
# Add standard benchmarks
for test in StandardBenchmarks.get_math_benchmark():
self.test_suite.add_test(test)
for test in StandardBenchmarks.get_reasoning_benchmark():
self.test_suite.add_test(test)
return self.test_suite.run_tests()
def run_quality_evaluation(self) -> dict:
"""Run quality evaluation on live agent responses"""
questions = [
"What is Python?",
"How do I sort a list in Python?",
]
evaluations = []
for question in questions:
response = self.agent.process(question)
eval_result = self.evaluator.evaluate_response(question, response)
evaluations.append(eval_result)
avg_score = sum(e["average"] for e in evaluations) / len(evaluations)
return {
"evaluations": evaluations,
"average_score": avg_score
}
def generate_report(self, results: dict) -> dict:
"""Generate comprehensive report"""
return {
"timestamp": time.time(),
"summary": {
"unit_tests_passed": results["unit_tests"]["passed"],
"integration_tests_passed": results["integration_tests"]["passed"],
"benchmark_pass_rate": results["benchmarks"]["pass_rate"],
"quality_score": results["quality"]["average_score"],
"success_rate": results["performance"]["success_rate"]
},
"details": results
}
# Usage
pipeline = TestPipeline(agent)
report = pipeline.run_full_pipeline()
print("\nTest Report Summary")
print("=" * 40)
for key, value in report["summary"].items():
print(f"{key}: {value}")
Best Practices
- Test early and often: Continuous testing during development
- Automate testing: Run tests automatically on changes, e.g. as a CI gate (see the sketch after this list)
- Use multiple metrics: Quantitative and qualitative
- Test edge cases: Unusual inputs, errors, limits
- Benchmark regularly: Track performance over time
- Get human feedback: Automated tests aren’t enough
- Test in production: Monitor real usage
- Version your tests: Track test changes
- Document failures: Learn from what breaks
- Iterate based on results: Use tests to improve
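For the "automate testing" and "benchmark regularly" items, one lightweight approach is a small CI gate script that runs the pipeline, appends the summary to a history file, and fails the build if the benchmark pass rate drops below a threshold. A sketch; the file names and threshold are illustrative:
# ci_check.py - run the pipeline in CI and fail the build on regression
import json
import sys
import time

PASS_RATE_THRESHOLD = 0.9  # pick a threshold that matches your quality bar

pipeline = TestPipeline(agent)  # `agent` is constructed elsewhere in your project
report = pipeline.run_full_pipeline()

# Append the summary to a history file so pass rates can be tracked over time
with open("test_history.jsonl", "a") as f:
    f.write(json.dumps({"time": time.time(), "summary": report["summary"]}) + "\n")

if report["summary"]["benchmark_pass_rate"] < PASS_RATE_THRESHOLD:
    print("Benchmark pass rate below threshold; failing the build")
    sys.exit(1)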
Next Steps
You now understand evaluation and testing! Next, we’ll explore monitoring and observability for production agents.