Evaluation Metrics
Standardized evaluation metrics for reasoning quality, orchestration performance, and overall agent effectiveness with automated scoring and benchmarking.
Features
- Comprehensive Metrics: Accuracy, precision, recall, F1, latency, throughput
- Domain-Specific Evaluation: Reasoning, orchestration, tool usage, LLM performance
- Automated Scoring: ML-based evaluation with confidence scores
- Benchmarking: Compare against baseline performance
- Custom Metrics: Add domain-specific evaluation metrics
- Trend Analysis: Track performance over time
Quick Start
from packages.observability import EvaluationMetrics, EvaluationDomain, MetricType
# Initialize evaluation metrics
metrics = EvaluationMetrics(
    enable_benchmarking=True,
    benchmark_data={
        "reasoning_baseline": 0.85,
        "orchestration_baseline": 0.90
    }
)
# Evaluate reasoning
result = metrics.evaluate_reasoning(
    query="What is machine learning?",
    expected_answer="Machine learning is a subset of AI...",
    actual_answer="Machine learning is a method of data analysis...",
    reasoning_trace=["Step 1: Identify the question", "Step 2: Recall knowledge"]
)
print(f"Overall Score: {result.overall_score:.2f}")
print(f"Confidence: {result.confidence:.2f}")
Evaluation Domains
Reasoning Quality
# Evaluate reasoning process
result = metrics.evaluate_reasoning(
    query="Solve this math problem: 2x + 5 = 13",
    expected_answer="x = 4",
    actual_answer="x = 4",
    reasoning_trace=[
        "Step 1: Subtract 5 from both sides",
        "Step 2: 2x = 8",
        "Step 3: Divide by 2",
        "Step 4: x = 4"
    ],
    metadata={"difficulty": "medium", "subject": "algebra"}
)
# Get reasoning metrics
for metric in result.metrics:
    print(f"{metric.metric_name}: {metric.value:.2f}")
Orchestration Effectiveness
# Evaluate orchestration
result = metrics.evaluate_orchestration(
    workflow_name="data_processing_workflow",
    execution_time=45.2,
    success_rate=0.95,
    node_performance={
        "data_validation": 0.98,
        "data_transformation": 0.92,
        "data_export": 0.99
    },
    metadata={"data_size": 10000, "complexity": "high"}
)
print(f"Orchestration Score: {result.overall_score:.2f}")
Tool Usage
# Evaluate tool execution
result = metrics.evaluate_tool_usage(
    tool_name="web_search",
    execution_time=2.5,
    success=True,
    input_complexity=7.0,
    output_quality=0.9,
    metadata={"query_type": "factual", "sources_found": 5}
)
print(f"Tool Score: {result.overall_score:.2f}")
LLM Performance
# Evaluate LLM performance
result = metrics.evaluate_llm_performance(
    model="gpt-4",
    prompt="Explain quantum computing",
    response="Quantum computing is a type of computation...",
    expected_response="Quantum computing uses quantum mechanical phenomena...",
    latency=1.2,
    cost=0.05,
    metadata={"temperature": 0.7, "max_tokens": 500}
)
print(f"LLM Score: {result.overall_score:.2f}")
Metric Types
Accuracy Metrics
# Note: the _calculate_* helpers are internal (underscore-prefixed);
# prefer the evaluate_* entry points in application code
# Calculate accuracy
accuracy = metrics._calculate_accuracy(
    expected=["correct answer"],
    actual=["correct answer"]
)
print(f"Accuracy: {accuracy:.2f}")
# Calculate precision
precision = metrics._calculate_precision(
    expected=["positive", "negative", "positive"],
    actual=["positive", "positive", "positive"]
)
print(f"Precision: {precision:.2f}")
# Calculate recall
recall = metrics._calculate_recall(
    expected=["positive", "negative", "positive"],
    actual=["positive", "positive", "positive"]
)
print(f"Recall: {recall:.2f}")
# Calculate F1 score
f1 = metrics._calculate_f1_score(
    expected=["positive", "negative", "positive"],
    actual=["positive", "positive", "positive"]
)
print(f"F1 Score: {f1:.2f}")
Performance Metrics
# Calculate latency score
latency_score = metrics._calculate_latency(execution_time=2.5)
print(f"Latency Score: {latency_score:.2f}")
# Calculate throughput
throughput_score = metrics._calculate_throughput(
    items_processed=100,
    time_seconds=10.0
)
print(f"Throughput Score: {throughput_score:.2f}")
# Calculate cost efficiency
cost_efficiency = metrics._calculate_cost_efficiency(
    cost=0.05,
    output_quality=0.9
)
print(f"Cost Efficiency: {cost_efficiency:.2f}")
Quality Metrics
# Calculate reasoning quality
reasoning_quality = metrics._calculate_reasoning_quality(
    query="What is AI?",
    answer="AI is artificial intelligence...",
    reasoning_trace=["Step 1", "Step 2", "Step 3"]
)
print(f"Reasoning Quality: {reasoning_quality:.2f}")
# Calculate orchestration effectiveness
orchestration_score = metrics._calculate_orchestration_effectiveness(
    success_rate=0.95,
    node_performance={"node1": 0.9, "node2": 0.95}
)
print(f"Orchestration Score: {orchestration_score:.2f}")
# Calculate user satisfaction
satisfaction = metrics._calculate_user_satisfaction(
    user_feedback=0.85,
    response_time=1.5,
    accuracy=0.92
)
print(f"User Satisfaction: {satisfaction:.2f}")
Benchmarking
Set Benchmarks
# Set benchmark data
metrics = EvaluationMetrics(
    enable_benchmarking=True,
    benchmark_data={
        "reasoning_baseline": 0.85,
        "orchestration_baseline": 0.90,
        "tool_usage_baseline": 0.88,
        "llm_performance_baseline": 0.92
    }
)
Compare with Benchmarks
# Get benchmark comparison
comparison = metrics.get_benchmark_comparison(
    domain=EvaluationDomain.REASONING,
    current_score=0.92
)
if comparison:
    print(f"Current Score: {comparison.current_score:.2f}")
    print(f"Baseline Score: {comparison.baseline_score:.2f}")
    print(f"Improvement: {comparison.improvement:.2f}")
    print(f"Percentile: {comparison.percentile:.1f}%")
Evaluation Summary
Get Summary
# Get evaluation summary
summary = metrics.get_evaluation_summary(
    domain=EvaluationDomain.REASONING,
    days=30
)
print(f"Total Evaluations: {summary['total_evaluations']}")
print(f"Average Score: {summary['average_score']:.2f}")
print(f"Trend: {summary['trend']}")
# By domain
for domain, score in summary['by_domain'].items():
    print(f"{domain}: {score:.2f}")
# By metric
for metric, score in summary['by_metric'].items():
    print(f"{metric}: {score:.2f}")
Trend Analysis
# Analyze trends
summary = metrics.get_evaluation_summary(days=30)
if summary['trend'] == 'improving':
    print("📈 Performance is improving")
elif summary['trend'] == 'declining':
    print("📉 Performance is declining")
else:
    print("📊 Performance is stable")
Custom Metrics
Add Custom Metric
# Add custom metric calculator
def custom_metric_calculator(expected, actual, **kwargs):
    # Custom calculation logic; a simple exact-match score as a placeholder
    return 1.0 if expected == actual else 0.0
metrics.add_custom_metric(
    name="custom_metric",
    calculator=custom_metric_calculator,
    metric_type=MetricType.CUSTOM
)
# Use custom metric
result = metrics.evaluate_reasoning(
    query="test",
    expected_answer="expected",
    actual_answer="actual"
)
Domain-Specific Metrics
# Add domain-specific metric
def domain_specific_metric(data):
    # Calculate a domain-specific score from the evaluation payload
    # (placeholder logic and key name; replace with your own scoring)
    return min(1.0, max(0.0, data.get("quality_signal", 0.0)))
metrics.add_custom_metric(
    name="domain_metric",
    calculator=domain_specific_metric
)
Integration Examples
With Workflows
from packages.observability import trace_workflow, get_evaluation_metrics
@trace_workflow(name="evaluated_workflow")
async def my_workflow():
    metrics = get_evaluation_metrics()
    # Evaluate workflow performance
    result = metrics.evaluate_orchestration(
        workflow_name="my_workflow",
        execution_time=30.0,
        success_rate=1.0,
        node_performance={"node1": 0.95}
    )
    return result
With Tools
from packages.observability import trace_tool, get_evaluation_metrics
@trace_tool(name="evaluated_tool")
async def my_tool():
    metrics = get_evaluation_metrics()
    # Evaluate tool performance
    result = metrics.evaluate_tool_usage(
        tool_name="my_tool",
        execution_time=1.5,
        success=True,
        input_complexity=5.0,
        output_quality=0.9
    )
    return result
With LLM Calls
from packages.observability import trace_llm_call, get_evaluation_metrics
@trace_llm_call(model="gpt-4")
async def llm_call(prompt):
    metrics = get_evaluation_metrics()
    response = await generate(prompt)  # placeholder for your actual LLM client call
    # Evaluate LLM performance
    result = metrics.evaluate_llm_performance(
        model="gpt-4",
        prompt=prompt,
        response=response,
        latency=1.2,
        cost=0.05
    )
    return result
Advanced Configuration
Custom Benchmarks
# Set custom benchmarks
metrics = EvaluationMetrics(
    enable_benchmarking=True,
    benchmark_data={
        "reasoning_baseline": 0.85,
        "orchestration_baseline": 0.90,
        "tool_usage_baseline": 0.88,
        "llm_performance_baseline": 0.92,
        "custom_baseline": 0.80
    }
)
Evaluation History
# Get evaluation history
history = metrics.evaluation_history
for evaluation in history[-10:]: # Last 10 evaluations
print(f"Domain: {evaluation.domain}")
print(f"Score: {evaluation.overall_score:.2f}")
print(f"Confidence: {evaluation.confidence:.2f}")
print(f"Timestamp: {evaluation.timestamp}")
Best Practices
- Regular Evaluation: Evaluate on a fixed cadence (per run, per release, or daily) rather than ad hoc; see the sketch after this list
- Set Benchmarks: Establish baseline scores before making changes so improvements are measurable
- Track Trends: Monitor performance over time to catch gradual regressions
- Use Appropriate Metrics: Choose metrics relevant to your domain rather than defaulting to accuracy alone
- Custom Metrics: Add domain-specific evaluation metrics where the built-ins fall short
- Compare Performance: Compare new scores against benchmarks and historical baselines
- Document Results: Keep records of evaluation results and the configuration that produced them
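A minimal sketch tying these practices together. It assumes a hypothetical load_eval_cases() helper that yields (query, expected_answer, actual_answer) test cases; the EvaluationMetrics, get_benchmark_comparison, and get_evaluation_summary calls mirror the API shown above.
from packages.observability import EvaluationMetrics, EvaluationDomain

metrics = EvaluationMetrics(
    enable_benchmarking=True,
    benchmark_data={"reasoning_baseline": 0.85}
)

def run_scheduled_evaluation():
    # load_eval_cases() is a hypothetical helper yielding (query, expected, actual) tuples
    scores = [
        metrics.evaluate_reasoning(
            query=query,
            expected_answer=expected,
            actual_answer=actual
        ).overall_score
        for query, expected, actual in load_eval_cases()
    ]
    if not scores:
        return
    # Compare the run's average against the configured baseline
    comparison = metrics.get_benchmark_comparison(
        domain=EvaluationDomain.REASONING,
        current_score=sum(scores) / len(scores)
    )
    if comparison and comparison.improvement < 0:
        print(f"Regression vs. baseline: {comparison.improvement:.2f}")
    # Record the longer-term trend alongside the run
    summary = metrics.get_evaluation_summary(days=30)
    print(f"30-day trend: {summary['trend']}")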
Troubleshooting
Common Issues
- Low Scores: Check input data quality and underlying model performance before tuning the metrics themselves
- Inconsistent Results: Ensure evaluation criteria and expected answers are consistent across runs
- Missing Metrics: Verify all required inputs (expected answers, traces, timings) are provided; see the check below
- Benchmark Issues: Check that benchmark keys match the evaluation domains and that baseline values are current
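One way to catch missing metrics early is to inspect the metrics attached to a result, using the result.metrics list shown earlier; the expected metric names below are illustrative, not a fixed list.
# 'result' is any evaluation result from the calls above; names are illustrative
expected_metrics = {"accuracy", "reasoning_quality"}
produced = {m.metric_name for m in result.metrics}
missing = expected_metrics - produced
if missing:
    print(f"Missing metrics: {missing} - check that all required inputs were provided")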
Debug Mode
# Enable debug logging
import logging
logging.getLogger('packages.observability.evaluation_metrics').setLevel(logging.DEBUG)
API Reference
EvaluationMetrics
| Parameter | Type | Description |
|---|---|---|
| enable_benchmarking | bool | Enable comparison against baseline benchmark data |
| benchmark_data | Dict | Mapping of baseline names (e.g., "reasoning_baseline") to baseline scores |
EvaluationDomain
| Value | Description |
|---|---|
| REASONING | Reasoning quality evaluation |
| ORCHESTRATION | Orchestration effectiveness |
| TOOL_USAGE | Tool usage evaluation |
| LLM_PERFORMANCE | LLM performance evaluation |
| WORKFLOW_EFFECTIVENESS | Workflow effectiveness |
| OVERALL | Overall evaluation |
MetricType
| Value | Description |
|---|---|
| ACCURACY | Accuracy metric |
| PRECISION | Precision metric |
| RECALL | Recall metric |
| F1_SCORE | F1 score metric |
| LATENCY | Latency metric |
| THROUGHPUT | Throughput metric |
| COST_EFFICIENCY | Cost efficiency metric |
| REASONING_QUALITY | Reasoning quality metric |
| ORCHESTRATION_EFFECTIVENESS | Orchestration effectiveness metric |
| USER_SATISFACTION | User satisfaction metric |
| CUSTOM | Custom metric |