Skip to main content

Evaluation Metrics

Standardized evaluation metrics for reasoning quality, orchestration performance, and overall agent effectiveness with automated scoring and benchmarking.

Features

  • Comprehensive Metrics: Accuracy, precision, recall, F1, latency, throughput
  • Domain-Specific Evaluation: Reasoning, orchestration, tool usage, LLM performance
  • Automated Scoring: ML-based evaluation with confidence scores
  • Benchmarking: Compare against baseline performance
  • Custom Metrics: Add domain-specific evaluation metrics
  • Trend Analysis: Track performance over time

Quick Start

from packages.observability import EvaluationMetrics, EvaluationDomain, MetricType

# Initialize evaluation metrics
metrics = EvaluationMetrics(
    enable_benchmarking=True,
    benchmark_data={
        "reasoning_baseline": 0.85,
        "orchestration_baseline": 0.90
    }
)

# Evaluate reasoning
result = metrics.evaluate_reasoning(
    query="What is machine learning?",
    expected_answer="Machine learning is a subset of AI...",
    actual_answer="Machine learning is a method of data analysis...",
    reasoning_trace=["Step 1: Identify the question", "Step 2: Recall knowledge"]
)

print(f"Overall Score: {result.overall_score:.2f}")
print(f"Confidence: {result.confidence:.2f}")

Evaluation Domains

Reasoning Quality

# Evaluate reasoning process
result = metrics.evaluate_reasoning(
    query="Solve this math problem: 2x + 5 = 13",
    expected_answer="x = 4",
    actual_answer="x = 4",
    reasoning_trace=[
        "Step 1: Subtract 5 from both sides",
        "Step 2: 2x = 8",
        "Step 3: Divide by 2",
        "Step 4: x = 4"
    ],
    metadata={"difficulty": "medium", "subject": "algebra"}
)

# Get reasoning metrics
for metric in result.metrics:
    print(f"{metric.metric_name}: {metric.value:.2f}")

Orchestration Effectiveness

# Evaluate orchestration
result = metrics.evaluate_orchestration(
    workflow_name="data_processing_workflow",
    execution_time=45.2,
    success_rate=0.95,
    node_performance={
        "data_validation": 0.98,
        "data_transformation": 0.92,
        "data_export": 0.99
    },
    metadata={"data_size": 10000, "complexity": "high"}
)

print(f"Orchestration Score: {result.overall_score:.2f}")

Tool Usage

# Evaluate tool execution
result = metrics.evaluate_tool_usage(
    tool_name="web_search",
    execution_time=2.5,
    success=True,
    input_complexity=7.0,
    output_quality=0.9,
    metadata={"query_type": "factual", "sources_found": 5}
)

print(f"Tool Score: {result.overall_score:.2f}")

LLM Performance

# Evaluate LLM performance
result = metrics.evaluate_llm_performance(
    model="gpt-4",
    prompt="Explain quantum computing",
    response="Quantum computing is a type of computation...",
    expected_response="Quantum computing uses quantum mechanical phenomena...",
    latency=1.2,
    cost=0.05,
    metadata={"temperature": 0.7, "max_tokens": 500}
)

print(f"LLM Score: {result.overall_score:.2f}")

Metric Types

Accuracy Metrics

# Calculate accuracy
accuracy = metrics._calculate_accuracy(
    expected=["correct answer"],
    actual=["correct answer"]
)
print(f"Accuracy: {accuracy:.2f}")

# Calculate precision
precision = metrics._calculate_precision(
    expected=["positive", "negative", "positive"],
    actual=["positive", "positive", "positive"]
)
print(f"Precision: {precision:.2f}")

# Calculate recall
recall = metrics._calculate_recall(
    expected=["positive", "negative", "positive"],
    actual=["positive", "positive", "positive"]
)
print(f"Recall: {recall:.2f}")

# Calculate F1 score
f1 = metrics._calculate_f1_score(
    expected=["positive", "negative", "positive"],
    actual=["positive", "positive", "positive"]
)
print(f"F1 Score: {f1:.2f}")

Performance Metrics

# Calculate latency score
latency_score = metrics._calculate_latency(execution_time=2.5)
print(f"Latency Score: {latency_score:.2f}")

# Calculate throughput
throughput_score = metrics._calculate_throughput(
    items_processed=100,
    time_seconds=10.0
)
print(f"Throughput Score: {throughput_score:.2f}")

# Calculate cost efficiency
cost_efficiency = metrics._calculate_cost_efficiency(
    cost=0.05,
    output_quality=0.9
)
print(f"Cost Efficiency: {cost_efficiency:.2f}")

Quality Metrics

# Calculate reasoning quality
reasoning_quality = metrics._calculate_reasoning_quality(
    query="What is AI?",
    answer="AI is artificial intelligence...",
    reasoning_trace=["Step 1", "Step 2", "Step 3"]
)
print(f"Reasoning Quality: {reasoning_quality:.2f}")

# Calculate orchestration effectiveness
orchestration_score = metrics._calculate_orchestration_effectiveness(
    success_rate=0.95,
    node_performance={"node1": 0.9, "node2": 0.95}
)
print(f"Orchestration Score: {orchestration_score:.2f}")

# Calculate user satisfaction
satisfaction = metrics._calculate_user_satisfaction(
    user_feedback=0.85,
    response_time=1.5,
    accuracy=0.92
)
print(f"User Satisfaction: {satisfaction:.2f}")

Benchmarking

Set Benchmarks

# Set benchmark data
metrics = EvaluationMetrics(
    enable_benchmarking=True,
    benchmark_data={
        "reasoning_baseline": 0.85,
        "orchestration_baseline": 0.90,
        "tool_usage_baseline": 0.88,
        "llm_performance_baseline": 0.92
    }
)

Compare with Benchmarks

# Get benchmark comparison
comparison = metrics.get_benchmark_comparison(
    domain=EvaluationDomain.REASONING,
    current_score=0.92
)

if comparison:
    print(f"Current Score: {comparison.current_score:.2f}")
    print(f"Baseline Score: {comparison.baseline_score:.2f}")
    print(f"Improvement: {comparison.improvement:.2f}")
    print(f"Percentile: {comparison.percentile:.1f}%")

Evaluation Summary

Get Summary

# Get evaluation summary
summary = metrics.get_evaluation_summary(
    domain=EvaluationDomain.REASONING,
    days=30
)

print(f"Total Evaluations: {summary['total_evaluations']}")
print(f"Average Score: {summary['average_score']:.2f}")
print(f"Trend: {summary['trend']}")

# By domain
for domain, score in summary['by_domain'].items():
    print(f"{domain}: {score:.2f}")

# By metric
for metric, score in summary['by_metric'].items():
    print(f"{metric}: {score:.2f}")

Trend Analysis

# Analyze trends
summary = metrics.get_evaluation_summary(days=30)

if summary['trend'] == 'improving':
    print("📈 Performance is improving")
elif summary['trend'] == 'declining':
    print("📉 Performance is declining")
else:
    print("📊 Performance is stable")

Custom Metrics

Add Custom Metric

# Add custom metric calculator
def custom_metric_calculator(expected, actual, **kwargs):
    # Custom calculation logic (return a score in the 0.0-1.0 range)
    custom_score = 1.0 if expected == actual else 0.0
    return custom_score

metrics.add_custom_metric(
    name="custom_metric",
    calculator=custom_metric_calculator,
    metric_type=MetricType.CUSTOM
)

# Use custom metric
result = metrics.evaluate_reasoning(
    query="test",
    expected_answer="expected",
    actual_answer="actual"
)

Domain-Specific Metrics

# Add domain-specific metric
def domain_specific_metric(data):
    # Calculate a domain-specific score from the evaluation data
    score = ...  # replace with your custom scoring logic
    return score

metrics.add_custom_metric(
    name="domain_metric",
    calculator=domain_specific_metric
)

Integration Examples

With Workflows

from packages.observability import trace_workflow, get_evaluation_metrics

@trace_workflow(name="evaluated_workflow")
async def my_workflow():
    metrics = get_evaluation_metrics()

    # Evaluate workflow performance
    result = metrics.evaluate_orchestration(
        workflow_name="my_workflow",
        execution_time=30.0,
        success_rate=1.0,
        node_performance={"node1": 0.95}
    )

    return result

With Tools

from packages.observability import trace_tool, get_evaluation_metrics

@trace_tool(name="evaluated_tool")
async def my_tool():
    metrics = get_evaluation_metrics()

    # Evaluate tool performance
    result = metrics.evaluate_tool_usage(
        tool_name="my_tool",
        execution_time=1.5,
        success=True,
        input_complexity=5.0,
        output_quality=0.9
    )

    return result

With LLM Calls

from packages.observability import trace_llm_call, get_evaluation_metrics

@trace_llm_call(model="gpt-4")
async def llm_call(prompt):
    metrics = get_evaluation_metrics()
    response = ...  # obtain the model's response for the prompt

    # Evaluate LLM performance
    result = metrics.evaluate_llm_performance(
        model="gpt-4",
        prompt=prompt,
        response=response,
        latency=1.2,
        cost=0.05
    )

    return result

Advanced Configuration

Custom Benchmarks

# Set custom benchmarks
metrics = EvaluationMetrics(
    enable_benchmarking=True,
    benchmark_data={
        "reasoning_baseline": 0.85,
        "orchestration_baseline": 0.90,
        "tool_usage_baseline": 0.88,
        "llm_performance_baseline": 0.92,
        "custom_baseline": 0.80
    }
)

Evaluation History

# Get evaluation history
history = metrics.evaluation_history

for evaluation in history[-10:]:  # Last 10 evaluations
    print(f"Domain: {evaluation.domain}")
    print(f"Score: {evaluation.overall_score:.2f}")
    print(f"Confidence: {evaluation.confidence:.2f}")
    print(f"Timestamp: {evaluation.timestamp}")

Best Practices

  1. Regular Evaluation: Evaluate performance regularly
  2. Set Benchmarks: Establish baseline performance metrics
  3. Track Trends: Monitor performance trends over time
  4. Use Appropriate Metrics: Choose metrics relevant to your domain
  5. Custom Metrics: Add domain-specific evaluation metrics
  6. Compare Performance: Compare against benchmarks and baselines
  7. Document Results: Keep records of evaluation results

Troubleshooting

Common Issues

  1. Low Scores: Check data quality and model performance
  2. Inconsistent Results: Ensure consistent evaluation criteria
  3. Missing Metrics: Verify all required data is provided
  4. Benchmark Issues: Check benchmark data accuracy

Debug Mode

# Enable debug logging
import logging
logging.getLogger('packages.observability.evaluation_metrics').setLevel(logging.DEBUG)

API Reference

EvaluationMetrics

| Parameter | Type | Description |
|---|---|---|
| enable_benchmarking | bool | Enable benchmarking |
| benchmark_data | Dict | Baseline benchmark data |

EvaluationDomain

| Value | Description |
|---|---|
| REASONING | Reasoning quality evaluation |
| ORCHESTRATION | Orchestration effectiveness |
| TOOL_USAGE | Tool usage evaluation |
| LLM_PERFORMANCE | LLM performance evaluation |
| WORKFLOW_EFFECTIVENESS | Workflow effectiveness |
| OVERALL | Overall evaluation |

MetricType

| Value | Description |
|---|---|
| ACCURACY | Accuracy metric |
| PRECISION | Precision metric |
| RECALL | Recall metric |
| F1_SCORE | F1 score metric |
| LATENCY | Latency metric |
| THROUGHPUT | Throughput metric |
| COST_EFFICIENCY | Cost efficiency metric |
| REASONING_QUALITY | Reasoning quality metric |
| ORCHESTRATION_EFFECTIVENESS | Orchestration effectiveness |
| USER_SATISFACTION | User satisfaction metric |
| CUSTOM | Custom metric |