Caching Architecture
Deep dive into the 4-layer cache architecture, intelligent routing, and performance optimization
Architecture Overview
The Caching Platform implements a sophisticated 4-layer architecture designed for maximum performance and cost efficiency:
Layer 1: Full Result Cache
Purpose
Complete query results with full context and formatting.
Characteristics
- TTL: 24 hours
- Size Limit: 1GB per cache
- Hit Rate: 40-60%
- Use Case: Identical queries, exact matches
Implementation
class FullResultCache:
    """Layer 1 cache: complete query results with context and formatting.

    Entries are stored as JSON envelopes holding the result, a write
    timestamp, and lightweight metadata; they expire after ``ttl`` seconds
    (24 hours by default).
    """

    def __init__(self, redis_client, ttl=86400):
        self.redis = redis_client
        self.ttl = ttl  # seconds; 86400 == 24 hours
        self.key_prefix = "full_result:"  # namespace separation from other layers

    def get(self, query_hash):
        """Return the cached envelope for *query_hash*, or None on a miss.

        Decodes the JSON written by :meth:`set`, so callers get back the
        same dict shape that was stored.  (The original returned the raw
        JSON string, asymmetric with RetrievalCache.get_retrieval_data.)
        """
        data = self.redis.get(f"{self.key_prefix}{query_hash}")
        if not data:
            return None
        return json.loads(data)

    def set(self, query_hash, result):
        """Store *result* under *query_hash* with timestamp and metadata."""
        key = f"{self.key_prefix}{query_hash}"
        cache_data = {
            "result": result,
            "timestamp": time.time(),
            "metadata": self._extract_metadata(result),
        }
        self.redis.setex(key, self.ttl, json.dumps(cache_data))

    def _extract_metadata(self, result):
        """Minimal metadata about the stored payload (type name and size).

        NOTE(review): the original called an undefined ``_extract_metadata``;
        this minimal implementation keeps the class self-contained.
        """
        return {"type": type(result).__name__, "size": len(str(result))}
Cache Key Strategy
def generate_cache_key(query, context=None):
    """Generate a deterministic cache key for *query* plus optional *context*.

    The query is lower-cased and stripped before hashing so trivially
    different spellings map to the same key.  Context is serialized with
    ``json.dumps(..., sort_keys=True)`` so dicts with the same content but
    different insertion order hash identically — ``str(context)`` did not
    guarantee that.  Values that are not JSON-serializable fall back to
    ``str`` via ``default=str``.

    Returns ``"<16-hex query hash>:<8-hex context hash>"``; the context part
    is empty when no context is given.
    """
    normalized_query = query.lower().strip()

    if context:
        # Canonical serialization -> stable hash regardless of dict key order.
        serialized = json.dumps(context, sort_keys=True, default=str)
        context_hash = hashlib.md5(serialized.encode()).hexdigest()[:8]
    else:
        context_hash = ""

    query_hash = hashlib.sha256(normalized_query.encode()).hexdigest()[:16]
    return f"{query_hash}:{context_hash}"
Layer 2: Retrieval Cache
Purpose
Cached document chunks and retrieval metadata for similar queries.
Characteristics
- TTL: 7 days
- Size Limit: 5GB per cache
- Hit Rate: 25-35%
- Use Case: Similar queries, document retrieval
Implementation
class RetrievalCache:
    """Layer 2 cache: document chunks plus retrieval metadata (7-day TTL)."""

    def __init__(self, redis_client, ttl=604800):
        self.redis = redis_client
        self.ttl = ttl  # seconds; 604800 == 7 days
        self.key_prefix = "retrieval:"

    def _key(self, query_hash):
        """Build the namespaced Redis key for *query_hash*."""
        return f"{self.key_prefix}{query_hash}"

    def get_retrieval_data(self, query_hash):
        """Return the cached retrieval payload as a dict, or None on a miss."""
        raw = self.redis.get(self._key(query_hash))
        return json.loads(raw) if raw else None

    def store_retrieval_data(self, query_hash, documents, scores, metadata):
        """Cache documents, scores and metadata under *query_hash*."""
        payload = json.dumps({
            "documents": documents,
            "scores": scores,
            "metadata": metadata,
            "timestamp": time.time(),
        })
        self.redis.setex(self._key(query_hash), self.ttl, payload)
Layer 3: Summary Cache
Purpose
Generated summaries and key insights for content summarization.
Characteristics
- TTL: 3 days
- Size Limit: 2GB per cache
- Hit Rate: 15-25%
- Use Case: Summary generation, insight extraction
Implementation
class SummaryCache:
    """Layer 3 cache: generated summaries and key insights (3-day TTL)."""

    def __init__(self, redis_client, ttl=259200):
        self.redis = redis_client
        self.ttl = ttl  # seconds; 259200 == 3 days
        self.key_prefix = "summary:"

    def get_summary(self, content_hash):
        """Return the cached summary envelope for *content_hash*, or None.

        Decodes the JSON written by :meth:`store_summary` (the original
        returned the raw JSON string, inconsistent with RetrievalCache).
        """
        data = self.redis.get(f"{self.key_prefix}{content_hash}")
        if not data:
            return None
        return json.loads(data)

    def store_summary(self, content_hash, summary, metadata):
        """Store *summary* with *metadata* and a write timestamp."""
        key = f"{self.key_prefix}{content_hash}"
        cache_data = {
            "summary": summary,
            "metadata": metadata,
            "timestamp": time.time(),
        }
        self.redis.setex(key, self.ttl, json.dumps(cache_data))
Layer 4: Embedding Cache
Purpose
Vector embeddings for semantic similarity and search.
Characteristics
- TTL: 30 days
- Size Limit: 10GB per cache
- Hit Rate: 20-30%
- Use Case: Semantic search, similarity matching
Implementation
class EmbeddingCache:
    """Layer 4 cache: raw float32 embedding vectors (30-day TTL)."""

    def __init__(self, redis_client, ttl=2592000):
        self.redis = redis_client
        self.ttl = ttl  # seconds; 2592000 == 30 days
        self.key_prefix = "embedding:"

    def get_embedding(self, text_hash):
        """Return the cached vector as a float32 ndarray, or None on a miss."""
        raw = self.redis.get(f"{self.key_prefix}{text_hash}")
        # Vectors are stored as raw bytes; reconstruct the float32 array.
        return np.frombuffer(raw, dtype=np.float32) if raw else None

    def store_embedding(self, text_hash, embedding):
        """Cache *embedding* (a numpy array) as its raw byte representation."""
        self.redis.setex(
            f"{self.key_prefix}{text_hash}", self.ttl, embedding.tobytes()
        )
Intelligent Query Router
Query Classification
class QueryRouter:
    """Route incoming queries to the most appropriate cache layer."""

    def __init__(self):
        self.complexity_analyzer = ComplexityAnalyzer()
        self.intent_classifier = IntentClassifier()

    def route_query(self, query, context=None):
        """Pick a cache layer from the query's complexity and intent.

        Routing rules, checked in order:
        - simple + factual     -> full-result cache
        - medium + analytical  -> retrieval cache
        - summarization intent -> summary cache
        - anything else        -> embedding cache
        """
        complexity = self.complexity_analyzer.analyze(query)
        intent = self.intent_classifier.classify(query)

        # Guard-clause dispatch: first matching rule wins.
        if complexity == "simple" and intent == "factual":
            return self._route_to_full_cache(query)
        if complexity == "medium" and intent == "analytical":
            return self._route_to_retrieval_cache(query)
        if intent == "summarization":
            return self._route_to_summary_cache(query)
        return self._route_to_embedding_cache(query)
Cache Selection Logic
def select_cache_strategy(self, query, context):
    """Pick the best cache strategy for *query*.

    Preference order: exact hit in the full-result cache, then a semantic
    near-match (similarity >= 0.85), then a type-specific cache chosen from
    the query's shape.
    """
    # 1. An exact match beats everything else.
    hit = self.full_cache.get(query)
    if hit:
        return {"strategy": "exact", "result": hit}

    # 2. Fall back to semantically similar cached queries.
    similar = self.semantic_cache.find_similar(query, threshold=0.85)
    if similar:
        return {"strategy": "semantic", "matches": similar}

    # 3. Otherwise dispatch on the query type.
    if self._is_retrieval_query(query):
        return self._check_retrieval_cache(query)
    if self._is_summary_query(query):
        return self._check_summary_cache(query)
    return self._check_embedding_cache(query)
Performance Optimization
Cache Warming Strategies
class CacheWarmer:
    """Proactively populate the cache before the matching queries arrive."""

    def __init__(self, cache_manager):
        self.cache = cache_manager
        self.analytics = AnalyticsEngine()

    def warm_popular_queries(self, limit=1000):
        """Pre-compute and cache results for the *limit* most popular queries."""
        for query in self.analytics.get_popular_queries(limit=limit):
            self.cache.store(query, self._compute_result(query))

    def warm_time_based_patterns(self):
        """Warm queries tied to historical time patterns that are active now."""
        for pattern in self.analytics.get_time_patterns():
            if self._is_active_pattern(pattern):
                self._warm_pattern_queries(pattern)
Cache Optimization
class CacheOptimizer:
    """Tune cache TTLs and sizes from observed usage patterns."""

    def __init__(self, cache_manager):
        self.cache = cache_manager
        self.metrics = PerformanceMetrics()

    def optimize_ttl(self):
        """Recompute and apply each layer's TTL from current usage statistics."""
        stats = self.metrics.get_usage_stats()
        for layer in self.cache.layers:
            layer.update_ttl(self._calculate_optimal_ttl(layer, stats))

    def optimize_cache_size(self):
        """Grow high-hit-rate layers and shrink low-hit-rate ones.

        Below a 30% hit rate the layer shrinks; above 80% it grows; layers
        in between are left alone.
        """
        for layer, hit_rate in self.metrics.get_hit_rates().items():
            if hit_rate < 0.3:
                self._reduce_cache_size(layer)
            elif hit_rate > 0.8:
                self._increase_cache_size(layer)
Monitoring and Metrics
Key Performance Indicators
class CacheMetrics:
    """Collect per-layer cache KPIs: hit rates, response times, cost savings.

    Raw hit/miss counts are kept internally so aggregate statistics can be
    weighted by actual request volume rather than by per-layer rates.
    """

    def __init__(self):
        self.metrics = {
            "hit_rates": {},
            "response_times": {},
            "cost_savings": {},
            "throughput": {},
        }
        # layer -> (hits, misses); needed for a correct aggregate in
        # calculate_cost_savings (rates alone cannot be summed meaningfully).
        self._counts = {}

    def track_hit_rate(self, layer, hit, miss):
        """Record *hit*/*miss* counts for *layer* and update its hit rate."""
        total = hit + miss
        self._counts[layer] = (hit, miss)
        self.metrics["hit_rates"][layer] = hit / total if total > 0 else 0

    def track_response_time(self, layer, response_time):
        """Append a response-time sample (ms) for *layer*."""
        self.metrics["response_times"].setdefault(layer, []).append(response_time)

    def calculate_cost_savings(self):
        """Return the overall fraction of requests served from cache.

        Fixes the original, which summed per-layer *rates* as if they were
        request counts and referenced an undefined ``total`` (NameError).
        Returns 0 when no requests have been tracked yet.
        """
        total_requests = sum(h + m for h, m in self._counts.values())
        hit_requests = sum(h for h, _ in self._counts.values())
        return hit_requests / total_requests if total_requests > 0 else 0
Real-time Monitoring
class CacheMonitor:
    """Watch live cache metrics and raise alerts when performance degrades."""

    def __init__(self, cache_manager):
        self.cache = cache_manager
        self.alerts = AlertManager()

    def monitor_performance(self):
        """Scan current metrics; alert on low hit rates or slow responses.

        Thresholds: hit rate below 50%, or average response above 100 ms.
        """
        metrics = self.cache.get_metrics()

        # A persistently low hit rate usually means keys are churning.
        for layer, hit_rate in metrics["hit_rates"].items():
            if hit_rate < 0.5:
                self.alerts.send_alert(
                    f"Low hit rate in {layer}: {hit_rate:.2%}"
                )

        for layer, avg_time in metrics["avg_response_times"].items():
            if avg_time > 100:
                self.alerts.send_alert(
                    f"Slow response in {layer}: {avg_time}ms"
                )
Configuration
Cache Configuration
import os

# Central cache configuration.  Layer TTLs and sizes mirror the 4-layer
# architecture: L1 full results, L2 retrieval, L3 summaries, L4 embeddings.
CACHE_CONFIG = {
    "layers": {
        "L1": {
            "ttl": 86400,  # 24 hours
            "max_size": "1GB",
            "strategy": "LRU",
        },
        "L2": {
            "ttl": 604800,  # 7 days
            "max_size": "5GB",
            "strategy": "LFU",
        },
        "L3": {
            "ttl": 259200,  # 3 days
            "max_size": "2GB",
            "strategy": "LRU",
        },
        "L4": {
            "ttl": 2592000,  # 30 days
            "max_size": "10GB",
            "strategy": "FIFO",
        },
    },
    "redis": {
        "host": "redis-cluster.internal",
        "port": 6379,
        # SECURITY: never hard-code credentials in source; read from the
        # environment instead (original embedded a literal password).
        "password": os.environ.get("REDIS_PASSWORD", ""),
        "max_connections": 100,
        "retry_on_timeout": True,
    },
    "semantic": {
        "threshold": 0.85,  # minimum similarity for a semantic cache hit
        "algorithm": "cosine_similarity",
        "embedding_model": "text-embedding-ada-002",
    },
}
Best Practices
Cache Key Design
- Consistent Hashing: Use consistent hash functions for keys
- Namespace Separation: Use prefixes to separate different cache types
- TTL Management: Set appropriate TTLs based on data freshness requirements
- Key Versioning: Include version information in cache keys
Performance Optimization
- Batch Operations: Use batch operations for multiple cache operations
- Connection Pooling: Maintain connection pools for Redis connections
- Compression: Compress large cache values to save memory
- Monitoring: Continuously monitor cache performance and optimize
Error Handling
- Graceful Degradation: Continue operation even if cache fails
- Fallback Strategies: Implement fallback mechanisms for cache misses
- Circuit Breakers: Use circuit breakers to prevent cache overload
- Retry Logic: Implement exponential backoff for failed operations