Skip to main content

Caching Architecture

Deep dive into the 4-layer cache architecture, intelligent routing, and performance optimization

Architecture Overview

The Caching Platform implements a sophisticated 4-layer architecture designed for maximum performance and cost efficiency:

Layer 1: Full Result Cache

Purpose

Complete query results with full context and formatting.

Characteristics

  • TTL: 24 hours
  • Size Limit: 1GB per cache
  • Hit Rate: 40-60%
  • Use Case: Identical queries, exact matches

Implementation

class FullResultCache:
    """Layer-1 cache: complete query results for exact-match lookups.

    Results are stored as a JSON envelope containing the result itself,
    a storage timestamp, and extracted metadata.
    """

    def __init__(self, redis_client, ttl=86400):
        # 24-hour default TTL, matching the documented L1 configuration.
        self.redis = redis_client
        self.ttl = ttl
        self.key_prefix = "full_result:"

    def get(self, query_hash):
        """Return the cached envelope for *query_hash*, or None on a miss.

        Decodes the JSON envelope written by set() so callers receive the
        stored dict ({"result", "timestamp", "metadata"}) rather than a raw
        JSON string — mirrors RetrievalCache.get_retrieval_data().
        """
        key = f"{self.key_prefix}{query_hash}"
        data = self.redis.get(key)
        if data:
            return json.loads(data)
        return None

    def set(self, query_hash, result):
        """Store *result* under *query_hash* with timestamp and metadata."""
        key = f"{self.key_prefix}{query_hash}"
        cache_data = {
            "result": result,
            "timestamp": time.time(),
            "metadata": self._extract_metadata(result),
        }
        self.redis.setex(key, self.ttl, json.dumps(cache_data))

    def _extract_metadata(self, result):
        """Best-effort metadata for a cached result (type name and size)."""
        return {"type": type(result).__name__, "size": len(str(result))}

Cache Key Strategy

def generate_cache_key(query, context=None):
    """Build a stable cache key of the form "<query_hash>:<context_hash>".

    The query is lowercased and stripped so trivially different spellings
    map to the same key. The context hash portion is empty when no truthy
    context is supplied.
    """
    normalized_query = query.lower().strip()

    # MD5 is used purely as a fast, non-cryptographic fingerprint here.
    context_hash = hashlib.md5(str(context).encode()).hexdigest()[:8] if context else ""

    # 16 hex chars of SHA-256 keeps keys short while keeping collisions unlikely.
    query_hash = hashlib.sha256(normalized_query.encode()).hexdigest()[:16]
    return f"{query_hash}:{context_hash}"

Layer 2: Retrieval Cache

Purpose

Cached document chunks and retrieval metadata for similar queries.

Characteristics

  • TTL: 7 days
  • Size Limit: 5GB per cache
  • Hit Rate: 25-35%
  • Use Case: Similar queries, document retrieval

Implementation

class RetrievalCache:
    """Layer-2 cache: document chunks and retrieval metadata."""

    def __init__(self, redis_client, ttl=604800):
        # 7-day default TTL, matching the documented L2 configuration.
        self.redis = redis_client
        self.ttl = ttl
        self.key_prefix = "retrieval:"

    def get_retrieval_data(self, query_hash):
        """Return the cached retrieval payload for *query_hash*, or None."""
        key = f"{self.key_prefix}{query_hash}"
        data = self.redis.get(key)
        if data:
            return json.loads(data)
        return None

    def store_retrieval_data(self, query_hash, documents, scores, metadata):
        """Store documents, relevance scores, and metadata for *query_hash*."""
        key = f"{self.key_prefix}{query_hash}"
        cache_data = {
            "documents": documents,
            "scores": scores,
            "metadata": metadata,
            "timestamp": time.time(),
        }
        self.redis.setex(key, self.ttl, json.dumps(cache_data))

Layer 3: Summary Cache

Purpose

Generated summaries and key insights for content summarization.

Characteristics

  • TTL: 3 days
  • Size Limit: 2GB per cache
  • Hit Rate: 15-25%
  • Use Case: Summary generation, insight extraction

Implementation

class SummaryCache:
    """Layer-3 cache: generated summaries and key insights."""

    def __init__(self, redis_client, ttl=259200):
        # 3-day default TTL, matching the documented L3 configuration.
        self.redis = redis_client
        self.ttl = ttl
        self.key_prefix = "summary:"

    def get_summary(self, content_hash):
        """Return the cached summary envelope for *content_hash*, or None.

        Decodes the JSON envelope written by store_summary() so callers get
        the stored dict ({"summary", "metadata", "timestamp"}) rather than a
        raw JSON string.
        """
        key = f"{self.key_prefix}{content_hash}"
        data = self.redis.get(key)
        if data:
            return json.loads(data)
        return None

    def store_summary(self, content_hash, summary, metadata):
        """Store *summary* and its metadata under *content_hash*."""
        key = f"{self.key_prefix}{content_hash}"
        cache_data = {
            "summary": summary,
            "metadata": metadata,
            "timestamp": time.time(),
        }
        self.redis.setex(key, self.ttl, json.dumps(cache_data))

Layer 4: Embedding Cache

Purpose

Vector embeddings for semantic similarity and search.

Characteristics

  • TTL: 30 days
  • Size Limit: 10GB per cache
  • Hit Rate: 20-30%
  • Use Case: Semantic search, similarity matching

Implementation

class EmbeddingCache:
    """Layer-4 cache: vector embeddings stored as raw float32 bytes."""

    def __init__(self, redis_client, ttl=2592000):
        # 30-day default TTL, matching the documented L4 configuration.
        self.redis = redis_client
        self.ttl = ttl
        self.key_prefix = "embedding:"

    def get_embedding(self, text_hash):
        """Return the cached embedding as a float32 numpy array, or None."""
        key = f"{self.key_prefix}{text_hash}"
        data = self.redis.get(key)
        if data:
            return np.frombuffer(data, dtype=np.float32)
        return None

    def store_embedding(self, text_hash, embedding):
        """Store *embedding* as raw little-endian float32 bytes.

        The vector is coerced to float32 first: get_embedding() decodes with
        dtype=float32, so writing e.g. float64 bytes unchanged would corrupt
        the round trip (wrong length and values).
        """
        key = f"{self.key_prefix}{text_hash}"
        payload = np.asarray(embedding, dtype=np.float32).tobytes()
        self.redis.setex(key, self.ttl, payload)

Intelligent Query Router

Query Classification

class QueryRouter:
    """Route incoming queries to the most appropriate cache layer.

    NOTE(review): ComplexityAnalyzer, IntentClassifier, and the
    ``_route_to_*`` helper methods are not defined in this snippet —
    presumably provided elsewhere in the platform; confirm before use.
    """

    def __init__(self):
        self.complexity_analyzer = ComplexityAnalyzer()
        self.intent_classifier = IntentClassifier()

    def route_query(self, query, context=None):
        """Pick a cache layer from the query's complexity and intent.

        Routing order: simple factual queries hit the full-result cache,
        medium analytical queries the retrieval cache, summarization
        requests the summary cache, and everything else falls through to
        the embedding cache.
        """
        complexity = self.complexity_analyzer.analyze(query)
        intent = self.intent_classifier.classify(query)

        if complexity == "simple" and intent == "factual":
            return self._route_to_full_cache(query)
        elif complexity == "medium" and intent == "analytical":
            return self._route_to_retrieval_cache(query)
        elif intent == "summarization":
            return self._route_to_summary_cache(query)
        else:
            return self._route_to_embedding_cache(query)

Cache Selection Logic

def select_cache_strategy(self, query, context):
    """Select the optimal cache strategy for *query*.

    Checks layers in order of decreasing specificity: exact match in the
    full-result cache, then semantic near-matches (cosine similarity >=
    0.85), then type-based routing to the retrieval, summary, or embedding
    caches.

    NOTE(review): *context* is accepted but never used here — confirm
    whether callers rely on it. The ``self.*`` caches and ``_is_*`` /
    ``_check_*`` helpers are defined elsewhere in the owning class.
    """
    # Exact matches are the cheapest win — try them first.
    exact_match = self.full_cache.get(query)
    if exact_match:
        return {"strategy": "exact", "result": exact_match}

    # Fall back to semantically similar cached queries.
    semantic_matches = self.semantic_cache.find_similar(query, threshold=0.85)
    if semantic_matches:
        return {"strategy": "semantic", "matches": semantic_matches}

    # No cached match — route by query type.
    if self._is_retrieval_query(query):
        return self._check_retrieval_cache(query)
    elif self._is_summary_query(query):
        return self._check_summary_cache(query)
    else:
        return self._check_embedding_cache(query)

Performance Optimization

Cache Warming Strategies

class CacheWarmer:
    """Proactively populate caches before queries arrive.

    NOTE(review): AnalyticsEngine and the ``_compute_result`` /
    ``_is_active_pattern`` / ``_warm_pattern_queries`` helpers are not
    defined in this snippet — confirm their contracts before relying on
    this class.
    """

    def __init__(self, cache_manager):
        self.cache = cache_manager
        self.analytics = AnalyticsEngine()

    def warm_popular_queries(self, limit=1000):
        """Pre-compute and cache results for the *limit* most popular queries."""
        popular_queries = self.analytics.get_popular_queries(limit=limit)

        for query in popular_queries:
            # Pre-compute so the first real request is a cache hit.
            result = self._compute_result(query)
            self.cache.store(query, result)

    def warm_time_based_patterns(self):
        """Warm the cache for query patterns active at the current time."""
        time_patterns = self.analytics.get_time_patterns()

        for pattern in time_patterns:
            if self._is_active_pattern(pattern):
                self._warm_pattern_queries(pattern)

Cache Optimization

class CacheOptimizer:
    """Tune cache TTLs and sizes from observed usage.

    NOTE(review): PerformanceMetrics and the ``_calculate_optimal_ttl`` /
    ``_reduce_cache_size`` / ``_increase_cache_size`` helpers are not
    defined in this snippet — confirm before use.
    """

    def __init__(self, cache_manager):
        self.cache = cache_manager
        self.metrics = PerformanceMetrics()

    def optimize_ttl(self):
        """Recompute each layer's TTL from current usage statistics."""
        usage_stats = self.metrics.get_usage_stats()

        for layer in self.cache.layers:
            optimal_ttl = self._calculate_optimal_ttl(layer, usage_stats)
            layer.update_ttl(optimal_ttl)

    def optimize_cache_size(self):
        """Shrink cold layers and grow hot ones based on hit rates.

        Thresholds: below 30% the layer is shrunk; above 80% it is grown.
        Layers in between are left untouched.
        """
        hit_rates = self.metrics.get_hit_rates()

        for layer, hit_rate in hit_rates.items():
            if hit_rate < 0.3:  # low hit rate — reclaim memory
                self._reduce_cache_size(layer)
            elif hit_rate > 0.8:  # high hit rate — give it more room
                self._increase_cache_size(layer)

Monitoring and Metrics

Key Performance Indicators

class CacheMetrics:
    """Accumulate per-layer cache KPIs: hit rates, latencies, savings."""

    def __init__(self):
        self.metrics = {
            "hit_rates": {},       # layer -> fraction of hits (0.0-1.0)
            "response_times": {},  # layer -> list of observed latencies
            "cost_savings": {},
            "throughput": {},
        }

    def track_hit_rate(self, layer, hit, miss):
        """Record the hit rate for *layer* from hit/miss counts.

        A layer with no traffic (hit + miss == 0) records a rate of 0.
        """
        total = hit + miss
        hit_rate = hit / total if total > 0 else 0
        self.metrics["hit_rates"][layer] = hit_rate

    def track_response_time(self, layer, response_time):
        """Append one observed response time for *layer*."""
        if layer not in self.metrics["response_times"]:
            self.metrics["response_times"][layer] = []
        self.metrics["response_times"][layer].append(response_time)

    def calculate_cost_savings(self):
        """Return the mean hit rate across layers as a cost-savings proxy.

        The previous version referenced an undefined ``total`` variable
        (NameError) and summed rates as if they were request counts; with
        only per-layer rates recorded, the average rate is the computable
        equivalent. Returns 0 when no hit rates have been tracked.
        """
        rates = self.metrics["hit_rates"]
        if not rates:
            return 0
        return sum(rates.values()) / len(rates)

Real-time Monitoring

class CacheMonitor:
    """Watch cache metrics and raise alerts on degradation.

    NOTE(review): AlertManager is not defined in this snippet. The metrics
    dict is expected to carry "hit_rates" and "avg_response_times" keys —
    CacheMetrics above only records raw "response_times", so presumably a
    manager-level aggregation produces the averages; confirm.
    """

    def __init__(self, cache_manager):
        self.cache = cache_manager
        self.alerts = AlertManager()

    def monitor_performance(self):
        """Scan current metrics and alert on low hit rates or slow layers.

        Thresholds: hit rate below 50%, or average response above 100 ms.
        """
        metrics = self.cache.get_metrics()

        # Hit-rate alerts.
        for layer, hit_rate in metrics["hit_rates"].items():
            if hit_rate < 0.5:
                self.alerts.send_alert(
                    f"Low hit rate in {layer}: {hit_rate:.2%}"
                )

        # Latency alerts.
        for layer, avg_time in metrics["avg_response_times"].items():
            if avg_time > 100:
                self.alerts.send_alert(
                    f"Slow response in {layer}: {avg_time}ms"
                )

Configuration

Cache Configuration

# Platform-wide cache configuration: one entry per layer (L1-L4), plus the
# shared Redis connection settings and semantic-match parameters.
CACHE_CONFIG = {
    "layers": {
        # Full result cache: exact matches, evicted least-recently-used.
        "L1": {"ttl": 86400, "max_size": "1GB", "strategy": "LRU"},      # 24 hours
        # Retrieval cache: document chunks, evicted least-frequently-used.
        "L2": {"ttl": 604800, "max_size": "5GB", "strategy": "LFU"},     # 7 days
        # Summary cache.
        "L3": {"ttl": 259200, "max_size": "2GB", "strategy": "LRU"},     # 3 days
        # Embedding cache: long-lived vectors, first-in-first-out eviction.
        "L4": {"ttl": 2592000, "max_size": "10GB", "strategy": "FIFO"},  # 30 days
    },
    "redis": {
        "host": "redis-cluster.internal",
        "port": 6379,
        # SECURITY: hardcoded credential — should be loaded from a secret
        # store or environment variable, not committed in config.
        "password": "secure_password",
        "max_connections": 100,
        "retry_on_timeout": True,
    },
    "semantic": {
        "threshold": 0.85,
        "algorithm": "cosine_similarity",
        "embedding_model": "text-embedding-ada-002",
    },
}

Best Practices

Cache Key Design

  1. Consistent Hashing: Use consistent hash functions for keys
  2. Namespace Separation: Use prefixes to separate different cache types
  3. TTL Management: Set appropriate TTLs based on data freshness requirements
  4. Key Versioning: Include version information in cache keys

Performance Optimization

  1. Batch Operations: Use batch operations for multiple cache operations
  2. Connection Pooling: Maintain connection pools for Redis connections
  3. Compression: Compress large cache values to save memory
  4. Monitoring: Continuously monitor cache performance and optimize

Error Handling

  1. Graceful Degradation: Continue operation even if cache fails
  2. Fallback Strategies: Implement fallback mechanisms for cache misses
  3. Circuit Breakers: Use circuit breakers to prevent cache overload
  4. Retry Logic: Implement exponential backoff for failed operations

Next Steps