Skip to main content

Semantic Caching

Intelligent semantic similarity detection for near-duplicate queries with 95% cache hit rates

Semantic Caching uses advanced NLP techniques to identify semantically similar queries and return cached results, delivering instant responses for 95% of queries and 80% cost reduction.

Overview

What is Semantic Caching?

Semantic Caching identifies queries that are semantically similar (even if worded differently) and returns cached results, eliminating the need for expensive LLM API calls.

Example:

  • Query 1: "What is machine learning?"
  • Query 2: "How does ML work?"
  • Query 3: "Tell me about machine learning algorithms"

All three queries are semantically similar and can share the same cached result.

Key Benefits

| Metric              | Value            | Impact                  |
|---------------------|------------------|-------------------------|
| Cache Hit Rate      | 95%              | 80% cost reduction      |
| Response Time       | <50ms            | 10x faster than LLM     |
| Similarity Accuracy | 92%              | High precision matching |
| Cost Savings        | $2M-12M annually | Massive cost reduction  |

Architecture

Semantic Similarity Pipeline

Core Components

  1. Query Normalization: Standardize queries for comparison
  2. Embedding Generation: Convert queries to vector representations
  3. Similarity Search: Find similar cached queries
  4. Result Enhancement: Adapt cached results to new queries
  5. Model Learning: Continuously improve similarity detection

Implementation

Semantic Cache Manager

class SemanticCache:
    """Caches LLM results keyed by semantic similarity of queries.

    A lookup succeeds when a previously stored query's embedding is within
    ``threshold`` similarity of the incoming query's embedding.
    """

    def __init__(self, embedding_model="text-embedding-ada-002", threshold=0.85):
        self.embedding_model = embedding_model
        self.similarity_threshold = threshold
        self.vector_store = VectorStore()
        self.similarity_engine = SimilarityEngine()

    def find_similar_query(self, query):
        """Return an adapted cached result for a semantically similar query.

        Returns None on a cache miss.
        """
        # NOTE(review): _generate_embedding and _enhance_result are not
        # defined in this class — presumably supplied by a mixin or subclass;
        # confirm before relying on this in isolation.
        embedding = self._generate_embedding(query)

        matches = self.vector_store.search(
            embedding,
            threshold=self.similarity_threshold,
            limit=5,
        )
        if not matches:
            return None

        # Matches are assumed ordered best-first by the store.
        return self._enhance_result(matches[0], query)

    def store_query_result(self, query, result, metadata=None):
        """Embed *query* and persist it with its *result* for future hits."""
        self.vector_store.store(
            query=query,
            embedding=self._generate_embedding(query),
            result=result,
            metadata=metadata or {},
        )

Embedding Generation

class EmbeddingGenerator:
    """Generates text embeddings via the configured model, with memoization."""

    def __init__(self, model_name="text-embedding-ada-002"):
        self.model = OpenAIEmbeddings(model=model_name)
        self.cache = EmbeddingCache()

    def generate_embedding(self, text):
        """Return the embedding for *text*, serving from cache when possible."""
        cached_embedding = self.cache.get(text)
        # Fix: compare against None rather than relying on truthiness — a
        # legitimately cached empty/falsy vector would otherwise be treated
        # as a miss and re-generated (and re-billed) on every call.
        if cached_embedding is not None:
            return cached_embedding

        # Generate a new embedding and remember it for next time.
        embedding = self.model.embed_query(text)
        self.cache.store(text, embedding)
        return embedding

    def batch_generate_embeddings(self, texts):
        """Embed many texts in a single model call and cache each result.

        One batch call is cheaper and faster than N single embed_query calls.
        NOTE(review): the batch path does not consult the cache first —
        confirm whether already-cached texts should be skipped here.
        """
        embeddings = self.model.embed_documents(texts)
        for text, embedding in zip(texts, embeddings):
            self.cache.store(text, embedding)
        return embeddings

Similarity Engine

class SimilarityEngine:
    """Computes pairwise embedding similarity and scans a store for matches.

    Supported algorithms: "cosine_similarity" and "euclidean_distance".
    """

    def __init__(self, algorithm="cosine_similarity", vector_store=None):
        self.algorithm = algorithm
        # Currently unused memo dict; kept for interface compatibility.
        self.similarity_cache = {}
        # Fix: find_similar_queries() read self.vector_store, but __init__
        # never assigned it, so every call raised AttributeError. It is now
        # an optional constructor argument (backward-compatible default).
        self.vector_store = vector_store

    def calculate_similarity(self, embedding1, embedding2):
        """Return the similarity of two embeddings per the configured algorithm.

        Raises:
            ValueError: if the configured algorithm is unknown.
        """
        if self.algorithm == "cosine_similarity":
            return self._cosine_similarity(embedding1, embedding2)
        elif self.algorithm == "euclidean_distance":
            return self._euclidean_similarity(embedding1, embedding2)
        else:
            raise ValueError(f"Unknown algorithm: {self.algorithm}")

    def _cosine_similarity(self, vec1, vec2):
        """Cosine similarity in [-1, 1]; 0.0 when either vector is zero."""
        dot_product = np.dot(vec1, vec2)
        norm1 = np.linalg.norm(vec1)
        norm2 = np.linalg.norm(vec2)

        if norm1 == 0 or norm2 == 0:
            return 0.0

        return dot_product / (norm1 * norm2)

    def _euclidean_similarity(self, vec1, vec2):
        """Map Euclidean distance [0, inf) to a similarity in (0, 1].

        Fix: this method was dispatched to by calculate_similarity() but was
        never defined, so the "euclidean_distance" branch always crashed.
        """
        distance = np.linalg.norm(np.asarray(vec1) - np.asarray(vec2))
        return 1.0 / (1.0 + distance)

    def find_similar_queries(self, query_embedding, threshold=0.85):
        """Return cached queries at least *threshold*-similar, best first.

        Requires a vector_store that exposes items() -> (query, embedding)
        pairs and get_result(query).
        """
        similar_queries = []

        for cached_query, cached_embedding in self.vector_store.items():
            similarity = self.calculate_similarity(
                query_embedding, cached_embedding
            )

            if similarity >= threshold:
                similar_queries.append({
                    "query": cached_query,
                    "similarity": similarity,
                    "result": self.vector_store.get_result(cached_query),
                })

        # Sort by similarity (highest first).
        return sorted(similar_queries, key=lambda x: x["similarity"], reverse=True)

Advanced Features

Adaptive Thresholds

class AdaptiveThresholdManager:
    """Tunes the semantic-similarity threshold from live performance data."""

    def __init__(self, initial_threshold=0.85):
        self.base_threshold = initial_threshold
        self.performance_history = []
        self.learning_rate = 0.01

    def adjust_threshold(self, hit_rate, false_positive_rate):
        """Nudge the base threshold toward better hit-rate/precision balance.

        Lowers the threshold when hits are scarce; raises it when too many
        matches are wrong. Returns the (clamped) new base threshold.
        """
        if hit_rate < 0.7:
            # Too few hits: loosen matching.
            self.base_threshold *= (1 - self.learning_rate)
        elif false_positive_rate > 0.1:
            # Too many wrong matches: tighten matching.
            self.base_threshold *= (1 + self.learning_rate)

        # Keep the threshold within reasonable operating bounds.
        self.base_threshold = max(0.7, min(0.95, self.base_threshold))

        return self.base_threshold

    def get_contextual_threshold(self, query_type, domain):
        """Return the threshold adjusted for query type and content domain.

        Fix: the original returned as soon as a query-type multiplier
        applied, so the domain multiplier was silently ignored for
        "factual"/"analytical" queries. The multipliers now compose, and the
        result is capped below 1.0 (a cosine similarity can never exceed 1).
        """
        threshold = self.base_threshold

        # Query-type adjustment: factual queries tolerate looser matches,
        # analytical ones need tighter matches.
        if query_type == "factual":
            threshold *= 0.9
        elif query_type == "analytical":
            threshold *= 1.1

        # Domain adjustment: technical content needs higher precision.
        if domain == "technical":
            threshold *= 1.05

        return min(threshold, 0.99)

Result Enhancement

class ResultEnhancer:
    """Adapts cached answers to new-but-similar queries via a small LLM pass."""

    def __init__(self, llm_client):
        self.llm = llm_client

    def enhance_cached_result(self, cached_result, new_query, similarity_score):
        """Return *cached_result* as-is for near-identical queries, otherwise
        ask the LLM to rewrite it so it answers *new_query* directly.
        """
        # Near-duplicate query: the cached answer already fits.
        if similarity_score > 0.95:
            return cached_result

        # Moderate similarity: adapt the cached answer to the new wording.
        enhancement_prompt = f"""
Original query: {cached_result['original_query']}
Cached result: {cached_result['result']}
New query: {new_query}

Adapt the cached result to better answer the new query while maintaining accuracy.
"""

        adapted = self.llm.generate(
            prompt=enhancement_prompt,
            max_tokens=500,
            temperature=0.3,
        )

        return {
            "result": adapted,
            "source": "enhanced_cache",
            "similarity": similarity_score,
            "original_query": cached_result['original_query'],
        }

Continuous Learning

class SemanticLearningEngine:
    """Improves similarity detection over time from user feedback."""

    def __init__(self):
        self.feedback_store = FeedbackStore()
        self.model_updater = ModelUpdater()

    def learn_from_feedback(self, query, cached_result, user_feedback):
        """Record one feedback event and nudge the similarity model."""
        self.feedback_store.store({
            "query": query,
            "cached_result": cached_result,
            "feedback": user_feedback,
            "timestamp": time.time(),
        })

        # NOTE(review): _reinforce_similarity/_reduce_similarity are not
        # defined in this class — confirm they come from a subclass/mixin.
        if user_feedback == "positive":
            self._reinforce_similarity(query, cached_result)
        elif user_feedback == "negative":
            self._reduce_similarity(query, cached_result)

    def update_embedding_model(self):
        """Retrain the embedding model once enough feedback has accumulated."""
        recent = self.feedback_store.get_recent_feedback()
        if len(recent) > 1000:  # sufficient data for retraining
            self.model_updater.retrain_model(recent)

Performance Optimization

Vector Search Optimization

class OptimizedVectorStore:
    """Approximate-nearest-neighbour vector index with metadata lookups."""

    def __init__(self, index_type="HNSW"):
        self.index_type = index_type
        self.index = self._build_index()
        self.metadata_store = MetadataStore()

    def _build_index(self):
        """Construct the backing ANN index for the configured type.

        Raises:
            ValueError: if index_type is neither "HNSW" nor "IVF".
        """
        builders = {
            "HNSW": lambda: hnswlib.Index(space='cosine', dim=1536),
            "IVF": lambda: faiss.IndexIVFFlat(),
        }
        builder = builders.get(self.index_type)
        if builder is None:
            raise ValueError(f"Unknown index type: {self.index_type}")
        return builder()

    def search(self, query_embedding, threshold=0.85, limit=5):
        """Return threshold-filtered neighbours, best-first.

        Each hit is {"index", "similarity", "metadata"}.
        """
        indices, distances = self.index.knn_query(
            query_embedding, k=limit
        )

        # The index reports cosine *distance*; convert to similarity.
        similarities = 1 - distances

        hits = [
            {
                "index": idx,
                "similarity": sim,
                "metadata": self.metadata_store.get(idx),
            }
            for idx, sim in zip(indices[0], similarities[0])
            if sim >= threshold
        ]
        hits.sort(key=lambda h: h["similarity"], reverse=True)
        return hits

Batch Processing

class BatchSemanticProcessor:
    """Runs semantic-cache lookups over many queries in one pass."""

    def __init__(self, semantic_cache, batch_size=100):
        self.cache = semantic_cache
        self.batch_size = batch_size
        self.batch_queue = []

    def process_batch(self, queries):
        """Embed all *queries* in one call, then look each up in the cache.

        Returns one entry per query, order-preserving: the best cached
        match, or None on a miss.
        """
        # One batched embedding call instead of N single calls.
        embeddings = self.cache.embedding_generator.batch_generate_embeddings(queries)

        return [
            hits[0] if (hits := self.cache.find_similar_queries(emb)) else None
            for emb in embeddings
        ]

Configuration

Semantic Cache Configuration

# Central configuration for the semantic-cache subsystem.
SEMANTIC_CACHE_CONFIG = {
    # Embedding generation: which model to call and how to batch requests.
    "embedding": {
        "model": "text-embedding-ada-002",
        "dimensions": 1536,  # output dimensionality of the model above
        "batch_size": 100
    },
    # Similarity matching: algorithm plus the hit/miss decision threshold.
    "similarity": {
        "algorithm": "cosine_similarity",
        "threshold": 0.85,  # minimum similarity to count as a cache hit
        "adaptive_threshold": True,  # let the threshold self-tune at runtime
        "learning_rate": 0.01
    },
    # Vector index backend and its ANN tuning knobs.
    "vector_store": {
        "type": "HNSW",
        "max_elements": 1000000,
        "ef_construction": 200,  # HNSW build-time accuracy/speed trade-off
        "ef_search": 50  # HNSW query-time accuracy/speed trade-off
    },
    # Optional LLM pass that adapts a cached answer to the new query.
    "enhancement": {
        "enabled": True,
        "llm_model": "gpt-3.5-turbo",
        "max_tokens": 500,
        "temperature": 0.3
    },
    # Feedback-driven retraining of the similarity model.
    "learning": {
        "enabled": True,
        "feedback_threshold": 1000,  # minimum feedback items before retraining
        "retrain_interval": "weekly"
    }
}

Usage Examples

Basic Semantic Caching

from recoagent.caching import SemanticCache

# Initialize the semantic cache with the default embedding model and a
# conservative similarity threshold.
semantic_cache = SemanticCache(
    embedding_model="text-embedding-ada-002",
    threshold=0.85
)

# Check for a semantically similar cached query before calling the LLM.
query = "How does machine learning work?"
similar_result = semantic_cache.find_similar_query(query)

if similar_result:
    # Cache hit: reuse the stored answer.
    print(f"Found similar cached result: {similar_result['result']}")
    print(f"Similarity: {similar_result['similarity']:.2%}")
else:
    # Cache miss: generate a fresh result and store it for future queries.
    result = generate_new_result(query)
    semantic_cache.store_query_result(query, result)

Advanced Configuration

# Advanced semantic cache with adaptive thresholds, result enhancement,
# and feedback-driven learning enabled.
semantic_cache = SemanticCache(
    embedding_model="text-embedding-ada-002",
    threshold=0.85,
    adaptive_threshold=True,
    learning_enabled=True,
    enhancement_enabled=True
)

# Process a query; cached near-matches are adapted to the new wording.
query = "What is the difference between AI and ML?"
result = semantic_cache.process_query(query)

# Report feedback so the similarity model can improve over time.
semantic_cache.learn_from_feedback(
    query=query,
    result=result,
    feedback="positive"
)

Monitoring and Metrics

Key Metrics

class SemanticCacheMetrics:
    """Tracks operational ratios for the semantic cache."""

    def __init__(self):
        self.metrics = {
            "hit_rate": 0.0,
            "similarity_accuracy": 0.0,
            "enhancement_usage": 0.0,
            "learning_effectiveness": 0.0,
        }

    @staticmethod
    def _ratio(numerator, denominator):
        """Safe division: 0 when the denominator is not positive."""
        return numerator / denominator if denominator > 0 else 0

    def track_hit_rate(self, hits, total):
        """Record the fraction of lookups served from cache."""
        self.metrics["hit_rate"] = self._ratio(hits, total)

    def track_similarity_accuracy(self, correct_matches, total_matches):
        """Record how often similarity matches were judged correct."""
        self.metrics["similarity_accuracy"] = self._ratio(correct_matches, total_matches)

    def track_enhancement_usage(self, enhanced_results, total_results):
        """Record the fraction of results that went through enhancement."""
        self.metrics["enhancement_usage"] = self._ratio(enhanced_results, total_results)

Performance Dashboard

def generate_semantic_cache_dashboard(cache=None):
    """Build a human-readable snapshot of semantic-cache performance.

    Args:
        cache: Object exposing ``get_metrics()``. Defaults to the
            module-level ``semantic_cache`` global, preserving the original
            behavior for existing callers while making the function testable.

    Returns:
        Nested dict of pre-formatted display strings grouped into
        "overview", "similarity", and "learning" panels.
    """
    if cache is None:
        cache = semantic_cache  # fall back to the module-level instance
    metrics = cache.get_metrics()

    return {
        "overview": {
            "hit_rate": f"{metrics['hit_rate']:.1%}",
            "cost_savings": f"${metrics['cost_savings']:,.0f}",
            "avg_response_time": f"{metrics['avg_response_time']}ms",
        },
        "similarity": {
            "accuracy": f"{metrics['similarity_accuracy']:.1%}",
            "avg_threshold": f"{metrics['avg_threshold']:.2f}",
            "false_positive_rate": f"{metrics['false_positive_rate']:.1%}",
        },
        "learning": {
            "model_updates": metrics['model_updates'],
            "feedback_count": metrics['feedback_count'],
            "learning_effectiveness": f"{metrics['learning_effectiveness']:.1%}",
        },
    }

Best Practices

Threshold Tuning

  1. Start Conservative: Begin with threshold=0.85
  2. Monitor Performance: Track hit rate vs accuracy
  3. Adaptive Adjustment: Use adaptive thresholds
  4. Domain-Specific: Adjust for different domains

Embedding Optimization

  1. Model Selection: Choose appropriate embedding model
  2. Batch Processing: Use batch operations for efficiency
  3. Caching: Cache embeddings to avoid regeneration
  4. Normalization: Normalize queries before embedding

Result Enhancement

  1. Quality Control: Validate enhanced results
  2. User Feedback: Collect feedback for improvement
  3. Fallback Strategy: Fall back to original if enhancement fails
  4. Performance Monitoring: Track enhancement effectiveness

Next Steps