MongoDB Hybrid Search
This example demonstrates how to perform hybrid search operations that combine MongoDB's text search with vector similarity search for improved results.
Prerequisites
- MongoDB Atlas cluster with Vector Search enabled and a vector search index on the target collection (named `vector_index` in this example)
- A text index on the fields to be searched (this example creates it in the "Create Text Index" step below)
- Python 3.8+
- RecoAgent installed
Basic Hybrid Search Setup
from packages.rag.stores import MongoDBAtlasVectorStore, VectorDocument
from packages.rag.mongodb_retrievers import MongoDBHybridRetriever, MongoDBHybridConfig
# Connect to the Atlas collection that holds the documents and their embeddings.
# The URI below is a placeholder: substitute your own cluster address and credentials.
vector_store = MongoDBAtlasVectorStore(
    uri="mongodb+srv://username:password@cluster.mongodb.net/",
    database="recoagent",
    collection="documents",
    vector_search_index="vector_index",
)

# Hybrid search settings: how the two result lists are weighted and how many
# candidates each side contributes before the final cut.
config = MongoDBHybridConfig(
    text_weight=0.3,    # 30% weight for text search
    vector_weight=0.7,  # 70% weight for vector search
    vector_k=20,        # candidates taken from vector search
    text_k=20,          # candidates taken from text search
    final_k=10,         # results returned after merging
)

# The retriever used by the rest of the examples in this document.
retriever = MongoDBHybridRetriever(vector_store, config)
Create Text Index
# Build the text index that the text half of hybrid search queries against.
# NOTE(review): the sample documents below keep `title` inside `metadata`;
# confirm whether the index should target `metadata.title` instead.
text_fields = ["content", "title", "description"]
retriever.create_text_index(text_fields)

print("✅ Text index created for hybrid search")
Prepare Sample Data
# Sample corpus: four short articles with descriptive metadata.
# The `...` inside each embedding is a literal Ellipsis placeholder, not
# runnable data: replace every list with a real embedding vector.
# (Presumably the vector length must match the index configuration - confirm.)
documents = [
    VectorDocument(
        id="hybrid_doc1",
        content="Machine learning algorithms can automatically improve their performance through experience. These algorithms build mathematical models based on training data to make predictions or decisions.",
        embedding=[0.1, 0.2, 0.3, ...],  # Your embedding
        metadata={
            "title": "Introduction to Machine Learning",
            "category": "AI",
            "difficulty": "beginner",
            "tags": ["machine learning", "algorithms", "training"],
        },
    ),
    VectorDocument(
        id="hybrid_doc2",
        content="Neural networks are computing systems inspired by biological neural networks. They consist of interconnected nodes that process information using a connectionist approach.",
        embedding=[0.2, 0.3, 0.4, ...],  # Your embedding
        metadata={
            "title": "Understanding Neural Networks",
            "category": "AI",
            "difficulty": "intermediate",
            "tags": ["neural networks", "computing", "biology"],
        },
    ),
    VectorDocument(
        id="hybrid_doc3",
        content="Deep learning uses artificial neural networks with multiple layers to model and understand complex patterns in data. It has revolutionized fields like computer vision and natural language processing.",
        embedding=[0.3, 0.4, 0.5, ...],  # Your embedding
        metadata={
            "title": "Deep Learning Fundamentals",
            "category": "AI",
            "difficulty": "advanced",
            "tags": ["deep learning", "neural networks", "computer vision", "NLP"],
        },
    ),
    VectorDocument(
        id="hybrid_doc4",
        content="Natural language processing combines computational linguistics with machine learning to help computers understand, interpret, and manipulate human language.",
        embedding=[0.4, 0.5, 0.6, ...],  # Your embedding
        metadata={
            "title": "Natural Language Processing Guide",
            "category": "NLP",
            "difficulty": "intermediate",
            "tags": ["NLP", "linguistics", "machine learning", "language"],
        },
    ),
]

# Store the corpus; the return value of add_documents is echoed as-is.
success = vector_store.add_documents(documents)
print(f"✅ Documents added: {success}")
Basic Hybrid Search
# Run one hybrid query through the shared retriever and list what came back.
query = "machine learning algorithms and neural networks"
results = retriever.retrieve(query, k=5)

print(f"Hybrid search results for: '{query}'")
print(f"Found {len(results)} results:")
print()

# Number the hits from 1 and show the score, a content preview and key metadata.
for rank, hit in enumerate(results, start=1):
    print(f"{rank}. Score: {hit.score:.3f} (Hybrid)")
    meta = hit.chunk.metadata
    print(f" Title: {meta.get('title', 'N/A')}")
    print(f" Content: {hit.chunk.content[:150]}...")
    print(f" Category: {meta.get('category', 'N/A')}")
    print(f" Difficulty: {meta.get('difficulty', 'N/A')}")
    print()
Hybrid Search with Different Weights
# Compare how the text/vector weighting changes the top-ranked document.
weight_configs = [
    {"text_weight": 0.1, "vector_weight": 0.9, "name": "Vector-heavy"},
    {"text_weight": 0.5, "vector_weight": 0.5, "name": "Balanced"},
    {"text_weight": 0.9, "vector_weight": 0.1, "name": "Text-heavy"},
]

query = "deep learning neural networks"

# The loop variable is deliberately NOT called `config`: that name holds the
# module-level MongoDBHybridConfig used by later examples, and rebinding it
# to one of these plain dicts would hand a dict to the next retriever.
for weights in weight_configs:
    # Build a throwaway retriever that differs only in its weighting.
    hybrid_config = MongoDBHybridConfig(
        text_weight=weights["text_weight"],
        vector_weight=weights["vector_weight"],
    )
    test_retriever = MongoDBHybridRetriever(vector_store, hybrid_config)

    results = test_retriever.retrieve(query, k=3)

    print(f"\n{weights['name']} Search (Text: {weights['text_weight']}, Vector: {weights['vector_weight']})")
    if results:
        top = results[0]
        print(f"Top result: {top.chunk.metadata.get('title', 'N/A')} (Score: {top.score:.3f})")
    else:
        # An empty result list would otherwise raise IndexError on results[0].
        print("Top result: none (no documents matched)")
Hybrid Search with Metadata Filtering
# Restrict the hybrid search to documents whose metadata matches these filters.
# NOTE(review): the {"operator": ..., "value": ...} form is presumably translated
# by the retriever into a MongoDB `$in` match - confirm against the retriever.
filter_metadata = {
    "category": "AI",
    "difficulty": {"operator": "$in", "value": ["beginner", "intermediate"]},
}

results = retriever.retrieve(
    query="machine learning and artificial intelligence",
    k=5,
    filter_metadata=filter_metadata,
)

print("Filtered hybrid search results:")
print(f"Found {len(results)} results matching filters:")
print()

# One bullet per hit: title, score, difficulty and a short content preview.
for hit in results:
    meta = hit.chunk.metadata
    print(f"• {meta.get('title', 'N/A')}")
    print(f" Score: {hit.score:.3f}")
    print(f" Difficulty: {meta.get('difficulty', 'N/A')}")
    print(f" Content: {hit.chunk.content[:100]}...")
    print()
Async Hybrid Search
import asyncio
async def async_hybrid_search_example():
    """Run several hybrid searches concurrently.

    Each query is submitted through ``retriever.retrieve_async`` and all of
    the resulting awaitables are awaited together with ``asyncio.gather``.
    Awaiting them one after another would run plain coroutines sequentially;
    ``gather`` overlaps them whether ``retrieve_async`` returns a coroutine
    or an already-scheduled future.

    Returns:
        A list of ``(query, results)`` tuples in the same order as the
        queries were submitted.
    """
    queries = [
        "machine learning algorithms",
        "neural network architectures",
        "deep learning applications",
        "natural language processing",
    ]

    print("Running async hybrid searches...")

    # Create every awaitable first, then wait for all of them at once.
    searches = [retriever.retrieve_async(query, k=3) for query in queries]
    all_results = await asyncio.gather(*searches)

    # gather() preserves submission order, so results line up with queries.
    results_list = list(zip(queries, all_results))
    for query, results in results_list:
        print(f"✅ '{query}': {len(results)} results")

    return results_list


# Run async example
results_list = asyncio.run(async_hybrid_search_example())

# Display results
for query, results in results_list:
    print(f"\nQuery: '{query}'")
    for result in results[:2]:  # Show top 2 results
        print(f" {result.chunk.metadata.get('title', 'N/A')} (Score: {result.score:.3f})")
Advanced Hybrid Search
from packages.rag.mongodb_retrievers import MongoDBAdvancedRetriever
# The advanced retriever selects its strategy per call via `search_type`.
advanced_retriever = MongoDBAdvancedRetriever(vector_store, config)


def _print_titles(hits):
    """Print one line per hit: its title (or 'N/A') and its score."""
    for hit in hits:
        print(f" {hit.chunk.metadata.get('title', 'N/A')} (Score: {hit.score:.3f})")


# Run the same query through each strategy and compare the output.
query = "artificial intelligence and machine learning"

print("Comparing search strategies:")
print("=" * 50)

# Strategy 1: vector similarity only
vector_results = advanced_retriever.retrieve(query, k=3, search_type="vector")
print(f"\nVector Search ({len(vector_results)} results):")
_print_titles(vector_results)

# Strategy 2: hybrid search
hybrid_results = advanced_retriever.retrieve(query, k=3, search_type="hybrid")
print(f"\nHybrid Search ({len(hybrid_results)} results):")
_print_titles(hybrid_results)

# Strategy 3: faceted search over the category and difficulty fields
faceted_results = advanced_retriever.retrieve(
    query,
    k=3,
    search_type="faceted",
    facets=['category', 'difficulty'],
)
print(f"\nFaceted Search ({len(faceted_results)} results):")
_print_titles(faceted_results)
Performance Comparison
import time
def compare_search_performance():
    """Time one vector-only search against one hybrid search.

    Both searches use the same query and ``k`` on the module-level
    ``retriever``. Timing uses ``time.perf_counter()``, a monotonic
    high-resolution clock, so the measurement is not affected by system
    clock adjustments.

    Returns:
        A dict with ``vector_time`` and ``hybrid_time`` (seconds) and
        ``vector_results`` and ``hybrid_results`` (result counts).
    """
    query = "machine learning neural networks deep learning"
    k = 10

    # Vector-only search through the hybrid retriever's inner vector retriever.
    started = time.perf_counter()
    vector_results = retriever.vector_retriever.retrieve(query, k=k)
    vector_time = time.perf_counter() - started

    # Full hybrid search.
    started = time.perf_counter()
    hybrid_results = retriever.retrieve(query, k=k)
    hybrid_time = time.perf_counter() - started

    print("Performance Comparison:")
    print(f"Vector Search: {vector_time:.3f}s ({len(vector_results)} results)")
    print(f"Hybrid Search: {hybrid_time:.3f}s ({len(hybrid_results)} results)")
    if vector_time > 0:
        overhead = (hybrid_time - vector_time) / vector_time * 100
        print(f"Overhead: {overhead:.1f}%")
    else:
        # A zero baseline would make the percentage a division by zero.
        print("Overhead: n/a (vector search time too small to measure)")

    return {
        "vector_time": vector_time,
        "hybrid_time": hybrid_time,
        "vector_results": len(vector_results),
        "hybrid_results": len(hybrid_results),
    }


# Run performance comparison
performance = compare_search_performance()
Error Handling and Resilience
def resilient_hybrid_search(query, k=5, max_retries=3, retry_delay=1.0):
    """Hybrid search with retries and a vector-only fallback.

    The hybrid search is attempted up to ``max_retries`` times. If every
    attempt raises, a single vector-only search is tried through
    ``retriever.vector_retriever``.

    Args:
        query: Search text passed to the retriever.
        k: Number of results requested.
        max_retries: Maximum number of hybrid attempts. With a value below 1
            no search is performed and an empty list is returned.
        retry_delay: Seconds to wait between failed hybrid attempts.

    Returns:
        The hybrid results, or the fallback vector results, or ``[]`` when
        both paths fail.
    """
    if max_retries < 1:
        return []

    for attempt in range(1, max_retries + 1):
        try:
            results = retriever.retrieve(query, k=k)
        except Exception as e:
            # Broad catch is intentional: any failure counts as a failed attempt.
            print(f"❌ Hybrid search failed (attempt {attempt}): {e}")
            if attempt < max_retries:
                # Wait before the next attempt; no wait after the final one.
                time.sleep(retry_delay)
        else:
            print(f"✅ Hybrid search successful (attempt {attempt})")
            return results

    # Every hybrid attempt failed: fall back to vector search only.
    print("🔄 Falling back to vector search...")
    try:
        fallback_results = retriever.vector_retriever.retrieve(query, k=k)
    except Exception as fallback_error:
        print(f"❌ Fallback also failed: {fallback_error}")
        return []
    print("✅ Fallback successful")
    return fallback_results


# Test resilient search
results = resilient_hybrid_search("machine learning algorithms", k=5)
print(f"Resilient search returned {len(results)} results")
Complete Hybrid Search Example
def complete_hybrid_example():
    """End-to-end hybrid search: connect, index, insert, search, clean up.

    Opens its own vector store, creates the text index, inserts two sample
    documents, runs one hybrid query and prints the ranked titles. The sample
    documents are deleted and the store is closed afterwards, whether or not
    the example succeeded.

    Returns:
        The list of search results, or ``[]`` if any step raised.
    """
    # IDs of the sample documents, shared by the insert and the cleanup.
    sample_ids = ["complete_doc1", "complete_doc2"]

    # Initialize
    vector_store = MongoDBAtlasVectorStore(
        uri="mongodb+srv://username:password@cluster.mongodb.net/",
        database="recoagent",
        collection="documents",
        vector_search_index="vector_index",
    )

    try:
        # Built inside the try so a failure here still closes the store.
        config = MongoDBHybridConfig(
            text_weight=0.3,
            vector_weight=0.7,
        )
        retriever = MongoDBHybridRetriever(vector_store, config)

        # Create text index
        retriever.create_text_index(['content', 'title'])
        print("✅ Text index created")

        # Add sample documents
        documents = [
            VectorDocument(
                id=sample_ids[0],
                content="Machine learning is revolutionizing how we approach data analysis and pattern recognition in artificial intelligence systems.",
                embedding=[0.1] * 384,
                metadata={"title": "ML Revolution", "category": "AI"},
            ),
            VectorDocument(
                id=sample_ids[1],
                content="Neural networks provide powerful tools for understanding complex relationships in data through interconnected processing units.",
                embedding=[0.2] * 384,
                metadata={"title": "Neural Networks", "category": "AI"},
            ),
        ]
        vector_store.add_documents(documents)
        print("✅ Documents added")

        # Perform hybrid search
        results = retriever.retrieve("machine learning neural networks", k=5)
        print(f"✅ Hybrid search completed: {len(results)} results")

        # Display results
        for i, result in enumerate(results, 1):
            print(f"{i}. {result.chunk.metadata.get('title', 'N/A')} (Score: {result.score:.3f})")

        return results

    except Exception as e:
        print(f"❌ Error: {e}")
        return []

    finally:
        # Cleanup is best-effort. A failed delete must neither skip close()
        # nor replace the return value above with an exception.
        try:
            vector_store.delete_documents(sample_ids)
        except Exception as cleanup_error:
            print(f"⚠️ Cleanup failed: {cleanup_error}")
        finally:
            vector_store.close()


# Run complete example
results = complete_hybrid_example()
This hybrid search example demonstrates how to combine MongoDB's text search capabilities with vector similarity search for improved search results. The hybrid approach often provides better relevance and recall compared to using either method alone.