Skip to main content

MongoDB Faceted Search

This example demonstrates how to perform faceted search operations with MongoDB Atlas Vector Search, enabling users to filter and explore results by metadata facets.

Prerequisites

  • MongoDB Atlas cluster with Vector Search enabled
  • Documents with rich metadata to facet on (for example category, difficulty, year, author)
  • Python 3.8+
  • RecoAgent installed

Basic Faceted Search Setup

import os

from packages.rag.mongodb_retrievers import MongoDBFacetedRetriever
from packages.rag.stores import MongoDBAtlasVectorStore, VectorDocument

# Read the connection string from the environment so real credentials are
# never committed with the example. When MONGODB_URI is unset, the original
# placeholder is used (it will not connect until you replace it).
MONGODB_URI = os.environ.get(
    "MONGODB_URI",
    "mongodb+srv://username:password@cluster.mongodb.net/",
)

# Initialize vector store
vector_store = MongoDBAtlasVectorStore(
    uri=MONGODB_URI,
    database="recoagent",
    collection="documents",
    vector_search_index="vector_index",
)

# Initialize faceted retriever on top of the store
retriever = MongoDBFacetedRetriever(vector_store)

Prepare Rich Metadata Sample Data

# Create sample documents with diverse metadata for faceting.
#
# The embeddings are numeric placeholders so the snippet runs as written; a
# literal `...` inside a list is Python's Ellipsis object, not a number, and
# is not a usable vector component. Replace each placeholder with the real
# embedding of the document's content.
# NOTE(review): 384 mirrors the `[0.1] * 384` vectors used in the complete
# example; it presumably has to match the dimension of your vector index —
# confirm against your index definition.
EMBEDDING_DIM = 384

documents = [
    VectorDocument(
        id="facet_doc1",
        content="Machine learning algorithms can automatically improve their performance through experience and training data.",
        embedding=[0.1] * EMBEDDING_DIM,  # Your embedding
        metadata={
            "title": "Introduction to Machine Learning",
            "category": "AI",
            "subcategory": "Machine Learning",
            "difficulty": "beginner",
            "year": 2023,
            "author": "Dr. Smith",
            "language": "English",
            "tags": ["algorithms", "training", "performance"],
            "rating": 4.5,
            "read_time": 15,
        },
    ),
    VectorDocument(
        id="facet_doc2",
        content="Neural networks are computing systems inspired by biological neural networks, consisting of interconnected nodes.",
        embedding=[0.2] * EMBEDDING_DIM,  # Your embedding
        metadata={
            "title": "Understanding Neural Networks",
            "category": "AI",
            "subcategory": "Neural Networks",
            "difficulty": "intermediate",
            "year": 2023,
            "author": "Prof. Johnson",
            "language": "English",
            "tags": ["neural networks", "computing", "biology"],
            "rating": 4.8,
            "read_time": 25,
        },
    ),
    VectorDocument(
        id="facet_doc3",
        content="Deep learning uses artificial neural networks with multiple layers to model complex patterns in data.",
        embedding=[0.3] * EMBEDDING_DIM,  # Your embedding
        metadata={
            "title": "Deep Learning Fundamentals",
            "category": "AI",
            "subcategory": "Deep Learning",
            "difficulty": "advanced",
            "year": 2024,
            "author": "Dr. Williams",
            "language": "English",
            "tags": ["deep learning", "neural networks", "patterns"],
            "rating": 4.9,
            "read_time": 35,
        },
    ),
    VectorDocument(
        id="facet_doc4",
        content="Natural language processing combines computational linguistics with machine learning for language understanding.",
        embedding=[0.4] * EMBEDDING_DIM,  # Your embedding
        metadata={
            "title": "NLP Guide",
            "category": "NLP",
            "subcategory": "Language Processing",
            "difficulty": "intermediate",
            "year": 2024,
            "author": "Dr. Brown",
            "language": "English",
            "tags": ["NLP", "linguistics", "language"],
            "rating": 4.6,
            "read_time": 20,
        },
    ),
    VectorDocument(
        id="facet_doc5",
        content="Computer vision enables machines to interpret and understand visual information from the world.",
        embedding=[0.5] * EMBEDDING_DIM,  # Your embedding
        metadata={
            "title": "Computer Vision Basics",
            "category": "Computer Vision",
            "subcategory": "Image Processing",
            "difficulty": "beginner",
            "year": 2023,
            "author": "Dr. Davis",
            "language": "English",
            "tags": ["computer vision", "images", "visual"],
            "rating": 4.3,
            "read_time": 18,
        },
    ),
    VectorDocument(
        id="facet_doc6",
        content="Reinforcement learning is an area of machine learning concerned with how agents take actions in an environment.",
        embedding=[0.6] * EMBEDDING_DIM,  # Your embedding
        metadata={
            "title": "Reinforcement Learning",
            "category": "AI",
            "subcategory": "Reinforcement Learning",
            "difficulty": "advanced",
            "year": 2024,
            "author": "Prof. Wilson",
            "language": "English",
            "tags": ["reinforcement", "agents", "environment"],
            "rating": 4.7,
            "read_time": 30,
        },
    ),
]

# Add documents to MongoDB and report what the store returned
success = vector_store.add_documents(documents)
print(f"✅ Documents added: {success}")
# Basic faceted search: one query, four facets
query = "artificial intelligence machine learning"
facets = ["category", "difficulty", "year", "author"]

results = retriever.retrieve(query=query, k=10, facets=facets)

print(f"Faceted search results for: '{query}'")
print(f"Found {len(results)} results")
print()

# Summarise every hit: rank, score, a few metadata fields, content preview
summary_fields = (
    ("Title", "title"),
    ("Category", "category"),
    ("Difficulty", "difficulty"),
    ("Year", "year"),
)
for rank, hit in enumerate(results, start=1):
    meta = hit.chunk.metadata
    print(f"{rank}. Score: {hit.score:.3f}")
    for label, key in summary_fields:
        print(f" {label}: {meta.get(key, 'N/A')}")
    print(f" Content: {hit.chunk.content[:100]}...")
    print()

# Facet counts for the same query; each entry is read via its '_id' and
# 'count' keys
facet_info = retriever.get_facets(query, facets)
print("Facet Information:")
print("=" * 40)

for name, buckets in facet_info.items():
    print(f"\n{name.upper()}:")
    for bucket in buckets:
        print(f" {bucket['_id']}: {bucket['count']} documents")

Advanced Faceted Search with Filtering

# Faceted search restricted by metadata filters.
# A plain value is passed as-is; a dict carries an "operator" and a "value".
filter_metadata = {
    "category": "AI",
    "year": {"operator": "$gte", "value": 2023},
    "difficulty": {"operator": "$in", "value": ["beginner", "intermediate"]},
}

filtered_query = "machine learning neural networks"
filtered_facets = ["category", "subcategory", "difficulty", "rating"]

results = retriever.retrieve(
    query=filtered_query,
    k=10,
    facets=filtered_facets,
    filter_metadata=filter_metadata,
)

print(f"Filtered faceted search results:")
print(f"Found {len(results)} results matching filters")
print()

for hit in results:
    meta = hit.chunk.metadata
    print(f"• {meta.get('title', 'N/A')}")
    print(f" Category: {meta.get('category', 'N/A')}")
    print(f" Subcategory: {meta.get('subcategory', 'N/A')}")
    print(f" Difficulty: {meta.get('difficulty', 'N/A')}")
    print(f" Rating: {meta.get('rating', 'N/A')}")
    print(f" Score: {hit.score:.3f}")
    print()

# Facet counts computed with the same query, facets and filters
facet_info = retriever.get_facets(filtered_query, filtered_facets, filter_metadata)

print("Filtered Facet Information:")
print("=" * 40)

for name, buckets in facet_info.items():
    print(f"\n{name.upper()}:")
    for bucket in buckets:
        print(f" {bucket['_id']}: {bucket['count']} documents")
def interactive_faceted_search():
    """Prompt for a query and facet names, run the search, and print the results.

    An empty query falls back to "artificial intelligence". Facet names are
    read as a comma-separated list; blank entries (for example from "a,,b" or
    a trailing comma) and repeated names are dropped, and if nothing usable
    remains the defaults "category", "difficulty" and "year" are used.

    Uses the module-level ``retriever``.

    Returns:
        The results returned by ``retriever.retrieve`` for the query.
    """
    print("Interactive Faceted Search")
    print("=" * 30)

    # Get user input
    query = input("Enter your search query: ").strip() or "artificial intelligence"

    # Facets offered to the user
    available_facets = ["category", "subcategory", "difficulty", "year", "author", "language", "rating"]

    print(f"\nAvailable facets: {', '.join(available_facets)}")
    facet_input = input("Enter facets to include (comma-separated): ")

    # Strip each entry, skip blanks, and de-duplicate while keeping the order
    # the user typed (dict keys preserve insertion order).
    cleaned = (part.strip() for part in facet_input.split(","))
    facets = list(dict.fromkeys(name for name in cleaned if name))
    if not facets:
        facets = ["category", "difficulty", "year"]

    # Perform search
    results = retriever.retrieve(query, k=10, facets=facets)

    print(f"\nSearch Results for: '{query}'")
    print(f"Found {len(results)} results")
    print()

    # Display results, showing the value of every requested facet per hit
    for i, result in enumerate(results, 1):
        metadata = result.chunk.metadata
        print(f"{i}. {metadata.get('title', 'N/A')}")
        print(f" Score: {result.score:.3f}")
        for facet in facets:
            print(f" {facet.capitalize()}: {metadata.get(facet, 'N/A')}")
        print(f" Content: {result.chunk.content[:80]}...")
        print()

    # Display facet counts
    facet_info = retriever.get_facets(query, facets)
    print("Facet Breakdown:")
    print("-" * 20)

    for facet_name, facet_values in facet_info.items():
        print(f"\n{facet_name.upper()}:")
        for value in facet_values:
            print(f" {value['_id']}: {value['count']} documents")

    return results


# Run interactive search (uncomment to use)
# results = interactive_faceted_search()

Faceted Search with Range Filters

# Faceted search narrowed by numeric range conditions
range_filters = {
    "year": {"operator": "$gte", "value": 2023},
    "rating": {"operator": "$gte", "value": 4.5},
    "read_time": {"operator": "$lte", "value": 30},
}

results = retriever.retrieve(
    query="machine learning artificial intelligence",
    k=10,
    facets=["category", "difficulty", "year", "rating"],
    filter_metadata=range_filters,
)

# Echo the active filters next to the hit count
print(f"Range-filtered faceted search results:")
print(f"Found {len(results)} results with filters:")
print(f" Year >= 2023")
print(f" Rating >= 4.5")
print(f" Read time <= 30 minutes")
print()

for hit in results:
    meta = hit.chunk.metadata
    print(f"• {meta.get('title', 'N/A')}")
    print(f" Year: {meta.get('year', 'N/A')}")
    print(f" Rating: {meta.get('rating', 'N/A')}")
    print(f" Read Time: {meta.get('read_time', 'N/A')} min")
    print(f" Score: {hit.score:.3f}")
    print()

Multi-Faceted Search with Complex Filters

from collections import defaultdict

# Several conditions combined: two "$in" sets, two lower bounds, one exact match
complex_filters = {
    "category": {"operator": "$in", "value": ["AI", "NLP"]},
    "difficulty": {"operator": "$in", "value": ["intermediate", "advanced"]},
    "year": {"operator": "$gte", "value": 2023},
    "rating": {"operator": "$gte", "value": 4.5},
    "language": "English",
}

results = retriever.retrieve(
    query="machine learning neural networks deep learning",
    k=15,
    facets=["category", "subcategory", "difficulty", "year", "author", "rating"],
    filter_metadata=complex_filters,
)

print(f"Complex multi-faceted search results:")
print(f"Found {len(results)} results with complex filters")
print()

# Bucket the hits by category; hits without one go under 'Unknown'
grouped_results = defaultdict(list)
for hit in results:
    grouped_results[hit.chunk.metadata.get('category', 'Unknown')].append(hit)

# Print each bucket with its size, then the hits it contains
for category, members in grouped_results.items():
    print(f"\n{category.upper()} ({len(members)} results):")
    for hit in members:
        meta = hit.chunk.metadata
        print(f" • {meta.get('title', 'N/A')}")
        print(f" Difficulty: {meta.get('difficulty', 'N/A')}")
        print(f" Rating: {meta.get('rating', 'N/A')}")
        print(f" Score: {hit.score:.3f}")

Faceted Search Analytics

def analyze_faceted_search_performance():
    """Run a fixed list of queries and tally the facet counts they produce.

    For each query this prints the number of results and, per facet, the
    three values with the highest count. After all queries it prints, per
    facet, the five values with the highest count summed over every query.

    Uses the module-level ``retriever``.

    Returns:
        dict mapping facet name -> {facet value: count summed over all queries}.
    """
    facets = ["category", "difficulty", "year", "rating"]
    queries = [
        "machine learning",
        "neural networks",
        "artificial intelligence",
        "deep learning",
        "natural language processing",
    ]

    print("Faceted Search Analytics")
    print("=" * 30)

    all_facet_data = {}

    for query in queries:
        print(f"\nAnalyzing query: '{query}'")

        # Hits are only needed for their count; facet counts come separately
        results = retriever.retrieve(query, k=10, facets=facets)
        facet_info = retriever.get_facets(query, facets)

        print(f" Results: {len(results)}")

        # Fold this query's counts into the running totals
        for facet_name, buckets in facet_info.items():
            totals = all_facet_data.setdefault(facet_name, {})
            for bucket in buckets:
                facet_key = bucket['_id']
                count = bucket['count']
                totals[facet_key] = totals.get(facet_key, 0) + count

        # Show the three most frequent values of each facet for this query
        for facet_name, buckets in facet_info.items():
            ranked = sorted(buckets, key=lambda bucket: bucket['count'], reverse=True)
            top_ids = [bucket['_id'] for bucket in ranked[:3]]
            print(f" Top {facet_name}: {top_ids}")

    # Overall facet analysis across all queries
    print(f"\nOverall Facet Analysis:")
    print("-" * 25)

    for facet_name, totals in all_facet_data.items():
        ranked_totals = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
        print(f"\n{facet_name.upper()}:")
        for facet_value, total in ranked_totals[:5]:
            print(f" {facet_value}: {total} total occurrences")

    return all_facet_data


# Run analytics
facet_analytics = analyze_faceted_search_performance()

Advanced Faceted Search with MongoDB Advanced Retriever

from packages.rag.mongodb_retrievers import MongoDBAdvancedRetriever

# The advanced retriever selects a strategy through `search_type`
advanced_retriever = MongoDBAdvancedRetriever(vector_store)

# Same query for every strategy so the result counts can be compared
query = "machine learning artificial intelligence"
comparison_facets = ["category", "difficulty", "year"]

print("Comparing search strategies with faceting:")
print("=" * 50)

# Vector strategy (no facets are passed in this call)
vector_results = advanced_retriever.retrieve(query, k=5, search_type="vector")
print(f"\nVector Search: {len(vector_results)} results")

# Hybrid strategy (no facets are passed in this call)
hybrid_results = advanced_retriever.retrieve(query, k=5, search_type="hybrid")
print(f"Hybrid Search: {len(hybrid_results)} results")

# Faceted strategy, with the facet fields supplied
faceted_results = advanced_retriever.retrieve(
    query,
    k=5,
    search_type="faceted",
    facets=comparison_facets,
)
print(f"Faceted Search: {len(faceted_results)} results")

# Facet counts for the same query and facet fields
facets = advanced_retriever.get_facets(query, comparison_facets)

print(f"\nFacet Information:")
for name, buckets in facets.items():
    print(f"\n{name.upper()}:")
    for bucket in buckets:
        print(f" {bucket['_id']}: {bucket['count']} documents")

Complete Faceted Search Example

def complete_faceted_example():
    """Run the whole faceted-search flow end to end against its own store.

    Creates a vector store and faceted retriever, inserts two sample
    documents, runs a faceted search, prints the hits and the facet counts,
    and finally deletes the two samples and closes the store.

    The connection string is taken from the MONGODB_URI environment variable;
    when it is unset the placeholder URI is used.

    Returns:
        A ``(results, facets)`` tuple on success, or ``([], {})`` if any step
        inside the ``try`` block raised (the error is printed).
    """
    import os

    # Initialize
    vector_store = MongoDBAtlasVectorStore(
        uri=os.environ.get(
            "MONGODB_URI",
            "mongodb+srv://username:password@cluster.mongodb.net/",
        ),
        database="recoagent",
        collection="documents",
        vector_search_index="vector_index",
    )

    retriever = MongoDBFacetedRetriever(vector_store)

    search_query = "machine learning neural networks"
    facet_fields = ["category", "difficulty", "year", "rating"]

    try:
        # Add sample documents.
        # NOTE(review): the constant 384-element vectors are placeholders;
        # the length presumably has to match the vector index — confirm.
        sample_docs = [
            VectorDocument(
                id="complete_facet1",
                content="Machine learning enables computers to learn and improve from experience without being explicitly programmed.",
                embedding=[0.1] * 384,
                metadata={
                    "title": "ML Basics",
                    "category": "AI",
                    "difficulty": "beginner",
                    "year": 2023,
                    "rating": 4.5,
                },
            ),
            VectorDocument(
                id="complete_facet2",
                content="Neural networks are computing systems inspired by biological neural networks that can learn complex patterns.",
                embedding=[0.2] * 384,
                metadata={
                    "title": "Neural Networks",
                    "category": "AI",
                    "difficulty": "intermediate",
                    "year": 2024,
                    "rating": 4.8,
                },
            ),
        ]

        vector_store.add_documents(sample_docs)
        print("✅ Sample documents added")

        # Perform faceted search
        results = retriever.retrieve(
            query=search_query,
            k=5,
            facets=facet_fields,
        )

        print(f"✅ Faceted search completed: {len(results)} results")

        # Display results
        for i, result in enumerate(results, 1):
            metadata = result.chunk.metadata
            print(f"{i}. {metadata.get('title', 'N/A')}")
            print(f" Score: {result.score:.3f}")
            print(f" Category: {metadata.get('category', 'N/A')}")
            print(f" Difficulty: {metadata.get('difficulty', 'N/A')}")

        # Get facets
        facets = retriever.get_facets(search_query, facet_fields)

        print(f"\nFacet Information:")
        for facet_name, facet_values in facets.items():
            print(f"\n{facet_name.upper()}:")
            for value in facet_values:
                print(f" {value['_id']}: {value['count']} documents")

        return results, facets

    except Exception as e:
        # Demo boundary: report the failure and hand back empty results
        print(f"❌ Error: {e}")
        return [], {}

    finally:
        # Cleanup. The nested try/finally guarantees the store is closed even
        # when deleting the sample documents raises.
        try:
            vector_store.delete_documents(["complete_facet1", "complete_facet2"])
        finally:
            vector_store.close()


# Run complete example
results, facets = complete_faceted_example()

This faceted search example demonstrates how to leverage MongoDB's aggregation capabilities to provide rich, interactive search experiences with metadata-based filtering and exploration.

Next Steps