Multimodal Vector Database
Build a web content search system using smart chunking and vector embeddings
Building a Website Search Workflow
Pixeltable website search works in two phases:
- Define your workflow structure (once)
- Query your content database (anytime)
1. Install Dependencies
pip install pixeltable tiktoken sentence-transformers
2. Define Your Workflow
Create table.py:
import pixeltable as pxt
from pixeltable.iterators import DocumentSplitter
from pixeltable.functions.huggingface import sentence_transformer

# Initialize app structure
pxt.drop_dir("web_search", force=True)
pxt.create_dir("web_search")

# Create website table
websites_t = pxt.create_table(
    "web_search.websites",
    {"website": pxt.Document}
)

# Create chunked view for efficient processing
websites_chunks = pxt.create_view(
    "web_search.website_chunks",
    websites_t,
    iterator=DocumentSplitter.create(
        document=websites_t.website,
        separators="token_limit",
        limit=300  # tokens per chunk
    )
)

# Configure embedding model
embed_model = sentence_transformer.using(
    model_id="intfloat/e5-large-v2"
)

# Add search capability
websites_chunks.add_embedding_index(
    column="text",
    string_embed=embed_model
)

# Define search query
@pxt.query
def search_content(query_text: str, limit: int = 5):
    sim = websites_chunks.text.similarity(query_text)
    return (
        websites_chunks.order_by(sim, asc=False)
        .select(
            websites_chunks.text,
            websites_chunks.website,
            similarity=sim
        )
        .limit(limit)
    )
3. Use Your Workflow
Create app.py:
import pixeltable as pxt
import time

# Connect to your tables (views are also retrieved with get_table)
websites_t = pxt.get_table("web_search.websites")
websites_chunks = pxt.get_table("web_search.website_chunks")

# Add websites with rate limiting
urls = [
    "https://quotes.toscrape.com/",
    "https://example.com",
    "https://docs.pixeltable.io"
]

for url in urls:
    try:
        websites_t.insert([{"website": url}])  # insert takes a list of rows
        time.sleep(1)  # Respect rate limits
    except Exception as e:
        print(f"Failed to process {url}: {e}")

# Search content
@pxt.query
def find_content(query: str, top_k: int = 5):
    sim = websites_chunks.text.similarity(query)
    return (
        websites_chunks.order_by(sim, asc=False)
        .select(
            websites_chunks.text,
            websites_chunks.website,
            similarity=sim
        )
        .limit(top_k)
    )

# Example search
results = find_content(
    "Find inspirational quotes about life"
).collect()

# Print results
for r in results:
    print(f"Similarity: {r['similarity']:.3f}")
    print(f"Source: {r['website']}")
    print(f"Content: {r['text']}\n")
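Because chunking and embedding are defined declaratively on the view, rows inserted later are processed automatically. A small sketch reusing the app.py objects (the tag URL is illustrative):

# New pages are chunked and embedded on insert -- no reprocessing step
websites_t.insert([{"website": "https://quotes.toscrape.com/tag/life/"}])

# The same query function now searches the new content too
for r in find_content("quotes about life", top_k=3).collect():
    print(f"{r['similarity']:.3f}  {r['website']}")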
What Makes This Different?
Web Scraping
Automatic content extraction:
websites_t.insert([{
    "website": "https://example.com"
}])
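Since insert accepts a list of rows, several pages can also be ingested in a single call; a sketch using URLs from app.py:

# One call fetches and parses several pages
websites_t.insert([
    {"website": "https://quotes.toscrape.com/"},
    {"website": "https://example.com"}
])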
Smart Chunking
Token-aware content splitting:
iterator=DocumentSplitter.create(
    document=websites_t.website,
    separators="token_limit"
)
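To inspect what the splitter produced, you can pull a few chunks straight from the view; a quick sketch reusing websites_chunks from table.py:

# Peek at the first few chunks and their source pages
preview = websites_chunks.select(
    websites_chunks.text,
    websites_chunks.website
).limit(3).collect()
print(preview)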
Vector Search
Natural language search:
websites_chunks.add_embedding_index(
    column="text",
    string_embed=embed_model
)
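Once the index exists, similarity expressions can also be used ad hoc, outside a @pxt.query function, following the same pattern as search_content above:

# Ad-hoc semantic search against the index
sim = websites_chunks.text.similarity("privacy policy")
top = (
    websites_chunks.order_by(sim, asc=False)
    .select(websites_chunks.text, similarity=sim)
    .limit(3)
    .collect()
)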
Workflow Components
Advanced web handling:
- HTML content extraction
- Text cleaning and normalization
- Structure preservation
- Automatic encoding detection
Intelligent text splitting:
- Token-aware segmentation
- Configurable chunk sizes
- Context preservation
- Multiple chunking strategies
High-quality search:
- E5 text embeddings
- Fast similarity search
- Natural language queries
- Configurable similarity thresholds
Advanced Usage
Custom Chunking Strategies
Configure different chunking approaches:
# Chunk by paragraphs
chunks_by_para = pxt.create_view(
    "web_search.para_chunks",
    websites_t,
    iterator=DocumentSplitter.create(
        document=websites_t.website,
        separators="paragraph"
    )
)

# Chunk by a fixed character budget
chunks_by_size = pxt.create_view(
    "web_search.size_chunks",
    websites_t,
    iterator=DocumentSplitter.create(
        document=websites_t.website,
        separators="char_limit",
        limit=1000  # characters per chunk
    )
)
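Separators can also be combined as a comma-separated list, splitting on structural boundaries first and then capping chunk size. A sketch, assuming the combined-separator form of DocumentSplitter:

# Split on paragraphs, then cap each chunk at 300 tokens
chunks_combined = pxt.create_view(
    "web_search.combined_chunks",
    websites_t,
    iterator=DocumentSplitter.create(
        document=websites_t.website,
        separators="paragraph,token_limit",
        limit=300
    )
)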
Batch Processing with Rate Limiting
Process multiple websites responsibly:
import time
from typing import List

def batch_process_urls(urls: List[str], delay: float = 1.0):
    """Process multiple URLs with rate limiting."""
    results = []
    for url in urls:
        try:
            websites_t.insert([{"website": url}])
            results.append({"url": url, "status": "success"})
        except Exception as e:
            results.append({
                "url": url,
                "status": "failed",
                "error": str(e)
            })
        time.sleep(delay)
    return results
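A short usage sketch, reusing the urls list from app.py:

# Ingest with a 2-second delay between requests
report = batch_process_urls(urls, delay=2.0)
failed = [r for r in report if r["status"] == "failed"]
print(f"Processed {len(report)} URLs, {len(failed)} failed")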
Advanced Search Functions
Create specialized search functions:
@pxt.query
def search_with_metadata(
    query: str,
    min_similarity: float = 0.7,
    limit: int = 5
):
    sim = websites_chunks.text.similarity(query)
    return (
        websites_chunks.where(sim >= min_similarity)
        .order_by(sim, asc=False)
        .select(
            websites_chunks.text,
            websites_chunks.website,
            similarity=sim
        )
        .limit(limit)
    )
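Called like any other query function, it returns only chunks that clear the similarity floor:

# Keep only strong matches (similarity >= 0.8)
hits = search_with_metadata(
    "inspirational quotes", min_similarity=0.8, limit=3
).collect()
for h in hits:
    print(f"{h['similarity']:.3f}  {h['website']}")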