Build a web content search system using smart chunking and vector embeddings
Install Dependencies
pip install pixeltable tiktoken sentence-transformers
Define Your Workflow
table.py
import pixeltable as pxt
from pixeltable.iterators import DocumentSplitter
from pixeltable.functions.huggingface import sentence_transformer

# Initialize app structure
pxt.drop_dir("web_search", force=True)
pxt.create_dir("web_search")

# Create website table
websites_t = pxt.create_table(
    "web_search.websites",
    {"website": pxt.Document}
)

# Create chunked view for efficient processing
websites_chunks = pxt.create_view(
    "web_search.website_chunks",
    websites_t,
    iterator=DocumentSplitter.create(
        document=websites_t.website,
        separators="token_limit",
        limit=300  # tokens per chunk
    )
)

# Configure embedding model
embed_model = sentence_transformer.using(
    model_id="intfloat/e5-large-v2"
)

# Add search capability: index each chunk's text
websites_chunks.add_embedding_index(
    column="text",
    string_embed=embed_model
)
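Before adding data, you can confirm the structure was created. A minimal sanity check, assuming table.py ran without errors (the chunk count stays at zero until you insert a website):

# Sanity check (a sketch; run after table.py)
import pixeltable as pxt

websites_chunks = pxt.get_table("web_search.website_chunks")
print(websites_chunks.count())  # 0 until a website is inserted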
Use Your Workflow
app.py
import pixeltable as pxt

# Connect to your tables
websites_t = pxt.get_table("web_search.websites")
websites_chunks = pxt.get_table("web_search.website_chunks")

# Add websites to index
urls = [
    "https://quotes.toscrape.com/",
    "https://example.com",
]
websites_t.insert({"website": url} for url in urls)

# Search content
query = "Find inspirational quotes about life"
sim = websites_chunks.text.similarity(query)
top_k = 3
results = (
    websites_chunks.order_by(sim, asc=False)
    .select(
        websites_chunks.text,
        websites_chunks.website,
        similarity=sim
    )
    .limit(top_k)
).collect()

# Print results
for r in results:
    print(f"Similarity: {r['similarity']:.3f}")
    print(f"Source: {r['website']}")
    print(f"Content: {r['text']}\n")
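Because the chunked view and embedding index are maintained incrementally, you can add more pages later without re-running the setup script. A short sketch (the URL is illustrative):

# New inserts are chunked and embedded automatically
websites_t.insert([{"website": "https://quotes.toscrape.com/page/2/"}])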
Web Processing

websites_t.insert([{"website": "https://example.com"}])

Content Chunking

iterator=DocumentSplitter.create(
    document=websites_t.website,
    separators="token_limit"
)

Vector Search

websites_chunks.add_embedding_index(
    column="text",
    string_embed=embed_model
)
# Chunk by paragraphs
chunks_by_para = pxt.create_view(
    "web_search.para_chunks",
    websites_t,
    iterator=DocumentSplitter.create(
        document=websites_t.website,
        separators="paragraph"
    )
)

# Chunk by character count
chunks_by_size = pxt.create_view(
    "web_search.size_chunks",
    websites_t,
    iterator=DocumentSplitter.create(
        document=websites_t.website,
        separators="char_limit",
        limit=1000  # characters per chunk
    )
)
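To see how the strategies differ in practice, compare the number of rows each view produces. A minimal sketch, assuming the views above exist and at least one website has been inserted:

# Compare chunk counts across strategies
for name in (
    "web_search.website_chunks",
    "web_search.para_chunks",
    "web_search.size_chunks",
):
    view = pxt.get_table(name)
    print(f"{name}: {view.count()} chunks")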
@pxt.query
def search_with_metadata(
    query: str,
    min_similarity: float,
    limit: int
):
    sim = websites_chunks.text.similarity(query)
    return (
        websites_chunks.where(sim >= min_similarity)
        .order_by(sim, asc=False)
        .select(
            websites_chunks.text,
            websites_chunks.website,
            similarity=sim
        )
        .limit(limit)
    )
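A @pxt.query function can also be wired into a computed column, so every row inserted into a driver table is searched automatically. A minimal sketch under that assumption; the questions table and column names here are hypothetical, not part of the setup above:

# Hypothetical driver table; names are illustrative
questions_t = pxt.create_table(
    "web_search.questions", {"question": pxt.String}
)

# Each inserted question is answered with its matching chunks
questions_t.add_computed_column(
    context=search_with_metadata(questions_t.question, 0.5, 3)
)

questions_t.insert([{"question": "inspirational quotes about life"}])
print(questions_t.select(questions_t.question, questions_t.context).collect())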