Build a PDF search system using smart chunking and vector embeddings
Install Dependencies
```bash
pip install pixeltable tiktoken sentence-transformers
```
Define Your Workflow
table.py
```python
import pixeltable as pxt
from pixeltable.iterators import DocumentSplitter
from pixeltable.functions.huggingface import sentence_transformer

# Initialize app structure
pxt.drop_dir("pdf_search", force=True)
pxt.create_dir("pdf_search")

# Create documents table
documents_t = pxt.create_table(
    "pdf_search.documents",
    {"pdf": pxt.Document}
)

# Create chunked view for efficient processing
documents_chunks = pxt.create_view(
    "pdf_search.document_chunks",
    documents_t,
    iterator=DocumentSplitter.create(
        document=documents_t.pdf,
        separators="token_limit",
        limit=300  # Tokens per chunk
    )
)

# Configure embedding model
embed_model = sentence_transformer.using(
    model_id="intfloat/e5-large-v2"
)

# Add search capability
documents_chunks.add_embedding_index(
    column="text",
    string_embed=embed_model
)
```
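Before inserting any documents, you can sanity-check that the table and view were created. A minimal sketch, assuming table.py has already been run; `pxt.list_tables` is used here to enumerate the `pdf_search` directory, so adjust if your Pixeltable version differs:

```python
import pixeltable as pxt

# List what table.py created under the pdf_search directory
print(pxt.list_tables("pdf_search"))
# Expect entries for pdf_search.documents and pdf_search.document_chunks
```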
Use Your Workflow
app.py
```python
import pixeltable as pxt

# Connect to your tables
documents_t = pxt.get_table("pdf_search.documents")
documents_chunks = pxt.get_table("pdf_search.document_chunks")

# Sample document URLs
DOCUMENT_URL = (
    "https://github.com/pixeltable/pixeltable/raw/release/docs/resources/rag-demo/"
)

document_urls = [
    DOCUMENT_URL + doc
    for doc in [
        "Argus-Market-Digest-June-2024.pdf",
        "Company-Research-Alphabet.pdf",
        "Zacks-Nvidia-Report.pdf",
    ]
]

# Add documents to database
documents_t.insert({"pdf": url} for url in document_urls)

# Search documents
query = "What are the growth projections for tech companies?"
top_n = 3
sim = documents_chunks.text.similarity(query)
result = (
    documents_chunks.order_by(sim, asc=False)
    .select(documents_chunks.text, sim=sim)
    .limit(top_n)  # Keep only the top_n most similar chunks
    .collect()
)

# Print results
for i in result:
    print(f"Similarity: {i['sim']:.3f}")
    print(f"Text: {i['text']}\n")
```
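If you run several queries, it can help to wrap the search logic in a small helper. A minimal sketch using only the calls shown above; the function name and defaults are illustrative, not part of the Pixeltable API:

```python
import pixeltable as pxt

documents_chunks = pxt.get_table("pdf_search.document_chunks")

def search_pdfs(question: str, top_n: int = 3):
    """Hypothetical helper: return the top_n chunks most similar to `question`."""
    sim = documents_chunks.text.similarity(question)
    return (
        documents_chunks.order_by(sim, asc=False)
        .select(documents_chunks.text, sim=sim)
        .limit(top_n)
        .collect()
    )

for row in search_pdfs("How is Nvidia positioned in the AI market?"):
    print(f"{row['sim']:.3f}  {row['text'][:80]}...")
```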
The DocumentSplitter iterator runs at insert time, breaking each PDF into roughly 300-token chunks before anything downstream sees it:

```python
iterator=DocumentSplitter.create(
    document=documents_t.pdf,
    separators="token_limit",
    limit=300
)
```
The embedding index stores a vector for every chunk and is what powers the `similarity()` calls above; it stays current as chunks are added:

```python
documents_chunks.add_embedding_index(
    column="text",
    string_embed=embed_model
)
```
Because chunking and embedding are declared on the schema, inserting a new document triggers both automatically:

```python
documents_t.insert([{"pdf": new_url}])
# Chunking and embeddings update automatically
```
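A quick way to observe the incremental update is to compare chunk counts before and after an insert. A minimal sketch; the URL is a placeholder, not a real document:

```python
before = documents_chunks.count()

# Placeholder URL: substitute a reachable PDF
new_url = "https://example.com/new-report.pdf"
documents_t.insert([{"pdf": new_url}])

# The view grew without any manual re-chunking or re-embedding
print(f"chunks before: {before}, after: {documents_chunks.count()}")
```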
Text Chunking
DocumentSplitter supports several strategies, so you can match chunk granularity to your documents:

```python
# Chunk by paragraphs
chunks_by_para = pxt.create_view(
    "pdf_search.para_chunks",
    documents_t,
    iterator=DocumentSplitter.create(
        document=documents_t.pdf,
        separators="paragraph"
    )
)

# Chunk by a fixed character budget
chunks_by_size = pxt.create_view(
    "pdf_search.size_chunks",
    documents_t,
    iterator=DocumentSplitter.create(
        document=documents_t.pdf,
        separators="char_limit",
        limit=1000  # characters per chunk
    )
)
```
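To choose between strategies, it can help to compare how many chunks each view produces. A minimal sketch, assuming the views above exist and documents have already been inserted:

```python
# Compare chunk counts across the three chunking strategies
for name, view in [
    ("token_limit(300)", documents_chunks),
    ("paragraph", chunks_by_para),
    ("char_limit(1000)", chunks_by_size),
]:
    print(f"{name}: {view.count()} chunks")
```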
PDF Processing
`insert` accepts any iterable of rows, so loading a whole document set is a single call:

```python
# Bulk document insertion
pdf_urls = [
    "https://example.com/doc1.pdf",
    "https://example.com/doc2.pdf",
    "https://example.com/doc3.pdf"
]
documents_t.insert({"pdf": url} for url in pdf_urls)
```
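Document columns accept local file paths as well as URLs, so a folder of PDFs can be loaded the same way. A sketch with a hypothetical `./reports` directory:

```python
from pathlib import Path

# Hypothetical local folder of PDFs
pdf_dir = Path("./reports")
documents_t.insert(
    {"pdf": str(p)} for p in sorted(pdf_dir.glob("*.pdf"))
)
```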
Vector Search
Searches can be packaged as reusable query functions that filter by a similarity threshold and return chunk metadata alongside the text:

```python
@pxt.query
def search_with_metadata(
    query: str,
    min_similarity: float,
    limit: int
):
    sim = documents_chunks.text.similarity(query)
    return (
        documents_chunks.where(sim >= min_similarity)
        .order_by(sim, asc=False)
        .select(
            documents_chunks.text,
            # pdf_name / page_number assume the chunk view exposes
            # document metadata columns
            documents_chunks.pdf_name,
            documents_chunks.page_number,
            similarity=sim
        )
        .limit(limit)
    )
```
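The same filter-and-rank pattern also works as an ad-hoc dataframe expression, using only calls shown earlier; the query string and threshold here are illustrative:

```python
sim = documents_chunks.text.similarity("semiconductor revenue outlook")
res = (
    documents_chunks.where(sim >= 0.8)  # illustrative threshold
    .order_by(sim, asc=False)
    .select(documents_chunks.text, similarity=sim)
    .limit(5)
    .collect()
)
for row in res:
    print(f"{row['similarity']:.3f}  {row['text'][:80]}")
```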