Building a Website Search Workflow

A Pixeltable website search workflow runs in two phases:

  1. Define your workflow structure (once)
  2. Query your content database (anytime)

Install Dependencies

pip install pixeltable tiktoken sentence-transformers

Define Your Workflow

Create table.py:

import pixeltable as pxt
from pixeltable.iterators import DocumentSplitter
from pixeltable.functions.huggingface import sentence_transformer

# Initialize app structure
pxt.drop_dir("web_search", force=True)
pxt.create_dir("web_search")

# Create website table
websites_t = pxt.create_table(
    "web_search.websites", 
    {"website": pxt.Document}
)

# Create chunked view for efficient processing
websites_chunks = pxt.create_view(
    "web_search.website_chunks",
    websites_t,
    iterator=DocumentSplitter.create(
        document=websites_t.website,
        separators="token_limit",
        limit=300  # Tokens per chunk
    )
)

# Configure embedding model
embed_model = sentence_transformer.using(
    model_id="intfloat/e5-large-v2"
)

# Add search capability
websites_chunks.add_embedding_index(
    column="text",
    string_embed=embed_model
)
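
Run table.py once; the directory, table, view, and embedding index all persist, so you only define this structure a single time. As a quick sanity check (a sketch using pxt.list_tables, which lists the tables and views under a directory path):

import pixeltable as pxt

# Expect the table and view created above, e.g.
# ['web_search.websites', 'web_search.website_chunks']
print(pxt.list_tables("web_search"))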

Use Your Workflow

Create app.py:

import pixeltable as pxt

# Connect to your table and view
websites_t = pxt.get_table("web_search.websites")
websites_chunks = pxt.get_table("web_search.website_chunks")

# Add websites to the index
urls = [
    "https://quotes.toscrape.com/",
    "https://example.com",
]

websites_t.insert({"website": url} for url in urls)

# Search content
query = "Find inspirational quotes about life"
sim = websites_chunks.text.similarity(query)
top_k = 3
results = (
    websites_chunks.order_by(sim, asc=False)
    .select(
        websites_chunks.text,
        websites_chunks.website,
        similarity=sim
    )
    .limit(top_k)
).collect()

# Print results
for r in results:
    print(f"Similarity: {r['similarity']:.3f}")
    print(f"Source: {r['website']}")
    print(f"Content: {r['text']}\n")

What Makes This Different?

Web Scraping

Insert a URL and Pixeltable fetches and parses the page content automatically:

websites_t.insert([{"website": "https://example.com"}])

Smart Chunking

Token-aware content splitting:

iterator=DocumentSplitter.create(
    document=websites_t.website,
    separators="token_limit",
    limit=300  # token_limit requires an explicit limit
)

Vector Search

The embedding index enables natural-language search over chunk text:

websites_chunks.add_embedding_index(
    column="text",
    string_embed=embed_model
)
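
With the index in place, searching follows the same similarity/order_by pattern used in app.py:

sim = websites_chunks.text.similarity("inspirational quotes about life")
results = (
    websites_chunks.order_by(sim, asc=False)
    .select(websites_chunks.text, similarity=sim)
    .limit(3)
    .collect()
)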

Advanced Usage

Custom Chunking Strategies

Configure different chunking approaches:

# Chunk by paragraphs
chunks_by_para = pxt.create_view(
    "web_search.para_chunks",
    websites_t,
    iterator=DocumentSplitter.create(
        document=websites_t.website,
        separators="paragraph"
    )
)

# Chunk by character count
chunks_by_size = pxt.create_view(
    "web_search.size_chunks",
    websites_t,
    iterator=DocumentSplitter.create(
        document=websites_t.website,
        separators="char_limit",
        limit=1000  # characters per chunk
    )
)
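
Each view is searched independently, so to run natural-language queries against one of these alternative views, give it its own embedding index (a sketch that reuses the embed_model defined in table.py):

# The paragraph-level view needs its own index before similarity() works on it
chunks_by_para.add_embedding_index(
    column="text",
    string_embed=embed_model
)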

Advanced Search Functions

Use @pxt.query to define reusable, parameterized search functions, for example one with a minimum-similarity threshold:

@pxt.query
def search_with_metadata(
    query: str,
    min_similarity: float,
    limit: int
):
    sim = websites_chunks.text.similarity(query)
    return (
        websites_chunks.where(sim >= min_similarity)
        .order_by(sim, asc=False)
        .select(
            websites_chunks.text,
            websites_chunks.website,
            similarity=sim
        )
        .limit(limit)
    )
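
Query functions compose with the rest of Pixeltable. One way to use search_with_metadata is inside a computed column, so every inserted row is answered automatically (a sketch; the questions table and its column names are hypothetical):

# Hypothetical table of incoming questions
questions_t = pxt.create_table(
    "web_search.questions", {"question": pxt.String}
)

# Store the top matching chunks alongside each question
questions_t.add_computed_column(
    context=search_with_metadata(questions_t.question, 0.5, 3)
)

questions_t.insert([{"question": "Find inspirational quotes about life"}])
print(questions_t.select(questions_t.question, questions_t.context).collect())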