Building a Website Search Workflow

Website search with Pixeltable works in two phases:

  1. Define your workflow structure (once)
  2. Query your content database (anytime)

Install Dependencies

pip install pixeltable tiktoken sentence-transformers

Define Your Workflow

Create table.py:

import pixeltable as pxt
from pixeltable.iterators import DocumentSplitter
from pixeltable.functions.huggingface import sentence_transformer

# Initialize app structure
pxt.drop_dir("web_search", force=True)
pxt.create_dir("web_search")

# Create website table
websites_t = pxt.create_table(
    "web_search.websites", 
    {"website": pxt.Document}
)

# Create chunked view for efficient processing
websites_chunks = pxt.create_view(
    "web_search.website_chunks",
    websites_t,
    iterator=DocumentSplitter.create(
        document=websites_t.website,
        separators="token_limit",
        limit=300  # Tokens per chunk
    )
)

# Configure embedding model
embed_model = sentence_transformer.using(
    model_id="intfloat/e5-large-v2"
)

# Add search capability
websites_chunks.add_embedding_index(
    column="text",
    string_embed=embed_model
)

# Define search query
@pxt.query
def search_content(query_text: str, limit: int = 5):
    sim = websites_chunks.text.similarity(query_text)
    return (
        websites_chunks.order_by(sim, asc=False)
        .select(
            websites_chunks.text,
            websites_chunks.website,
            similarity=sim
        )
        .limit(limit)
    )
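
After running table.py, the directory, table, view, and index are persisted by Pixeltable. As an optional sanity check (not part of the script above), you can list what was created:

import pixeltable as pxt

# Optional: confirm the table and view exist under web_search
print(pxt.list_tables("web_search"))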

Use Your Workflow

Create app.py:

import pixeltable as pxt
import time

# Connect to your table and view (pxt.get_table also retrieves views)
websites_t = pxt.get_table("web_search.websites")
websites_chunks = pxt.get_table("web_search.website_chunks")

# Add websites with rate limiting
urls = [
    "https://quotes.toscrape.com/",
    "https://example.com",
    "https://docs.pixeltable.io"
]

for url in urls:
    try:
        websites_t.insert([{"website": url}])  # insert() expects a list of row dicts
        time.sleep(1)  # Respect rate limits
    except Exception as e:
        print(f"Failed to process {url}: {e}")

# Define a search query (same pattern as search_content in table.py)
@pxt.query
def find_content(query: str, top_k: int = 5):
    sim = websites_chunks.text.similarity(query)
    return (
        websites_chunks.order_by(sim, asc=False)
        .select(
            websites_chunks.text,
            websites_chunks.website,
            similarity=sim
        )
        .limit(top_k)
    )

# Example search
results = find_content(
    "Find inspirational quotes about life"
).collect()

# Print results
for r in results:
    print(f"Similarity: {r['similarity']:.3f}")
    print(f"Source: {r['website']}")
    print(f"Content: {r['text']}\n")

What Makes This Different?

Web Scraping

Insert a URL and Pixeltable fetches and extracts the page content automatically:

websites_t.insert([{
    "website": "https://example.com"
}])
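
insert() accepts any number of row dictionaries, so several pages can be ingested in one call (a convenience sketch; the per-URL loop shown earlier gives finer-grained error handling):

# Ingest several pages in a single call
websites_t.insert([
    {"website": "https://quotes.toscrape.com/"},
    {"website": "https://docs.pixeltable.io"},
])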

Smart Chunking

Token-aware content splitting:

iterator=DocumentSplitter.create(
    document=websites_t.website,
    separators="token_limit"
)
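
DocumentSplitter can also attach document metadata (such as the title, enclosing heading, and source line for HTML pages) to each chunk. A sketch, with the view name chosen here for illustration:

chunks_with_meta = pxt.create_view(
    "web_search.meta_chunks",
    websites_t,
    iterator=DocumentSplitter.create(
        document=websites_t.website,
        separators="token_limit",
        limit=300,
        metadata="title,heading,sourceline"  # exposed as extra columns on the view
    )
)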

Vector Search

Natural language search:

websites_chunks.add_embedding_index(
    column="text",
    string_embed=embed_model
)
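
With the index in place, you can also run an ad hoc similarity query directly, without wrapping it in a @pxt.query function:

# Ad hoc search against the embedding index
sim = websites_chunks.text.similarity("pixeltable documentation")
top = (
    websites_chunks.order_by(sim, asc=False)
    .select(websites_chunks.text, score=sim)
    .limit(3)
    .collect()
)
print(top)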

Advanced Usage

Custom Chunking Strategies

Configure different chunking approaches:

# Chunk by paragraphs
chunks_by_para = pxt.create_view(
    "web_search.para_chunks",
    websites_t,
    iterator=DocumentSplitter.create(
        document=websites_t.website,
        separators="paragraph"
    )
)

# Chunk by character count
chunks_by_size = pxt.create_view(
    "web_search.size_chunks",
    websites_t,
    iterator=DocumentSplitter.create(
        document=websites_t.website,
        separators="char_limit",
        limit=1000  # characters per chunk
    )
)
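
separators also accepts a comma-separated combination, for example splitting on paragraphs first and then enforcing a token budget within each one (a sketch; the view name is illustrative):

# Split on paragraphs, then cap each chunk at 300 tokens
chunks_combined = pxt.create_view(
    "web_search.para_token_chunks",
    websites_t,
    iterator=DocumentSplitter.create(
        document=websites_t.website,
        separators="paragraph,token_limit",
        limit=300
    )
)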

Batch Processing with Rate Limiting

Process multiple websites responsibly:

import time
from typing import List

def batch_process_urls(urls: List[str], delay: float = 1.0):
    """Process multiple URLs with rate limiting"""
    results = []
    for url in urls:
        try:
            websites_t.insert([{"website": url}])
            results.append({"url": url, "status": "success"})
        except Exception as e:
            results.append({
                "url": url, 
                "status": "failed", 
                "error": str(e)
            })
        time.sleep(delay)
    return results
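
A minimal usage sketch (the URLs are placeholders):

# Ingest a small batch and report the outcome per URL
report = batch_process_urls([
    "https://quotes.toscrape.com/",
    "https://example.com",
], delay=1.0)
for entry in report:
    print(f"{entry['url']}: {entry['status']}")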

Advanced Search Functions

Create specialized search functions:

@pxt.query
def search_with_metadata(
    query: str,
    min_similarity: float = 0.7,
    limit: int = 5
):
    sim = websites_chunks.text.similarity(query)
    return (
        websites_chunks.where(sim >= min_similarity)
        .order_by(sim, asc=False)
        .select(
            websites_chunks.text,
            websites_chunks.website,
            similarity=sim
        )
        .limit(limit)
    )
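
Calling it works like any other query function; for example, with a stricter threshold:

# Only return chunks with similarity of at least 0.8
strict = search_with_metadata(
    "quotes about perseverance",
    min_similarity=0.8,
    limit=3
).collect()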