Building an Audio Search Workflow

Pixeltable lets you build audio search workflows in two phases:

  1. Define your processing workflow (once)
  2. Query your knowledge base (anytime)

Install Dependencies

pip install boto3 pixeltable tiktoken openai openai-whisper spacy sentence-transformers
python -m spacy download en_core_web_sm

Define Your Workflow

Create table.py:

import pixeltable as pxt
from pixeltable.functions import whisper
from pixeltable.functions.huggingface import sentence_transformer
from pixeltable.iterators.string import StringSplitter
# Note: StringSplitter's sentence mode uses spaCy (en_core_web_sm)
# internally, so no explicit spacy.load() is needed here.

# Initialize app structure
pxt.drop_dir("audio_search", force=True)
pxt.create_dir("audio_search")

# Create audio table
audio_t = pxt.create_table(
    "audio_search.audio", 
    {"audio_file": pxt.Audio}
)

# Add transcription workflow
audio_t.add_computed_column(
    transcription=whisper.transcribe(
        audio=audio_t.audio_file, 
        model="base.en"
    )
)

# Create sentence-level view
sentences_view = pxt.create_view(
    "audio_search.audio_sentence_chunks",
    audio_t,
    iterator=StringSplitter.create(
        text=audio_t.transcription.text, 
        separators="sentence"
    )
)

# Configure embedding model
embed_model = sentence_transformer.using(
    model_id="intfloat/e5-large-v2"
)

# Add search capability
sentences_view.add_embedding_index(
    column="text", 
    string_embed=embed_model
)

# Define search query
@pxt.query
def top_k(query_text: str):
    sim = sentences_view.text.similarity(query_text)
    return (
        sentences_view.order_by(sim, asc=False)
        .select(sentences_view.text, sim=sim)
        .limit(10)
    )
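
Run this script once (e.g., python table.py) to create the directory, table, view, and embedding index; the steps below reuse them.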

Use Your Workflow

Create app.py:

import pixeltable as pxt

# Connect to your tables and views
audio_t = pxt.get_table("audio_search.audio")
# pxt.get_table() retrieves views as well as tables
sentences_view = pxt.get_table("audio_search.audio_sentence_chunks")

# Add audio files to the knowledge base
audio_t.insert([{
    "audio_file": "s3://pixeltable-public/audio/10-minute tour of Pixeltable.mp3"
}])

# Perform search
@pxt.query
def search_audio(query_text: str):
    sim = sentences_view.text.similarity(query_text)
    return (
        sentences_view.order_by(sim, asc=False)
        .select(sentences_view.text, sim=sim)
        .limit(10)
    )

# Example search
results = search_audio("What are the key features of Pixeltable?")

# Print results
for result in results.collect():
    print(f"Similarity: {result['sim']:.3f}")
    print(f"Text: {result['text']}\n")

What Makes This Different?

Automatic Processing

The workflow handles transcription and embedding automatically; the computed column and embedding index are populated on every insert:

audio_t.add_computed_column(
    transcription=whisper.transcribe(
        audio=audio_t.audio_file
    )
)
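
Because transcription is a computed column, it also runs for every file added later; there is no separate reprocessing step. A minimal sketch (the bucket path below is a placeholder):

# Inserting a new file triggers Whisper transcription automatically;
# existing rows are not reprocessed.
audio_t.insert([{"audio_file": "s3://your-bucket/new-episode.mp3"}])

# The computed column is already populated for the new row
print(audio_t.select(audio_t.transcription.text).collect())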

Smart Chunking

Intelligent sentence splitting using spaCy:

iterator=StringSplitter.create(
    text=audio_t.transcription.text,
    separators="sentence"
)
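
To sanity-check the chunking, query the view directly; each sentence becomes its own row. A quick sketch:

# Peek at the first few sentence chunks produced by the iterator
print(sentences_view.select(sentences_view.text).limit(5).collect())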

Vector Search

Fast search using E5 embeddings:

sentences_view.add_embedding_index(
    column="text",
    string_embed=embed_model
)
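
Once the index exists, you can also run a one-off similarity query inline, without wrapping it in @pxt.query (the query string here is just an example):

# Rank sentence chunks by similarity to an ad-hoc query
sim = sentences_view.text.similarity("How do computed columns work?")
print(
    sentences_view.order_by(sim, asc=False)
    .select(sentences_view.text, sim=sim)
    .limit(5)
    .collect()
)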

Advanced Usage

Custom Search Functions

You can create custom search functions with different parameters:

@pxt.query
def search_with_threshold(query_text: str, min_similarity: float = 0.7):
    sim = sentences_view.text.similarity(query_text)
    return (
        sentences_view.where(sim >= min_similarity)
        .order_by(sim, asc=False)
        .select(sentences_view.text, sim=sim)
    )
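
For example (the query string and threshold are illustrative):

results = search_with_threshold("computed columns", min_similarity=0.8)
for row in results.collect():
    print(f"{row['sim']:.3f}  {row['text']}")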

Batch Processing

Process multiple audio files in batch:

audio_files = [
    "s3://your-bucket/audio1.mp3",
    "s3://your-bucket/audio2.mp3",
    "s3://your-bucket/audio3.mp3"
]

audio_t.insert([{"audio_file": f} for f in audio_files])
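
insert() also accepts local file paths, so a whole directory can be ingested in one call. A sketch, assuming a local audio/ folder of MP3s:

from pathlib import Path

# Collect every MP3 in the local audio/ directory and insert in one batch
local_files = [str(p) for p in Path("audio").glob("*.mp3")]
audio_t.insert([{"audio_file": f} for f in local_files])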

Different Embedding Models

You can use different sentence transformer models:

# Alternative embedding models
embed_model = sentence_transformer.using(
    model_id="sentence-transformers/all-mpnet-base-v2"
)
# or
embed_model = sentence_transformer.using(
    model_id="sentence-transformers/all-MiniLM-L6-v2"
)
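
Note that an embedding index is tied to the model it was built with, so switching models on an existing view means replacing the index. A sketch, assuming the drop_embedding_index/add_embedding_index calls available in current Pixeltable releases (check the API reference for your version):

# Replace the index on `text` with one built from a different model
sentences_view.drop_embedding_index(column="text")
sentences_view.add_embedding_index(
    column="text",
    string_embed=sentence_transformer.using(
        model_id="sentence-transformers/all-MiniLM-L6-v2"
    )
)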