Pixeltable is the open-source Python library providing declarative data infrastructure for multimodal AI applications—enabling incremental storage, transformation, indexing, retrieval, and orchestration of data.
Installation
Core Operations
# Create
pxt.create_dir( 'project' )
pxt.create_dir( 'project/subdir' )
# List
pxt.ls() # DataFrame of contents
pxt.list_dirs() # List of directory names
pxt.list_tables() # List of table names
contents = pxt.get_dir_contents( 'project' )
# Move/rename
pxt.move( 'old/path' , 'new/path' )
# Delete
pxt.drop_dir( 'project' , force = True )
# Create
t = pxt.create_table( 'project/users' , {
'id' : pxt.Required[pxt.String],
'name' : pxt.String,
'age' : pxt.Int,
'created_at' : pxt.Timestamp,
}, primary_key = 'id' )
# Get
t = pxt.get_table( 'project/users' )
# Operations
t.describe() # Show schema
t.count() # Row count
t.columns() # Column names
metadata = t.get_metadata() # Table metadata
t.list_views() # Views based on this table
# Modify schema
t.add_column( email = pxt.String)
t.rename_column( 'old' , 'new' )
t.drop_column( 'column' )
# Delete
pxt.drop_table( 'project/users' )
API Reference Full directory and table operations
Type System
Core Types (Python → Pixeltable):
str → pxt.String
int → pxt.Int
float → pxt.Float
bool → pxt.Bool
datetime → pxt.Timestamp
date → pxt.Date
dict → pxt.Json
bytes → pxt.Binary
UUID → pxt.UUID
Media & Arrays pxt.Image # Images
pxt.Video # Videos
pxt.Audio # Audio
pxt.Document # PDFs, Word docs
# Arrays
pxt.Array[( 768 ,), pxt.Float] # Fixed-dim
pxt.Array[( None ,), pxt.Float] # Variable-dim
# Non-nullable
pxt.Required[pxt.String]
Type System Guide Complete type system documentation
Queries & Data Operations
Insert/Update/Delete
Queries
Aggregations & Joins
# Insert
t.insert([{ 'name' : 'Alice' , 'age' : 30 }])
t.insert([User( name = 'Alice' , age = 30 )]) # Pydantic
# Update
t.where(t.name == 'Alice' ).update({ 'age' : 31 })
t.batch_update([{ '_rowid' : id1, 'score' : 100 }])
# Delete
t.where(t.age < 18 ).delete()
# Recompute
t.recompute_columns([ 'embedding' ])
# Basic
t.collect() # All rows
t.head( 10 ) # First 10
t.select(t.name, t.age).collect() # Specific columns
# Filter
t.where(t.age > 25 ).collect()
t.where((t.age > 25 ) & (t.active == True )).collect()
# Order & limit
t.order_by(t.created_at, asc = False ).limit( 100 ).collect()
# Sample
t.sample( fraction = 0.1 ).collect()
t.sample( n = 100 , seed = 42 ).collect()
# Aggregate
import pixeltable.functions as pxtf
t.group_by(t.category).select(
t.category,
total = pxtf.count(t.id),
average = pxtf.mean(t.score)
).collect()
# Join
orders.join(users, on = (orders.user_id == users.id)).collect()
Queries Guide Learn query expressions
Computed Columns
# Basic expression
t.add_computed_column( total = t.price * t.quantity)
# With LLM
from pixeltable.functions.openai import chat_completions
t.add_computed_column(
summary = chat_completions(
messages = [{ 'role' : 'user' , 'content' : t.text}],
model = 'gpt-4o-mini'
).choices[ 0 ].message.content
)
# With UDF
@pxt.udf
def word_count ( text : str ) -> int :
return len (text.split())
t.add_computed_column( words = word_count(t.content))
Options:
stored=False — Compute on demand (not stored)
destination='s3://bucket/path/' — Store media in S3
Computed Columns Guide Full computed columns documentation
Custom Functions (UDFs)
@pxt.udf
def multiply ( a : int , b : int ) -> int :
return a * b
t.select( result = multiply(t.x, t.y)).collect()
from pixeltable.func import Batch
@pxt.udf ( batch_size = 32 )
def embed ( texts : Batch[ str ]) -> Batch[list[ float ]]:
return model.encode(texts).tolist()
Query Function (@pxt.query)
@pxt.query
def top_k_search ( query : str , k : int = 5 ):
"""Reusable query function."""
sim = docs.content.similarity( string = query)
return docs.order_by(sim, asc = False ).select(docs.content, score = sim).limit(k)
# Use in computed column
t.add_computed_column( context = top_k_search(t.question))
# Convert table to callable function
lookup = pxt.retrieval_udf(
kb_table,
name = 'search_kb' ,
description = 'Search knowledge base' ,
parameters = [ 'topic' ],
limit = 5
)
results = lookup( topic = 'python' )
tools = pxt.tools(lookup) # Use as LLM tool
User-Defined Aggregates (UDA)
@pxt.uda
class std (pxt.Aggregator):
def __init__ ( self ):
self .sum = 0.0
self .sum_sq = 0.0
self .count = 0
def update ( self , val : float | None ):
if val is not None :
self .sum += val
self .sum_sq += val * val
self .count += 1
def value ( self ) -> float | None :
if self .count == 0 :
return None
mean = self .sum / self .count
variance = ( self .sum_sq / self .count) - (mean * mean)
return variance ** 0.5
# Connect to MCP server
udfs = pxt.mcp_udfs( 'http://localhost:8000/mcp' )
t.add_computed_column( result = udfs[ 0 ](t.data))
UDFs Guide Complete UDF documentation
Embedding Indexes & Vector Search
Create Index
from pixeltable.functions.openai import embeddings
t.add_embedding_index(
'idx' ,
column = t.content,
embedding = embeddings( input = t.content, model = 'text-embedding-3-small' ),
metric = 'cosine' # 'cosine', 'ip', 'l2'
)
Search with .similarity()
sim = t.content.similarity( string = 'query' )
results = t.order_by(sim, asc = False ).select(t.content, score = sim).limit( 10 )
Or use named index
results = t.idx.similarity_search( 'query' , k = 10 )
# Get raw embeddings
t.select(t.content, emb = t.content.embedding()).collect()
# Drop index
t.drop_embedding_index( 'idx' )
Iterators
from pixeltable.functions.video import frame_iterator
from pixeltable.functions.document import document_splitter
Video Frames
Document Chunking
frames = pxt.create_view(
'project/frames' ,
videos,
iterator = frame_iterator( video = videos.video, fps = 1 )
)
chunks = pxt.create_view(
'project/chunks' ,
docs,
iterator = document_splitter(
document = docs.document,
separators = 'paragraph' ,
limit = 500 ,
overlap = 50
)
)
Iterators Guide Full documentation
Document Chunking RAG chunking cookbook
Views & Snapshots
# Filtered view
active = pxt.create_view(
'project/active_users' ,
users.where(users.status == 'active' )
)
# View with iterator
frames = pxt.create_view(
'project/frames' ,
videos,
iterator = frame_iterator( video = videos.video, fps = 2 )
)
# Snapshot (read-only)
snapshot = pxt.create_snapshot( 'project/backup' , users)
Views Guide Full views documentation
Version Control
t.history() # Version history
t.history( n = 10 ) # Last 10 versions
versions = t.get_versions()
t.revert() # Undo last change (cannot be undone!)
t.revert() cannot be undone! Use with caution.
Version Control Guide Full documentation
Data Sharing
# Publish to cloud
pxt.publish(
source = 'my/table' ,
destination_uri = 'pxt://username/dataset' ,
access = 'public' # or 'private' (default)
)
t.push() # Sync local → cloud
Data Sharing Guide Full documentation
AI Integrations
OpenAI
from pixeltable.functions.openai import chat_completions, embeddings, invoke_tools
# Chat
t.add_computed_column(
response = chat_completions(
messages = [{ 'role' : 'user' , 'content' : t.prompt}],
model = 'gpt-4o'
).choices[ 0 ].message.content
)
# Embeddings
t.add_computed_column(
emb = embeddings( input = t.text, model = 'text-embedding-3-small' ).data[ 0 ].embedding
)
# Vision (multimodal)
t.add_computed_column(
desc = chat_completions(
messages = [{
'role' : 'user' ,
'content' : [
{ 'type' : 'text' , 'text' : 'Describe' },
{ 'type' : 'image_url' , 'image_url' : { 'url' : t.image}}
]
}],
model = 'gpt-4o'
).choices[ 0 ].message.content
)
# Tool calling
t.add_computed_column( tool_results = invoke_tools(tools, t.llm_response))
Other OpenAI functions: image_generations (DALL-E), speech (TTS), transcriptions (Whisper), translations, moderations
OpenAI Guide Provider guide
Other Major Providers
Anthropic
Google Gemini
AWS Bedrock
from pixeltable.functions.anthropic import messages
t.add_computed_column(
response = messages(
messages = [{ 'role' : 'user' , 'content' : t.prompt}],
model = 'claude-sonnet-4-20250514'
).content[ 0 ].text
)
from pixeltable.functions.gemini import generate_content
t.add_computed_column(
text = generate_content( contents = t.prompt, model = 'gemini-2.5-flash' ).text
)
from pixeltable.functions.bedrock import converse
t.add_computed_column(
response = converse(
messages = [{ 'role' : 'user' , 'content' : [{ 'text' : t.prompt}]}],
model_id = 'anthropic.claude-3-5-sonnet-20241022-v2:0'
).output.message.content[ 0 ].text
)
All LLM Providers
Provider | Module | Key Functions
Together AI | together | chat_completions, embeddings, image_generations (Guide)
Groq | groq | chat_completions, invoke_tools (Guide)
Fireworks | fireworks | chat_completions (Guide)
Mistral AI | mistralai | chat_completions, fim_completions, embeddings (Guide)
DeepSeek | deepseek | chat_completions (Guide)
OpenRouter | openrouter | chat_completions (Guide)
Ollama | ollama | chat, generate, embed (Guide)
Llama.cpp | llama_cpp | create_chat_completion (Guide)
Replicate | replicate | run (Guide)
fal | fal | run (Guide)
Voyage AI | voyageai | embeddings, rerank, multimodal_embed (Guide)
Hugging Face
20+ functions: embeddings, vision, generation, NLP, speech
from pixeltable.functions.huggingface import sentence_transformer, detr_for_object_detection, image_to_image
t.add_computed_column( emb = sentence_transformer(t.text, model_id = 'all-MiniLM-L6-v2' ))
t.add_computed_column( obj = detr_for_object_detection(t.img, model_id = 'facebook/detr-resnet-50' ))
t.add_computed_column( transformed = image_to_image(t.img, t.prompt, model_id = 'stable-diffusion-v1-5/stable-diffusion-v1-5' ))
Hugging Face Guide Provider guide
Hugging Face API API reference
Local & Specialized Models
WhisperX Advanced transcription
TwelveLabs Video understanding
Built-in Functions
String
Image
Video/Audio
Date/Math/JSON
# Common methods (no import needed)
t.name.lower() / .upper() / .strip() / .replace( 'old' , 'new' ) / .split( ' ' )
t.name.contains( 'text' ) / .startswith( 'A' ) / .endswith( 'z' ) / .len()
t.name.contains_re( r'\d+' ) / .findall( r'\w+' ) # Regex
String API (40+ functions)
# Properties & Transformations
t.image.width / .height / .mode
t.image.resize(( 256 , 256 )) / .rotate( 90 ) / .crop((x1, y1, x2, y2)) / .convert( 'L' )
t.image.blend(other, alpha = 0.5 ) / .histogram() / .get_metadata()
Image API (25+ functions)
# Video
t.video.get_metadata() / .get_duration() / .extract_frame( timestamp = 5.0 )
t.video.clip( start = 10 , end = 30 ) / .overlay_text( text = 'Title' ) / .scene_detect_content()
# Audio
t.audio.get_metadata()
# Timestamp
t.created_at.year / .month / .day / .hour / .weekday() / .strftime( '%Y-%m-%d' )
# Date
date.make_date( year = 2024 , month = 1 , day = 1 )
date.add_days(t.date, days = 7 )
# Math
math.abs() / .ceil() / .floor() / .round() / .sqrt() / .pow()
# JSON
t.metadata[ 'key' ][ 'nested' ]
# UUID & Net
uuid7()
presigned_url(t.s3_path, expiration = 3600 )
Data Import/Export
Import
Export
External Storage
from pixeltable import io
# CSV, JSON, Parquet, Excel
t.insert(io.import_csv( 'data.csv' ))
t.insert(io.import_json( 'data.json' ))
t.insert(io.import_parquet( 'data.parquet' ))
t.insert(io.import_excel( 'data.xlsx' , sheet_name = 'Sheet1' ))
# Pandas
import pandas as pd
t.insert(io.import_pandas(pd.read_csv( 'data.csv' )))
# Hugging Face datasets
from datasets import load_dataset
ds = load_dataset( 'squad' , split = 'train[:100]' )
t.insert(io.import_huggingface_dataset(ds))
# Pandas
df = t.collect().to_pandas()
# Parquet
io.export_parquet(t, 'output.parquet' )
# PyTorch
from torch.utils.data import DataLoader
loader = DataLoader(t.to_pytorch_dataset(), batch_size = 32 )
# COCO format
coco_path = t.to_coco_dataset()
# LanceDB
io.export_lancedb(t, 'lancedb_uri' , 'table_name' )
# Label Studio / FiftyOne
io.create_label_studio_project(t, media_column = t.image)
io.export_images_as_fo_dataset(t, img_column = t.image)
# Configure in config.toml or env vars
# PIXELTABLE_OUTPUT_MEDIA_DEST="s3://bucket/path/"
# Per-column destination
t.add_computed_column(
thumbnail = t.image.resize(( 128 , 128 )),
destination = 's3://bucket/thumbnails/'
)
Configuration
# API Keys
import os
os.environ[ 'OPENAI_API_KEY' ] = 'sk-...'
os.environ[ 'ANTHROPIC_API_KEY' ] = 'sk-ant-...'
# Logging
pxt.configure_logging( level = 'INFO' )
# Custom database
pxt.init({ 'home' : '/path/to/data' })
Config file: ~/.pixeltable/config.toml
[ openai ]
api_key = "sk-..."
Configuration Guide Full configuration documentation
# Create tools
tools = pxt.tools(my_udf1, my_udf2)
# Use with LLM
response = chat_completions(
messages = [{ 'role' : 'user' , 'content' : query}],
model = 'gpt-4o' ,
tools = tools,
tool_choice = tools.choice( required = True )
)
# Execute tools
results = invoke_tools(tools, response)
Providers with invoke_tools: OpenAI, Anthropic, Gemini, Bedrock, Groq
Tool Calling Cookbook Full tool calling guide
Common Patterns
from pixeltable.functions.document import document_splitter
from pixeltable.functions.openai import embeddings, chat_completions
# 1. Chunk documents
chunks = pxt.create_view(
'rag/chunks' ,
docs,
iterator = document_splitter( document = docs.doc, separators = 'paragraph' , limit = 500 )
)
# 2. Index
chunks.add_embedding_index(
'idx' ,
column = chunks.text,
embedding = embeddings( input = chunks.text, model = 'text-embedding-3-small' )
)
# 3. Query function
@pxt.query
def retrieve ( query : str , k : int = 5 ):
sim = chunks.text.similarity( string = query)
return chunks.order_by(sim, asc = False ).select(chunks.text, score = sim).limit(k)
# 4. Generate answers
qa = pxt.create_table( 'rag/qa' , { 'question' : pxt.String})
qa.add_computed_column( context = retrieve(qa.question))
qa.add_computed_column(
answer = chat_completions(
messages = [
{ 'role' : 'system' , 'content' : 'Answer using context: ' + qa.context},
{ 'role' : 'user' , 'content' : qa.question}
],
model = 'gpt-4o-mini'
).choices[ 0 ].message.content
)
# Memory store
memories = pxt.create_table( 'agent/memories' , { 'content' : pxt.String})
memories.add_embedding_index( 'idx' , column = memories.content, embedding = ... )
# Retrieval
@pxt.query
def recall ( context : str , k : int = 3 ):
sim = memories.content.similarity( string = context)
return memories.order_by(sim, asc = False ).limit(k).select(memories.content)
# Use in conversation
conversations.add_computed_column( recalled = recall(conversations.message))
from pixeltable.functions.video import frame_iterator
from pixeltable.functions.yolox import yolox
frames = pxt.create_view(
'project/frames' ,
videos,
iterator = frame_iterator( video = videos.video, fps = 1 )
)
frames.add_computed_column( detections = yolox(frames.frame))
Object Detection in Videos
Quick Reference
Essential Commands
Task | Command
Create table | pxt.create_table('dir/table', schema)
Get table | pxt.get_table('dir/table')
Query | t.where(condition).collect()
Add computed | t.add_computed_column(name=expr)
Embed index | t.add_embedding_index('idx', column, embedding)
Create view | pxt.create_view('name', base, iterator=...)
Decorators
Decorator | Purpose
@pxt.udf | User-defined function
@pxt.uda | User-defined aggregate
@pxt.query | Reusable query function
Additional Resources
Label Studio Integration Data labeling
FiftyOne Integration Dataset visualization
Pydantic Integration Structured outputs
Deployment Production deployment