import pixeltable as pxt
from pixeltable.functions import openai
from pixeltable.functions.huggingface import sentence_transformer
from pixeltable.functions.video import extract_audio
from pixeltable.iterators import AudioSplitter, FrameIterator
from pixeltable.iterators.string import StringSplitter
from pixeltable.functions.openai import vision

# Define the embedding model once for reuse
EMBED_MODEL = sentence_transformer.using(model_id='intfloat/e5-large-v2')

# Set up directory and table name
directory = 'video_index'
table_name = f'{directory}.video'

# Create video table
pxt.create_dir(directory, if_exists='replace_force')
video_index = pxt.create_table(
    table_name,
    {'video': pxt.Video, 'uploaded_at': pxt.Timestamp}
)

video_index.add_computed_column(
    audio_extract=extract_audio(video_index.video, format='mp3')
)

# Create view for frames
frames_view = pxt.create_view(
    f'{directory}.video_frames',
    video_index,
    iterator=FrameIterator.create(
        video=video_index.video,
        fps=1
    )
)

# Create a column for image description using OpenAI gpt-4o-mini
frames_view.add_computed_column(
    image_description=vision(
        prompt="Provide quick caption for the image.",
        image=frames_view.frame,
        model="gpt-4o-mini"
    )
)

# Create embedding index for image description
frames_view.add_embedding_index('image_description', string_embed=EMBED_MODEL)

# Create view for audio chunks
chunks_view = pxt.create_view(
    f'{directory}.video_chunks',
    video_index,
    iterator=AudioSplitter.create(
        audio=video_index.audio_extract,
        chunk_duration_sec=30.0,
        overlap_sec=2.0,
        min_chunk_duration_sec=5.0
    )
)

# Audio-to-text for chunks
chunks_view.add_computed_column(
    transcription=openai.transcriptions(
        audio=chunks_view.audio_chunk,
        model='whisper-1'
    )
)

# Create view that chunks text into sentences
transcription_chunks = pxt.create_view(
    f'{directory}.video_sentence_chunks',
    chunks_view,
    iterator=StringSplitter.create(
        text=chunks_view.transcription.text,
        separators='sentence'
    ),
)

# Create embedding index for audio
transcription_chunks.add_embedding_index('text', string_embed=EMBED_MODEL)
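Running this setup script once builds the whole pipeline: the base table, the frame and audio-chunk views, and both embedding indexes. As an optional sanity check (a sketch, and assuming pxt.list_tables is available in your Pixeltable version), you can list what was created:

# Optional sketch: the objects created above live under the 'video_index' directory
print(pxt.list_tables())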
Use Your Workflow
Create app.py:
from datetime import datetime

import pixeltable as pxt

# Constants
directory = 'video_index'
table_name = f'{directory}.video'

# Connect to your tables and views
video_index = pxt.get_table(table_name)
frames_view = pxt.get_table(f'{directory}.video_frames')
transcription_chunks = pxt.get_table(f'{directory}.video_sentence_chunks')

# Insert videos to the knowledge base
videos = [
    'https://github.com/pixeltable/pixeltable/raw/release/docs/resources/audio-transcription-demo/'
    f'Lex-Fridman-Podcast-430-Excerpt-{n}.mp4'
    for n in range(3)
]
video_index.insert({'video': video, 'uploaded_at': datetime.now()} for video in videos[:2])

query_text = "Summarize the conversation"
audio_sim = transcription_chunks.text.similarity(query_text)
audio_results = (
    transcription_chunks.order_by(audio_sim, transcription_chunks.uploaded_at, asc=False)
    .limit(5)
    .select(transcription_chunks.text, transcription_chunks.uploaded_at, similarity=audio_sim)
    .collect()
)
print(audio_results)
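The script above inserts only the first two clips. As a minimal sketch of incremental updates (reusing the videos list and table handles defined above), adding the remaining clip later is just another insert call:

# Sketch: add the third clip later; downstream computed columns, views, and
# embedding indexes are populated for the new row automatically.
video_index.insert([{'video': videos[2], 'uploaded_at': datetime.now()}])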
Process both audio and visual content from the same videos:
# Extract audio from video
video_index.add_computed_column(
    audio_extract=extract_audio(video_index.video, format='mp3')
)

# Extract frames from video
frames_view = pxt.create_view(
    f'{directory}.video_frames',
    video_index,
    iterator=FrameIterator.create(
        video=video_index.video,
        fps=1
    )
)

# Optionally, create new videos from processed frames
from pixeltable.functions.video import make_video

processed_videos = frames_view.select(
    frames_view.video_id,
    make_video(frames_view.pos, frames_view.frame)  # Default fps is 25
).group_by(frames_view.video_id).collect()
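To spot-check the extraction step, here is a small sketch using the tables defined above:

# Sketch: verify that audio and frames were extracted
print(video_index.select(video_index.audio_extract).collect())  # one audio file per video
print(frames_view.count())                                      # number of extracted frames (1 fps)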
AI-Powered Frame Analysis
Automatic image description using vision models:
frames_view.add_computed_column(
    image_description=vision(
        prompt="Provide quick caption for the image.",
        image=frames_view.frame,
        model="gpt-4o-mini"
    )
)
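To review the generated captions, a minimal sketch using the column names defined above:

# Sketch: inspect a few frames alongside their generated captions
frames_view.select(frames_view.frame, frames_view.image_description).limit(5).collect()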
Unified Embedding Space
Use the same embedding model for both text and image descriptions:
# Define once, use for both modalities
EMBED_MODEL = sentence_transformer.using(model_id='intfloat/e5-large-v2')

# Use for frame descriptions
frames_view.add_embedding_index('image_description', string_embed=EMBED_MODEL)

# Use for transcriptions
transcription_chunks.add_embedding_index('text', string_embed=EMBED_MODEL)
Dual Search Capabilities
Search independently across audio or visual content:
# Get similarity scores
audio_sim = transcription_chunks.text.similarity("Definition of happiness according to the guest")
image_sim = frames_view.image_description.similarity("Lex Fridman interviewing a guest in a podcast setting")
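These similarity expressions can then drive two independent top-k queries. A sketch that follows the same pattern as the audio query in app.py:

# Top matches from the transcript index
audio_results = (
    transcription_chunks.order_by(audio_sim, asc=False)
    .limit(5)
    .select(transcription_chunks.text, similarity=audio_sim)
    .collect()
)

# Top matches from the frame-caption index
image_results = (
    frames_view.order_by(image_sim, asc=False)
    .limit(5)
    .select(frames_view.frame, frames_view.image_description, similarity=image_sim)
    .collect()
)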