This documentation page is also available as an interactive notebook. You can launch the notebook in
Kaggle or Colab, or download it for use with an IDE or local Jupyter installation, by clicking one of the
above links.
Combine data from multiple tables using inner, left, and cross joins.
Problem
You have related data in separate tables and need to combine them for
analysis—customers with orders, products with inventory, or media with
metadata.
Solution
What’s in this recipe:
- Inner join to match rows from both tables
- Left join to keep all rows from the first table
- Cross join for Cartesian product (all combinations)
- Join with filtering, aggregation, and saving results
- Paginate results with
limit() and offset
Use table1.join(table2, on=..., how=...) to combine tables based on
matching columns.
Setup
%pip install -qU pixeltable
import pixeltable as pxt
import pixeltable.functions as pxtf
# Create a fresh directory
pxt.drop_dir('join_demo', force=True)
pxt.create_dir('join_demo')
Connected to Pixeltable database at: postgresql+psycopg://postgres:@/pixeltable?host=/Users/pjlb/.pixeltable/pgdata
Created directory ‘join_demo’.
<pixeltable.catalog.dir.Dir at 0x148e73850>
Create sample tables
# Create a customers table
customers = pxt.create_table(
'join_demo/customers',
{'customer_id': pxt.Int, 'name': pxt.String, 'email': pxt.String},
)
customers.insert(
[
{'customer_id': 1, 'name': 'Alice', 'email': '[email protected]'},
{'customer_id': 2, 'name': 'Bob', 'email': '[email protected]'},
{
'customer_id': 3,
'name': 'Charlie',
'email': '[email protected]',
},
]
)
customers.collect()
Created table ‘customers’.
Inserted 3 rows with 0 errors in 0.01 s (385.68 rows/s)
# Create an orders table
orders = pxt.create_table(
'join_demo/orders',
{
'order_id': pxt.Int,
'customer_id': pxt.Int,
'product': pxt.String,
'amount': pxt.Float,
},
)
orders.insert(
[
{
'order_id': 101,
'customer_id': 1,
'product': 'Laptop',
'amount': 999.00,
},
{
'order_id': 102,
'customer_id': 1,
'product': 'Mouse',
'amount': 29.00,
},
{
'order_id': 103,
'customer_id': 2,
'product': 'Keyboard',
'amount': 79.00,
},
{
'order_id': 104,
'customer_id': 4,
'product': 'Monitor',
'amount': 299.00,
}, # No matching customer
]
)
orders.collect()
Created table ‘orders’.
Inserted 4 rows with 0 errors in 0.01 s (657.81 rows/s)
Inner join (matching rows only)
# Inner join: only rows that match in both tables
customers.join(
orders, on=customers.customer_id == orders.customer_id, how='inner'
).select(customers.name, orders.product, orders.amount).collect()
Left join (keep all from first table)
# Left join: all customers, with order data where available
# Charlie has no orders, so product/amount will be null
customers.join(
orders, on=customers.customer_id == orders.customer_id, how='left'
).select(customers.name, orders.product, orders.amount).collect()
Join with filtering
# Combine join with where clause to filter results
customers.join(
orders, on=customers.customer_id == orders.customer_id, how='inner'
).where(orders.amount > 50).select(
customers.name, customers.email, orders.product, orders.amount
).collect()
Join with aggregation
# Join and aggregate: total spending per customer
customers.join(
orders, on=customers.customer_id == orders.customer_id, how='inner'
).group_by(customers.name).select(
customers.name,
total_spent=pxtf.sum(orders.amount),
order_count=pxtf.count(orders.order_id),
).collect()
Cross join (all combinations)
# Cross join: every customer paired with every product (no 'on' condition)
products = pxt.create_table(
'join_demo/products', {'product': pxt.String, 'price': pxt.Float}
)
products.insert(
[
{'product': 'Widget', 'price': 19.99},
{'product': 'Gadget', 'price': 29.99},
]
)
customers.join(products, how='cross').select(
customers.name, products.product, products.price
).collect()
Created table ‘products’.
Inserted 2 rows with 0 errors in 0.00 s (422.52 rows/s)
Save join results to a new table
# Build a join query and collect as DataFrame
customer_orders_df = (
customers.join(
orders,
on=customers.customer_id == orders.customer_id,
how='inner',
)
.select(
name=customers.name,
email=customers.email,
product=orders.product,
amount=orders.amount,
)
.collect()
.to_pandas()
)
customer_orders_df
# Create a new table from the DataFrame
orders_report = pxt.create_table(
'join_demo/orders_report', source=customer_orders_df
)
orders_report.collect()
Created table ‘orders_report’.
Inserted 3 rows with 0 errors in 0.01 s (500.32 rows/s)
Paginate results with limit and offset
Use limit(n, offset=k) to retrieve results in pages. This is useful
for displaying results incrementally or building paginated APIs.
# Page 1: first 2 rows
orders.order_by(orders.order_id).limit(2).collect()
# Page 2: next 2 rows (skip the first 2)
orders.order_by(orders.order_id).limit(2, offset=2).collect()
Explanation
Join types:
Join syntax:
# Simple: join on column by name
t1.join(t2, on=t1.id)
# Explicit predicate
t1.join(t2, on=t1.customer_id == t2.customer_id)
# Composite key
t1.join(t2, on=(t1.pk1 == t2.pk1) & (t1.pk2 == t2.pk2))
Aggregation functions:
from pixeltable.functions import sum, count, mean, min, max
# Use as functions, not methods
total=sum(t.amount)
num_rows=count(t.id)
Saving join results:
# Collect as DataFrame, then create table
df = query.select(name=t.col, ...).collect().to_pandas()
new_table = pxt.create_table('path', source=df)
Pagination:
# limit(n) returns at most n rows
# limit(n, offset=k) skips the first k rows, then returns n
query.order_by(t.id).limit(10) # rows 0-9
query.order_by(t.id).limit(10, offset=10) # rows 10-19
Tips:
- Use explicit predicates (
t1.col == t2.col) for clarity
- Chain
.where() after join to filter results
- Chain
.group_by() for aggregations
- Use
'left' join when the first table is your “main” table
- Use named columns in
.select(name=col) for clean column names
- Always use
.order_by() with pagination to get deterministic page
ordering
See also