Skip to main content
Open in Kaggle  Open in Colab  Download Notebook
This documentation page is also available as an interactive notebook. You can launch the notebook in Kaggle or Colab, or download it for use with an IDE or local Jupyter installation, by clicking one of the above links.
Combine data from multiple tables using inner, left, and cross joins.

Problem

You have related data in separate tables and need to combine them for analysis—customers with orders, products with inventory, or media with metadata.

Solution

What’s in this recipe:
  • Inner join to match rows from both tables
  • Left join to keep all rows from the first table
  • Cross join for Cartesian product (all combinations)
  • Join with filtering, aggregation, and saving results
Use table1.join(table2, on=..., how=...) to combine tables based on matching columns.

Setup

%pip install -qU pixeltable
import pixeltable as pxt
from pixeltable.functions import sum as pxt_sum, count as pxt_count
# Create a fresh directory
pxt.drop_dir('join_demo', force=True)
pxt.create_dir('join_demo')
Connected to Pixeltable database at: postgresql+psycopg://postgres:@/pixeltable?host=/Users/pjlb/.pixeltable/pgdata
Created directory ‘join_demo’.
<pixeltable.catalog.dir.Dir at 0x1494a4a90>

Create sample tables

# Create a customers table
customers = pxt.create_table(
    'join_demo.customers',
    {'customer_id': pxt.Int, 'name': pxt.String, 'email': pxt.String}
)

customers.insert([
    {'customer_id': 1, 'name': 'Alice', 'email': '[email protected]'},
    {'customer_id': 2, 'name': 'Bob', 'email': '[email protected]'},
    {'customer_id': 3, 'name': 'Charlie', 'email': '[email protected]'},
])

customers.collect()
Created table ‘customers’.
Inserting rows into `customers`: 0 rows [00:00, ? rows/s]Inserting rows into `customers`: 3 rows [00:00, 378.55 rows/s]
Inserted 3 rows with 0 errors.
# Create an orders table
orders = pxt.create_table(
    'join_demo.orders',
    {'order_id': pxt.Int, 'customer_id': pxt.Int, 'product': pxt.String, 'amount': pxt.Float}
)

orders.insert([
    {'order_id': 101, 'customer_id': 1, 'product': 'Laptop', 'amount': 999.00},
    {'order_id': 102, 'customer_id': 1, 'product': 'Mouse', 'amount': 29.00},
    {'order_id': 103, 'customer_id': 2, 'product': 'Keyboard', 'amount': 79.00},
    {'order_id': 104, 'customer_id': 4, 'product': 'Monitor', 'amount': 299.00},  # No matching customer
])

orders.collect()
Created table ‘orders’.
Inserting rows into `orders`: 0 rows [00:00, ? rows/s]Inserting rows into `orders`: 4 rows [00:00, 1261.07 rows/s]
Inserted 4 rows with 0 errors.

Inner join (matching rows only)

# Inner join: only rows that match in both tables
customers.join(
    orders,
    on=customers.customer_id == orders.customer_id,
    how='inner'
).select(
    customers.name,
    orders.product,
    orders.amount
).collect()

Left join (keep all from first table)

# Left join: all customers, with order data where available
# Charlie has no orders, so product/amount will be null
customers.join(
    orders,
    on=customers.customer_id == orders.customer_id,
    how='left'
).select(
    customers.name,
    orders.product,
    orders.amount
).collect()

Join with filtering

# Combine join with where clause to filter results
customers.join(
    orders,
    on=customers.customer_id == orders.customer_id,
    how='inner'
).where(
    orders.amount > 50
).select(
    customers.name,
    customers.email,
    orders.product,
    orders.amount
).collect()

Join with aggregation

# Join and aggregate: total spending per customer
customers.join(
    orders,
    on=customers.customer_id == orders.customer_id,
    how='inner'
).group_by(customers.name).select(
    customers.name,
    total_spent=pxt_sum(orders.amount),
    order_count=pxt_count(orders.order_id)
).collect()

Cross join (all combinations)

# Cross join: every customer paired with every product (no 'on' condition)
products = pxt.create_table(
    'join_demo.products',
    {'product': pxt.String, 'price': pxt.Float}
)
products.insert([
    {'product': 'Widget', 'price': 19.99},
    {'product': 'Gadget', 'price': 29.99},
])

customers.join(products, how='cross').select(
    customers.name,
    products.product,
    products.price
).collect()
Created table ‘products’.
Inserting rows into `products`: 0 rows [00:00, ? rows/s]Inserting rows into `products`: 2 rows [00:00, 815.30 rows/s]
Inserted 2 rows with 0 errors.

Save join results to a new table

# Build a join query and collect as DataFrame
customer_orders_df = customers.join(
    orders,
    on=customers.customer_id == orders.customer_id,
    how='inner'
).select(
    name=customers.name,
    email=customers.email,
    product=orders.product,
    amount=orders.amount
).collect().to_pandas()

customer_orders_df
# Create a new table from the DataFrame
orders_report = pxt.create_table(
    'join_demo.orders_report',
    source=customer_orders_df
)

orders_report.collect()
Created table ‘orders_report’.
Inserting rows into `orders_report`: 0 rows [00:00, ? rows/s]Inserting rows into `orders_report`: 3 rows [00:00, 529.36 rows/s]
Inserted 3 rows with 0 errors.

Explanation

Join types:
Join syntax:
# Simple: join on column by name
t1.join(t2, on=t1.id)

# Explicit predicate
t1.join(t2, on=t1.customer_id == t2.customer_id)

# Composite key
t1.join(t2, on=(t1.pk1 == t2.pk1) & (t1.pk2 == t2.pk2))
Aggregation functions:
from pixeltable.functions import sum, count, mean, min, max

# Use as functions, not methods
total=sum(t.amount)
num_rows=count(t.id)
Saving join results:
# Collect as DataFrame, then create table
df = query.select(name=t.col, ...).collect().to_pandas()
new_table = pxt.create_table('path', source=df)
Tips:
  • Use explicit predicates (t1.col == t2.col) for clarity
  • Chain .where() after join to filter results
  • Chain .group_by() for aggregations
  • Use 'left' join when the first table is your “main” table
  • Use named columns in .select(name=col) for clean column names

See also