Join tables to combine data

This documentation page is also available as an interactive notebook. You can launch the notebook in Kaggle or Colab, or download it for use with an IDE or local Jupyter installation, by clicking one of the above links.

Combine data from multiple tables using inner, left, and cross joins.

Problem

You have related data in separate tables and need to combine them for analysis—customers with orders, products with inventory, or media with metadata.

Solution

What’s in this recipe:

Inner join to match rows from both tables
Left join to keep all rows from the first table
Cross join for Cartesian product (all combinations)
Join with filtering, aggregation, and saving results
Paginate results with limit() and offset

Use table1.join(table2, on=..., how=...) to combine tables based on matching columns.

Setup

%pip install -qU pixeltable

import pixeltable as pxt
import pixeltable.functions as pxtf

# Create a fresh directory
pxt.drop_dir('join_demo', force=True)
pxt.create_dir('join_demo')

Connected to Pixeltable database at: postgresql+psycopg://postgres:@/pixeltable?host=/Users/pjlb/.pixeltable/pgdata
Created directory ‘join_demo’.
<pixeltable.catalog.dir.Dir at 0x148e73850>

Create sample tables

# Create a customers table
customers = pxt.create_table(
    'join_demo/customers',
    {'customer_id': pxt.Int, 'name': pxt.String, 'email': pxt.String},
)

customers.insert(
    [
        {'customer_id': 1, 'name': 'Alice', 'email': '[email protected]'},
        {'customer_id': 2, 'name': 'Bob', 'email': '[email protected]'},
        {
            'customer_id': 3,
            'name': 'Charlie',
            'email': '[email protected]',
        },
    ]
)

customers.collect()

Created table ‘customers’.
Inserted 3 rows with 0 errors in 0.01 s (385.68 rows/s)

# Create an orders table
orders = pxt.create_table(
    'join_demo/orders',
    {
        'order_id': pxt.Int,
        'customer_id': pxt.Int,
        'product': pxt.String,
        'amount': pxt.Float,
    },
)

orders.insert(
    [
        {
            'order_id': 101,
            'customer_id': 1,
            'product': 'Laptop',
            'amount': 999.00,
        },
        {
            'order_id': 102,
            'customer_id': 1,
            'product': 'Mouse',
            'amount': 29.00,
        },
        {
            'order_id': 103,
            'customer_id': 2,
            'product': 'Keyboard',
            'amount': 79.00,
        },
        {
            'order_id': 104,
            'customer_id': 4,
            'product': 'Monitor',
            'amount': 299.00,
        },  # No matching customer
    ]
)

orders.collect()

Created table ‘orders’.
Inserted 4 rows with 0 errors in 0.01 s (657.81 rows/s)

Inner join (matching rows only)

# Inner join: only rows that match in both tables
customers.join(
    orders, on=customers.customer_id == orders.customer_id, how='inner'
).select(customers.name, orders.product, orders.amount).collect()

Left join (keep all from first table)

# Left join: all customers, with order data where available
# Charlie has no orders, so product/amount will be null
customers.join(
    orders, on=customers.customer_id == orders.customer_id, how='left'
).select(customers.name, orders.product, orders.amount).collect()

Join with filtering

# Combine join with where clause to filter results
customers.join(
    orders, on=customers.customer_id == orders.customer_id, how='inner'
).where(orders.amount > 50).select(
    customers.name, customers.email, orders.product, orders.amount
).collect()

Join with aggregation

# Join and aggregate: total spending per customer
customers.join(
    orders, on=customers.customer_id == orders.customer_id, how='inner'
).group_by(customers.name).select(
    customers.name,
    total_spent=pxtf.sum(orders.amount),
    order_count=pxtf.count(orders.order_id),
).collect()

Cross join (all combinations)

# Cross join: every customer paired with every product (no 'on' condition)
products = pxt.create_table(
    'join_demo/products', {'product': pxt.String, 'price': pxt.Float}
)
products.insert(
    [
        {'product': 'Widget', 'price': 19.99},
        {'product': 'Gadget', 'price': 29.99},
    ]
)

customers.join(products, how='cross').select(
    customers.name, products.product, products.price
).collect()

Created table ‘products’.
Inserted 2 rows with 0 errors in 0.00 s (422.52 rows/s)

Save join results to a new table

# Build a join query and collect as DataFrame
customer_orders_df = (
    customers.join(
        orders,
        on=customers.customer_id == orders.customer_id,
        how='inner',
    )
    .select(
        name=customers.name,
        email=customers.email,
        product=orders.product,
        amount=orders.amount,
    )
    .collect()
    .to_pandas()
)

customer_orders_df

# Create a new table from the DataFrame
orders_report = pxt.create_table(
    'join_demo/orders_report', source=customer_orders_df
)

orders_report.collect()

Created table ‘orders_report’.
Inserted 3 rows with 0 errors in 0.01 s (500.32 rows/s)

Paginate results with limit and offset

Use limit(n, offset=k) to retrieve results in pages. This is useful for displaying results incrementally or building paginated APIs.

# Page 1: first 2 rows
orders.order_by(orders.order_id).limit(2).collect()

# Page 2: next 2 rows (skip the first 2)
orders.order_by(orders.order_id).limit(2, offset=2).collect()

Explanation

Join types:

Join syntax:

# Simple: join on column by name
t1.join(t2, on=t1.id)

# Explicit predicate
t1.join(t2, on=t1.customer_id == t2.customer_id)

# Composite key
t1.join(t2, on=(t1.pk1 == t2.pk1) & (t1.pk2 == t2.pk2))

Aggregation functions:

from pixeltable.functions import sum, count, mean, min, max

# Use as functions, not methods
total=sum(t.amount)
num_rows=count(t.id)

Saving join results:

# Collect as DataFrame, then create table
df = query.select(name=t.col, ...).collect().to_pandas()
new_table = pxt.create_table('path', source=df)

Pagination:

# limit(n) returns at most n rows
# limit(n, offset=k) skips the first k rows, then returns n
query.order_by(t.id).limit(10)           # rows 0-9
query.order_by(t.id).limit(10, offset=10) # rows 10-19

Tips:

Use explicit predicates (t1.col == t2.col) for clarity
Chain .where() after join to filter results
Chain .group_by() for aggregations
Use 'left' join when the first table is your “main” table
Use named columns in .select(name=col) for clean column names
Always use .order_by() with pagination to get deterministic page ordering

Welcome to Pixeltable

Core Concepts

How-To

Problem

Solution

Setup

Create sample tables

Inner join (matching rows only)

Left join (keep all from first table)

Join with filtering

Join with aggregation

Cross join (all combinations)

Save join results to a new table

Paginate results with limit and offset

Explanation

See also

Welcome to Pixeltable

Core Concepts

How-To

​Problem

​Solution

​Setup

​Create sample tables

​Inner join (matching rows only)

​Left join (keep all from first table)

​Join with filtering

​Join with aggregation

​Cross join (all combinations)

​Save join results to a new table

​Paginate results with limit and offset

​Explanation

​See also

Problem

Solution

Setup

Create sample tables

Inner join (matching rows only)

Left join (keep all from first table)

Join with filtering

Join with aggregation

Cross join (all combinations)

Save join results to a new table

Paginate results with limit and offset

Explanation

See also