PyArrow
PyArrow Dataset API (For Larger-Than-Memory Data)
import pyarrow.dataset as ds
import pyarrow as pa
import pyarrow.parquet as pq
import shutil
import os
# --- Setup: Let's pretend we have a massive dataset ---
# We will create a folder with 2 separate parquet files
os.makedirs('my_big_dataset', exist_ok=True)
table1 = pa.Table.from_pydict({'id': [1, 2, 3], 'year': [2020, 2020, 2020]})
table2 = pa.Table.from_pydict({'id': [4, 5, 6], 'year': [2021, 2021, 2021]})
pq.write_table(table1, 'my_big_dataset/part1.parquet')
pq.write_table(table2, 'my_big_dataset/part2.parquet')
# -------------------------------------------------------
# 1. Initialize the Dataset
# This is instant. It just reads the file paths/metadata, not the data itself.
dataset = ds.dataset('my_big_dataset', format='parquet')
print(f"Total files found: {dataset.files}")
# 2. Scan with a Filter (Predicate Pushdown)
# We want data ONLY from 2021.
# PyArrow can skip 'part1.parquet' entirely: the Parquet row-group
# statistics (min/max of 'year') show that file cannot contain year == 2021.
scanner = dataset.scanner(
    filter=ds.field('year') == 2021,
    columns=['id', 'year']
)
# 3. Materialize to Table
# Only NOW does it actually load data into RAM.
result_table = scanner.to_table()
print("\n--- Filtered Results ---")
print(result_table.to_pandas())
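# A hedged alternative for results that are themselves too big for RAM:
# stream RecordBatches instead of materializing one table. A fresh scanner
# is created here because a scanner is best treated as single-use.
batch_scanner = dataset.scanner(
    filter=ds.field('year') == 2021,
    columns=['id', 'year']
)
for batch in batch_scanner.to_batches():
    # Each RecordBatch holds only a slice of the data in memory at a time.
    print(f"Streamed a batch with {batch.num_rows} rows")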
# Cleanup
shutil.rmtree('my_big_dataset')
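# A further sketch (the names 'combined', 'hive_dataset', and
# 'my_partitioned_dataset' are assumed for illustration): write the same data
# partitioned by 'year' with write_dataset(). Each year lands in its own
# directory, so partition pruning can skip whole directories, not just files.
combined = pa.concat_tables([table1, table2])
ds.write_dataset(
    combined,
    'my_partitioned_dataset',
    format='parquet',
    partitioning=ds.partitioning(pa.schema([('year', pa.int64())]), flavor='hive')
)
# Reading it back: partitioning='hive' turns the directory names into a column.
hive_dataset = ds.dataset('my_partitioned_dataset', format='parquet', partitioning='hive')
print(hive_dataset.to_table().to_pandas())
shutil.rmtree('my_partitioned_dataset')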