From f18dadd7093cbed66ee42738d6564950168d3fe3 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 8 Jan 2025 09:02:23 -0500 Subject: [PATCH] Document the `ParquetRecordBatchStream` buffering (#6947) * Document the ParquetRecordBatchStream buffering * Update parquet/src/arrow/async_reader/mod.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- parquet/src/arrow/async_reader/mod.rs | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 4f3befe4266..5323251b07e 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -611,11 +611,23 @@ impl std::fmt::Debug for StreamState { } } -/// An asynchronous [`Stream`](https://docs.rs/futures/latest/futures/stream/trait.Stream.html) of [`RecordBatch`] -/// for a parquet file that can be constructed using [`ParquetRecordBatchStreamBuilder`]. +/// An asynchronous [`Stream`]of [`RecordBatch`] constructed using [`ParquetRecordBatchStreamBuilder`] to read parquet files. /// /// `ParquetRecordBatchStream` also provides [`ParquetRecordBatchStream::next_row_group`] for fetching row groups, /// allowing users to decode record batches separately from I/O. +/// +/// # I/O Buffering +/// +/// `ParquetRecordBatchStream` buffers *all* data pages selected after predicates +/// (projection + filtering, etc) and decodes the rows from those buffered pages. +/// +/// For example, if all rows and columns are selected, the entire row group is +/// buffered in memory during decode. This minimizes the number of IO operations +/// required, which is especially important for object stores, where IO operations +/// have latencies in the hundreds of milliseconds +/// +/// +/// [`Stream`]: https://docs.rs/futures/latest/futures/stream/trait.Stream.html pub struct ParquetRecordBatchStream { metadata: Arc,