From f18dadd7093cbed66ee42738d6564950168d3fe3 Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Wed, 8 Jan 2025 09:02:23 -0500
Subject: [PATCH] Document the `ParquetRecordBatchStream` buffering (#6947)

* Document the ParquetRecordBatchStream buffering

* Update parquet/src/arrow/async_reader/mod.rs

Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com>

---------

Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com>
---
 parquet/src/arrow/async_reader/mod.rs | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs
index 4f3befe4266..5323251b07e 100644
--- a/parquet/src/arrow/async_reader/mod.rs
+++ b/parquet/src/arrow/async_reader/mod.rs
@@ -611,11 +611,23 @@ impl<T> std::fmt::Debug for StreamState<T> {
     }
 }
 
-/// An asynchronous [`Stream`](https://docs.rs/futures/latest/futures/stream/trait.Stream.html) of [`RecordBatch`]
-/// for a parquet file that can be constructed using [`ParquetRecordBatchStreamBuilder`].
+/// An asynchronous [`Stream`]of [`RecordBatch`] constructed using [`ParquetRecordBatchStreamBuilder`] to read parquet files.
 ///
 /// `ParquetRecordBatchStream` also provides [`ParquetRecordBatchStream::next_row_group`] for fetching row groups,
 /// allowing users to decode record batches separately from I/O.
+///
+/// # I/O Buffering
+///
+/// `ParquetRecordBatchStream` buffers *all* data pages selected after predicates
+/// (projection + filtering, etc) and decodes the rows from those buffered pages.
+///
+/// For example, if all rows and columns are selected, the entire row group is
+/// buffered in memory during decode. This minimizes the number of IO operations
+/// required, which is especially important for object stores, where IO operations
+/// have latencies in the hundreds of milliseconds
+///
+///
+/// [`Stream`]: https://docs.rs/futures/latest/futures/stream/trait.Stream.html
 pub struct ParquetRecordBatchStream<T> {
     metadata: Arc<ParquetMetaData>,