From 7aecc3f14d07a4d627ea6225737cf76bc6e4ab3c Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Sat, 11 Jan 2025 22:45:08 +1300 Subject: [PATCH] [Parquet] Improve speed of dictionary encoding NaN float values (#6953) * Treat NaNs equal to NaN when interning for dictionary encoding * Compare all values by bytes rather than adding Intern trait --- parquet/src/util/interner.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/parquet/src/util/interner.rs b/parquet/src/util/interner.rs index 489d4d58122c..34c7d1390f7a 100644 --- a/parquet/src/util/interner.rs +++ b/parquet/src/util/interner.rs @@ -24,7 +24,7 @@ const DEFAULT_DEDUP_CAPACITY: usize = 4096; pub trait Storage { type Key: Copy; - type Value: AsBytes + PartialEq + ?Sized; + type Value: AsBytes + ?Sized; /// Gets an element by its key fn get(&self, idx: Self::Key) -> &Self::Value; @@ -66,7 +66,8 @@ impl Interner { .dedup .entry( hash, - |index| value == self.storage.get(*index), + // Compare bytes rather than directly comparing values so NaNs can be interned + |index| value.as_bytes() == self.storage.get(*index).as_bytes(), |key| self.state.hash_one(self.storage.get(*key).as_bytes()), ) .or_insert_with(|| self.storage.push(value))