From 750353705ddbd29cc193bb0909628d319b9b174b Mon Sep 17 00:00:00 2001 From: Gabe Small Date: Thu, 3 Apr 2025 10:50:56 -0500 Subject: [PATCH 1/7] BUG: Fix #61221: Exception with unstack(sort=False) and NA in index. --- pandas/core/reshape/reshape.py | 22 ++++++- pandas/tests/frame/test_stack_unstack.py | 78 ++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index c60fe71a7ff28..204282998c095 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -134,6 +134,11 @@ def __init__( self.removed_level_full = index.levels[self.level] if not self.sort: unique_codes = unique(self.index.codes[self.level]) + # Bug Fix GH 61221 + # The -1 in the unsorted unique codes causes for doubling and an eventual ValueError + # saving the NA location to be used in the repeater + self.na = np.where(unique_codes == -1)[0][0] if -1 in unique_codes else None + unique_codes = unique_codes[unique_codes != -1] self.removed_level = self.removed_level.take(unique_codes) self.removed_level_full = self.removed_level_full.take(unique_codes) @@ -381,11 +386,22 @@ def _repeater(self) -> np.ndarray: # In this case, we remap the new codes to the original level: repeater = self.removed_level_full.get_indexer(self.removed_level) if self.lift: - repeater = np.insert(repeater, 0, -1) + if not self.sort and self.na: + repeater = np.insert(repeater, self.na, -1) + else: + repeater = np.insert(repeater, 0, -1) else: # Otherwise, we just use each level item exactly once: stride = len(self.removed_level) + self.lift - repeater = np.arange(stride) - self.lift + if self.sort or not self.na: + repeater = np.arange(stride) - self.lift + else : + #move the -1 to the position at self.na + repeater = np.arange(stride) + if(self.na): + repeater[self.na] = -1 + if(self.na + 1) < len(repeater): + repeater[self.na + 1:] -= 1 return repeater @@ -1049,7 +1065,7 @@ def stack_reshape( else: 
data.columns = default_index(len(data.columns)) buf.append(data) - + if len(buf) > 0 and not frame.empty: result = concat(buf, ignore_index=True) else: diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 22fdfd3a01408..83394ad64bdd4 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1605,6 +1605,84 @@ def test_stack_sort_false(future_stack): tm.assert_frame_equal(result, expected) +def assert_na_safe_equal(left, right): + """Compare DataFrames ignoring NA type differences""" + left = left.rename(columns={pd.NA: np.nan}, level=1) + right = right.rename(columns={pd.NA: np.nan}, level=1) + tm.assert_frame_equal(left, right, check_dtype=False) + +def test_unstack_sort_false_na(): + # GH 61221 + levels1 = ['b','a'] + levels2 = pd.Index([1, 2, 3, pd.NA], dtype=pd.Int64Dtype()) + index = pd.MultiIndex.from_product([levels1, levels2], names=['level1', 'level2']) + df = pd.DataFrame(dict(value=range(len(index))), index=index) + result = df.unstack(level='level2', sort=False) + expected = pd.DataFrame( + { + ('value', 1): [0, 4], + ('value', 2): [1, 5], + ('value', 3): [2, 6], + ('value', pd.Int64Dtype().na_value): [3, 7] + }, + index=pd.Index(['b', 'a'], name='level1'), + columns=pd.MultiIndex.from_tuples([ + ('value', 1), ('value', 2), ('value', 3), ('value', pd.Int64Dtype().na_value) + ], names=[None, 'level2']) + ) + assert_na_safe_equal(result, expected) + levels2 = pd.Index([pd.NA, 1, 2, 3], dtype=pd.Int64Dtype()) + index = pd.MultiIndex.from_product([levels1, levels2], names=['level1', 'level2']) + df = pd.DataFrame(dict(value=range(len(index))), index=index) + result = df.unstack(level='level2', sort=False) + expected = pd.DataFrame( + { + ('value', pd.Int64Dtype().na_value): [0, 4], + ('value', 1): [1, 5], + ('value', 2): [2, 6], + ('value', 3): [3, 7] # Use actual pd.NA object + }, + index=pd.Index(['b', 'a'], name='level1'), + 
columns=pd.MultiIndex.from_tuples([ + ('value', pd.Int64Dtype().na_value), ('value', 1), ('value', 2), ('value', 3) + ], names=[None, 'level2']) + ) + assert_na_safe_equal(result, expected) + levels2 = pd.Index([ 1, pd.NA, 2, 3], dtype=pd.Int64Dtype()) + index = pd.MultiIndex.from_product([levels1, levels2], names=['level1', 'level2']) + df = pd.DataFrame(dict(value=range(len(index))), index=index) + result = df.unstack(level='level2', sort=False) + expected = pd.DataFrame( + { + ('value', 1): [0, 4], + ('value', pd.Int64Dtype().na_value): [1, 5], + ('value', 2): [2, 6], + ('value', 3): [3, 7] # Use actual pd.NA object + }, + index=pd.Index(['b', 'a'], name='level1'), + columns=pd.MultiIndex.from_tuples([ + ('value', 1), ('value', pd.Int64Dtype().na_value), ('value', 2), ('value', 3) + ], names=[None, 'level2']) + ) + assert_na_safe_equal(result, expected) + levels2 = pd.Index([3, pd.NA, 1, 2], dtype=pd.Int64Dtype()) + index = pd.MultiIndex.from_product([levels1, levels2], names=['level1', 'level2']) + df = pd.DataFrame(dict(value=range(len(index))), index=index) + result = df.unstack(level='level2', sort=False) + expected = pd.DataFrame( + { + ('value', 3): [0, 4], + ('value', pd.Int64Dtype().na_value): [1, 5], + ('value', 1): [2, 6], + ('value', 2): [3, 7] # Use actual pd.NA object + }, + index=pd.Index(['b', 'a'], name='level1'), + columns=pd.MultiIndex.from_tuples([ + ('value', 3), ('value', pd.Int64Dtype().na_value), ('value', 1), ('value', 2) + ], names=[None, 'level2']) + ) + assert_na_safe_equal(result, expected) + @pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated") def test_stack_sort_false_multi_level(future_stack): # GH 15105 From c0a7c806494199417f29044292fb24f292404f09 Mon Sep 17 00:00:00 2001 From: Gabe Small Date: Thu, 3 Apr 2025 11:03:25 -0500 Subject: [PATCH 2/7] BUG: Fix #61221: Exception with unstack(sort=False) and NA in index. 
--- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e6fafc8b1b14c..618eb3194d248 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -798,6 +798,7 @@ Reshaping - Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`) - Bug in :meth:`DataFrame.stack` with the new implementation where ``ValueError`` is raised when ``level=[]`` (:issue:`60740`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`) +- Bug in :meth:`DataFrame.unstack` raising an error or returning incorrectly ordered columns when ``sort=False`` and the index level being unstacked contains NA values (:issue:`61221`) - Bug in :meth:`concat` where concatenating DataFrame and Series with ``ignore_index = True`` drops the series name (:issue:`60723`, :issue:`56257`) Sparse From 7a8fddb6656c73ff785303efd65c185c715829f5 Mon Sep 17 00:00:00 2001 From: Gabe Small Date: Thu, 3 Apr 2025 11:15:48 -0500 Subject: [PATCH 3/7] fixed formatting --- pandas/core/reshape/reshape.py | 16 +-- pandas/tests/frame/test_stack_unstack.py | 158 +++++++++++++---------- 2 files changed, 100 insertions(+), 74 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 204282998c095..bf40dda63cc9f 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -135,10 +135,10 @@ def __init__( if not self.sort: unique_codes = unique(self.index.codes[self.level]) # Bug Fix GH 61221 - # The -1 in the unsorted unique codes causes for doubling and an eventual ValueError + # The -1 in the unsorted unique codes causes for errors # saving the NA location to be used in the repeater self.na = np.where(unique_codes == -1)[0][0] if -1 in unique_codes else None - unique_codes = unique_codes[unique_codes != -1] + 
unique_codes = unique_codes[unique_codes != -1] self.removed_level = self.removed_level.take(unique_codes) self.removed_level_full = self.removed_level_full.take(unique_codes) @@ -395,13 +395,13 @@ def _repeater(self) -> np.ndarray: stride = len(self.removed_level) + self.lift if self.sort or not self.na: repeater = np.arange(stride) - self.lift - else : - #move the -1 to the position at self.na + else: + # move the -1 to the position at self.na repeater = np.arange(stride) - if(self.na): + if self.na: repeater[self.na] = -1 - if(self.na + 1) < len(repeater): - repeater[self.na + 1:] -= 1 + if (self.na + 1) < len(repeater): + repeater[self.na + 1 :] -= 1 return repeater @@ -1065,7 +1065,7 @@ def stack_reshape( else: data.columns = default_index(len(data.columns)) buf.append(data) - + if len(buf) > 0 and not frame.empty: result = concat(buf, ignore_index=True) else: diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 83394ad64bdd4..617e10c5300be 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1610,79 +1610,105 @@ def assert_na_safe_equal(left, right): left = left.rename(columns={pd.NA: np.nan}, level=1) right = right.rename(columns={pd.NA: np.nan}, level=1) tm.assert_frame_equal(left, right, check_dtype=False) - + + def test_unstack_sort_false_na(): # GH 61221 - levels1 = ['b','a'] - levels2 = pd.Index([1, 2, 3, pd.NA], dtype=pd.Int64Dtype()) - index = pd.MultiIndex.from_product([levels1, levels2], names=['level1', 'level2']) - df = pd.DataFrame(dict(value=range(len(index))), index=index) - result = df.unstack(level='level2', sort=False) - expected = pd.DataFrame( - { - ('value', 1): [0, 4], - ('value', 2): [1, 5], - ('value', 3): [2, 6], - ('value', pd.Int64Dtype().na_value): [3, 7] - }, - index=pd.Index(['b', 'a'], name='level1'), - columns=pd.MultiIndex.from_tuples([ - ('value', 1), ('value', 2), ('value', 3), ('value', pd.Int64Dtype().na_value) - ], 
names=[None, 'level2']) - ) + levels1 = ["b", "a"] + levels2 = Index([1, 2, 3, pd.NA], dtype=pd.Int64Dtype()) + index = MultiIndex.from_product([levels1, levels2], names=["level1", "level2"]) + df = DataFrame({"value": range(len(index))}, index=index) + result = df.unstack(level="level2", sort=False) + expected = DataFrame( + { + ("value", 1): [0, 4], + ("value", 2): [1, 5], + ("value", 3): [2, 6], + ("value", pd.Int64Dtype().na_value): [3, 7], + }, + index=Index(["b", "a"], name="level1"), + columns=MultiIndex.from_tuples( + [ + ("value", 1), + ("value", 2), + ("value", 3), + ("value", pd.Int64Dtype().na_value), + ], + names=[None, "level2"], + ), + ) assert_na_safe_equal(result, expected) - levels2 = pd.Index([pd.NA, 1, 2, 3], dtype=pd.Int64Dtype()) - index = pd.MultiIndex.from_product([levels1, levels2], names=['level1', 'level2']) - df = pd.DataFrame(dict(value=range(len(index))), index=index) - result = df.unstack(level='level2', sort=False) - expected = pd.DataFrame( - { - ('value', pd.Int64Dtype().na_value): [0, 4], - ('value', 1): [1, 5], - ('value', 2): [2, 6], - ('value', 3): [3, 7] # Use actual pd.NA object - }, - index=pd.Index(['b', 'a'], name='level1'), - columns=pd.MultiIndex.from_tuples([ - ('value', pd.Int64Dtype().na_value), ('value', 1), ('value', 2), ('value', 3) - ], names=[None, 'level2']) - ) + levels2 = Index([pd.NA, 1, 2, 3], dtype=pd.Int64Dtype()) + index = MultiIndex.from_product([levels1, levels2], names=["level1", "level2"]) + df = DataFrame({"value": range(len(index))}, index=index) + result = df.unstack(level="level2", sort=False) + expected = DataFrame( + { + ("value", pd.Int64Dtype().na_value): [0, 4], + ("value", 1): [1, 5], + ("value", 2): [2, 6], + ("value", 3): [3, 7], + }, + index=Index(["b", "a"], name="level1"), + columns=MultiIndex.from_tuples( + [ + ("value", pd.Int64Dtype().na_value), + ("value", 1), + ("value", 2), + ("value", 3), + ], + names=[None, "level2"], + ), + ) assert_na_safe_equal(result, expected) - levels2 = 
pd.Index([ 1, pd.NA, 2, 3], dtype=pd.Int64Dtype()) - index = pd.MultiIndex.from_product([levels1, levels2], names=['level1', 'level2']) - df = pd.DataFrame(dict(value=range(len(index))), index=index) - result = df.unstack(level='level2', sort=False) - expected = pd.DataFrame( - { - ('value', 1): [0, 4], - ('value', pd.Int64Dtype().na_value): [1, 5], - ('value', 2): [2, 6], - ('value', 3): [3, 7] # Use actual pd.NA object - }, - index=pd.Index(['b', 'a'], name='level1'), - columns=pd.MultiIndex.from_tuples([ - ('value', 1), ('value', pd.Int64Dtype().na_value), ('value', 2), ('value', 3) - ], names=[None, 'level2']) - ) + levels2 = Index([1, pd.NA, 2, 3], dtype=pd.Int64Dtype()) + index = MultiIndex.from_product([levels1, levels2], names=["level1", "level2"]) + df = DataFrame({"value": range(len(index))}, index=index) + result = df.unstack(level="level2", sort=False) + expected = DataFrame( + { + ("value", 1): [0, 4], + ("value", pd.Int64Dtype().na_value): [1, 5], + ("value", 2): [2, 6], + ("value", 3): [3, 7], + }, + index=Index(["b", "a"], name="level1"), + columns=MultiIndex.from_tuples( + [ + ("value", 1), + ("value", pd.Int64Dtype().na_value), + ("value", 2), + ("value", 3), + ], + names=[None, "level2"], + ), + ) assert_na_safe_equal(result, expected) - levels2 = pd.Index([3, pd.NA, 1, 2], dtype=pd.Int64Dtype()) - index = pd.MultiIndex.from_product([levels1, levels2], names=['level1', 'level2']) - df = pd.DataFrame(dict(value=range(len(index))), index=index) - result = df.unstack(level='level2', sort=False) - expected = pd.DataFrame( - { - ('value', 3): [0, 4], - ('value', pd.Int64Dtype().na_value): [1, 5], - ('value', 1): [2, 6], - ('value', 2): [3, 7] # Use actual pd.NA object - }, - index=pd.Index(['b', 'a'], name='level1'), - columns=pd.MultiIndex.from_tuples([ - ('value', 3), ('value', pd.Int64Dtype().na_value), ('value', 1), ('value', 2) - ], names=[None, 'level2']) - ) + levels2 = Index([3, pd.NA, 1, 2], dtype=pd.Int64Dtype()) + index = 
MultiIndex.from_product([levels1, levels2], names=["level1", "level2"]) + df = DataFrame({"value": range(len(index))}, index=index) + result = df.unstack(level="level2", sort=False) + expected = DataFrame( + { + ("value", 3): [0, 4], + ("value", pd.Int64Dtype().na_value): [1, 5], + ("value", 1): [2, 6], + ("value", 2): [3, 7], # Use actual pd.NA object + }, + index=Index(["b", "a"], name="level1"), + columns=MultiIndex.from_tuples( + [ + ("value", 3), + ("value", pd.Int64Dtype().na_value), + ("value", 1), + ("value", 2), + ], + names=[None, "level2"], + ), + ) assert_na_safe_equal(result, expected) + @pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated") def test_stack_sort_false_multi_level(future_stack): # GH 15105 From a39746647eb85d34aa03401c1596d4ce4cbd3c90 Mon Sep 17 00:00:00 2001 From: Gabe Small Date: Thu, 3 Apr 2025 13:46:16 -0500 Subject: [PATCH 4/7] fixed issue with unsorted unstack, should now work --- pandas/core/reshape/reshape.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index bf40dda63cc9f..515e729e1baa2 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -175,7 +175,13 @@ def _indexer_and_to_sort( codes = list(self.index.codes) if not self.sort: # Create new codes considering that labels are already sorted - codes = [factorize(code)[0] for code in codes] + # Make sure to preserve the -1 values before factorizing + codes = [] + for code in self.index.codes: + mask = code != -1 + factorized = np.full_like(code, -1) + factorized[mask] = factorize(code[mask])[0] + codes.append(factorized) levs = list(self.index.levels) to_sort = codes[:v] + codes[v + 1 :] + [codes[v]] sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]]) @@ -194,9 +200,15 @@ def sorted_labels(self) -> list[np.ndarray]: return to_sort def _make_sorted_values(self, values: np.ndarray) -> np.ndarray: 
- indexer, _ = self._indexer_and_to_sort - sorted_values = algos.take_nd(values, indexer, axis=0) - return sorted_values + if self.sort: + indexer, _ = self._indexer_and_to_sort + sorted_values = algos.take_nd(values, indexer, axis=0) + return sorted_values + level_sizes = tuple(len(level) for level in self.new_index_levels) + group_ids = get_group_index( + self.sorted_labels[:-1], level_sizes, sort=False, xnull=False + ) + return values[np.argsort(group_ids, kind="mergesort")] def _make_selectors(self) -> None: new_levels = self.new_index_levels @@ -581,7 +593,6 @@ def _unstack_frame( unstacker = _Unstacker( obj.index, level=level, constructor=obj._constructor, sort=sort ) - if not obj._can_fast_transpose: mgr = obj._mgr.unstack(unstacker, fill_value=fill_value) return obj._constructor_from_mgr(mgr, axes=mgr.axes) From 3539ac6e88d53a2326516ddf2dc45f2d0d4375bb Mon Sep 17 00:00:00 2001 From: Gabe Small Date: Fri, 4 Apr 2025 12:53:42 -0500 Subject: [PATCH 5/7] Instead of creating variable self.na, constructed na index locally --- pandas/core/reshape/reshape.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 515e729e1baa2..bc03455c53d97 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -137,7 +137,6 @@ def __init__( # Bug Fix GH 61221 # The -1 in the unsorted unique codes causes for errors # saving the NA location to be used in the repeater - self.na = np.where(unique_codes == -1)[0][0] if -1 in unique_codes else None unique_codes = unique_codes[unique_codes != -1] self.removed_level = self.removed_level.take(unique_codes) self.removed_level_full = self.removed_level_full.take(unique_codes) @@ -398,22 +397,22 @@ def _repeater(self) -> np.ndarray: # In this case, we remap the new codes to the original level: repeater = self.removed_level_full.get_indexer(self.removed_level) if self.lift: - if not self.sort and self.na: - 
repeater = np.insert(repeater, self.na, -1) - else: - repeater = np.insert(repeater, 0, -1) + na_index = (self.index.codes[self.level] == -1).nonzero()[0][0] + repeater = np.insert(repeater, na_index, -1) + else: # Otherwise, we just use each level item exactly once: stride = len(self.removed_level) + self.lift - if self.sort or not self.na: + if self.sort or not self.lift: repeater = np.arange(stride) - self.lift else: - # move the -1 to the position at self.na + # move the -1 to the position at na_index + na_index = (self.index.codes[self.level] == -1).nonzero()[0][0] repeater = np.arange(stride) - if self.na: - repeater[self.na] = -1 - if (self.na + 1) < len(repeater): - repeater[self.na + 1 :] -= 1 + if na_index: + repeater[na_index] = -1 + if (na_index + 1) < len(repeater): + repeater[na_index + 1 :] -= 1 return repeater From 64f5173f493dbbcc914fc6a67618b9e3519a7887 Mon Sep 17 00:00:00 2001 From: Gabe Small Date: Fri, 4 Apr 2025 15:19:11 -0500 Subject: [PATCH 6/7] fixed issues with local variable --- pandas/core/reshape/reshape.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index bc03455c53d97..16d56ce7c8e01 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -397,22 +397,22 @@ def _repeater(self) -> np.ndarray: # In this case, we remap the new codes to the original level: repeater = self.removed_level_full.get_indexer(self.removed_level) if self.lift: - na_index = (self.index.codes[self.level] == -1).nonzero()[0][0] - repeater = np.insert(repeater, na_index, -1) - + if not self.sort: + na_index = (self.index.codes[self.level] == -1).nonzero()[0][0] + repeater = np.insert(repeater, na_index, -1) + else: + repeater = np.insert(repeater, 0, -1) else: # Otherwise, we just use each level item exactly once: stride = len(self.removed_level) + self.lift if self.sort or not self.lift: - repeater = np.arange(stride) - self.lift + 
repeater = np.arange(stride) else: - # move the -1 to the position at na_index na_index = (self.index.codes[self.level] == -1).nonzero()[0][0] - repeater = np.arange(stride) - if na_index: - repeater[na_index] = -1 - if (na_index + 1) < len(repeater): - repeater[na_index + 1 :] -= 1 + repeater = np.arange(stride) - self.lift + if self.na: + repeater[self.na] = -1 + repeater[: self.na] += 1 return repeater From e2b38b12e3d0898f8f2babc4bd183459198c29d6 Mon Sep 17 00:00:00 2001 From: Gabe Small Date: Fri, 4 Apr 2025 15:41:33 -0500 Subject: [PATCH 7/7] fixed the fix -oops --- pandas/core/reshape/reshape.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 16d56ce7c8e01..6ea6d802d0b90 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -406,13 +406,13 @@ def _repeater(self) -> np.ndarray: # Otherwise, we just use each level item exactly once: stride = len(self.removed_level) + self.lift if self.sort or not self.lift: - repeater = np.arange(stride) + repeater = np.arange(stride) - self.lift else: na_index = (self.index.codes[self.level] == -1).nonzero()[0][0] repeater = np.arange(stride) - self.lift - if self.na: - repeater[self.na] = -1 - repeater[: self.na] += 1 + if na_index: + repeater[na_index] = -1 + repeater[:na_index] += 1 return repeater