diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e6fafc8b1b14c..618eb3194d248 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -798,6 +798,7 @@ Reshaping - Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`) - Bug in :meth:`DataFrame.stack` with the new implementation where ``ValueError`` is raised when ``level=[]`` (:issue:`60740`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`) +- Bug in :meth:`DataFrame.unstack` raising an error or producing incorrectly ordered columns when ``sort=False`` and the unstacked index level contains NA values (:issue:`61221`) - Bug in :meth:`concat` where concatenating DataFrame and Series with ``ignore_index = True`` drops the series name (:issue:`60723`, :issue:`56257`) Sparse diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index c60fe71a7ff28..6ea6d802d0b90 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -134,6 +134,10 @@ def __init__( self.removed_level_full = index.levels[self.level] if not self.sort: unique_codes = unique(self.index.codes[self.level]) + # Bug Fix GH 61221 + # The -1 (NA) entry in the unsorted unique codes causes errors, + # so drop it here; the NA position is re-derived in the repeater + unique_codes = unique_codes[unique_codes != -1] self.removed_level = self.removed_level.take(unique_codes) self.removed_level_full = self.removed_level_full.take(unique_codes) @@ -170,7 +174,13 @@ def _indexer_and_to_sort( codes = list(self.index.codes) if not self.sort: # Create new codes considering that labels are already sorted - codes = [factorize(code)[0] for code in codes] + # Make sure to preserve the -1 values before factorizing + codes = [] + for code in self.index.codes: + mask = code != -1 + factorized = np.full_like(code, -1) + factorized[mask] = 
factorize(code[mask])[0] + codes.append(factorized) levs = list(self.index.levels) to_sort = codes[:v] + codes[v + 1 :] + [codes[v]] sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]]) @@ -189,9 +199,15 @@ def sorted_labels(self) -> list[np.ndarray]: return to_sort def _make_sorted_values(self, values: np.ndarray) -> np.ndarray: - indexer, _ = self._indexer_and_to_sort - sorted_values = algos.take_nd(values, indexer, axis=0) - return sorted_values + if self.sort: + indexer, _ = self._indexer_and_to_sort + sorted_values = algos.take_nd(values, indexer, axis=0) + return sorted_values + level_sizes = tuple(len(level) for level in self.new_index_levels) + group_ids = get_group_index( + self.sorted_labels[:-1], level_sizes, sort=False, xnull=False + ) + return values[np.argsort(group_ids, kind="mergesort")] def _make_selectors(self) -> None: new_levels = self.new_index_levels @@ -381,11 +397,22 @@ def _repeater(self) -> np.ndarray: # In this case, we remap the new codes to the original level: repeater = self.removed_level_full.get_indexer(self.removed_level) if self.lift: - repeater = np.insert(repeater, 0, -1) + if not self.sort: + na_index = (self.index.codes[self.level] == -1).nonzero()[0][0] + repeater = np.insert(repeater, na_index, -1) + else: + repeater = np.insert(repeater, 0, -1) else: # Otherwise, we just use each level item exactly once: stride = len(self.removed_level) + self.lift - repeater = np.arange(stride) - self.lift + if self.sort or not self.lift: + repeater = np.arange(stride) - self.lift + else: + na_index = (self.index.codes[self.level] == -1).nonzero()[0][0] + repeater = np.arange(stride) - self.lift + if na_index: + repeater[na_index] = -1 + repeater[:na_index] += 1 return repeater @@ -565,7 +592,6 @@ def _unstack_frame( unstacker = _Unstacker( obj.index, level=level, constructor=obj._constructor, sort=sort ) - if not obj._can_fast_transpose: mgr = obj._mgr.unstack(unstacker, fill_value=fill_value) return 
obj._constructor_from_mgr(mgr, axes=mgr.axes) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 22fdfd3a01408..617e10c5300be 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1605,6 +1605,110 @@ def test_stack_sort_false(future_stack): tm.assert_frame_equal(result, expected) +def assert_na_safe_equal(left, right): + """Compare DataFrames ignoring NA type differences""" + left = left.rename(columns={pd.NA: np.nan}, level=1) + right = right.rename(columns={pd.NA: np.nan}, level=1) + tm.assert_frame_equal(left, right, check_dtype=False) + + +def test_unstack_sort_false_na(): + # GH 61221 + levels1 = ["b", "a"] + levels2 = Index([1, 2, 3, pd.NA], dtype=pd.Int64Dtype()) + index = MultiIndex.from_product([levels1, levels2], names=["level1", "level2"]) + df = DataFrame({"value": range(len(index))}, index=index) + result = df.unstack(level="level2", sort=False) + expected = DataFrame( + { + ("value", 1): [0, 4], + ("value", 2): [1, 5], + ("value", 3): [2, 6], + ("value", pd.Int64Dtype().na_value): [3, 7], + }, + index=Index(["b", "a"], name="level1"), + columns=MultiIndex.from_tuples( + [ + ("value", 1), + ("value", 2), + ("value", 3), + ("value", pd.Int64Dtype().na_value), + ], + names=[None, "level2"], + ), + ) + assert_na_safe_equal(result, expected) + levels2 = Index([pd.NA, 1, 2, 3], dtype=pd.Int64Dtype()) + index = MultiIndex.from_product([levels1, levels2], names=["level1", "level2"]) + df = DataFrame({"value": range(len(index))}, index=index) + result = df.unstack(level="level2", sort=False) + expected = DataFrame( + { + ("value", pd.Int64Dtype().na_value): [0, 4], + ("value", 1): [1, 5], + ("value", 2): [2, 6], + ("value", 3): [3, 7], + }, + index=Index(["b", "a"], name="level1"), + columns=MultiIndex.from_tuples( + [ + ("value", pd.Int64Dtype().na_value), + ("value", 1), + ("value", 2), + ("value", 3), + ], + names=[None, "level2"], + ), + ) + 
assert_na_safe_equal(result, expected) + levels2 = Index([1, pd.NA, 2, 3], dtype=pd.Int64Dtype()) + index = MultiIndex.from_product([levels1, levels2], names=["level1", "level2"]) + df = DataFrame({"value": range(len(index))}, index=index) + result = df.unstack(level="level2", sort=False) + expected = DataFrame( + { + ("value", 1): [0, 4], + ("value", pd.Int64Dtype().na_value): [1, 5], + ("value", 2): [2, 6], + ("value", 3): [3, 7], + }, + index=Index(["b", "a"], name="level1"), + columns=MultiIndex.from_tuples( + [ + ("value", 1), + ("value", pd.Int64Dtype().na_value), + ("value", 2), + ("value", 3), + ], + names=[None, "level2"], + ), + ) + assert_na_safe_equal(result, expected) + levels2 = Index([3, pd.NA, 1, 2], dtype=pd.Int64Dtype()) + index = MultiIndex.from_product([levels1, levels2], names=["level1", "level2"]) + df = DataFrame({"value": range(len(index))}, index=index) + result = df.unstack(level="level2", sort=False) + expected = DataFrame( + { + ("value", 3): [0, 4], + ("value", pd.Int64Dtype().na_value): [1, 5], + ("value", 1): [2, 6], + ("value", 2): [3, 7], + }, + index=Index(["b", "a"], name="level1"), + columns=MultiIndex.from_tuples( + [ + ("value", 3), + ("value", pd.Int64Dtype().na_value), + ("value", 1), + ("value", 2), + ], + names=[None, "level2"], + ), + ) + assert_na_safe_equal(result, expected) + + @pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated") def test_stack_sort_false_multi_level(future_stack): # GH 15105