Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Fix #61221: Exception with unstack(sort=False) and NA in index. #61226

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -798,6 +798,7 @@ Reshaping
- Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`)
- Bug in :meth:`DataFrame.stack` with the new implementation where ``ValueError`` is raised when ``level=[]`` (:issue:`60740`)
- Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtensionDtype` (:issue:`59123`)
- Bug in :meth:`DataFrame.unstack` raising an error or returning columns in the wrong order when ``sort=False`` and the unstacked index level contains NA values (:issue:`61221`)
- Bug in :meth:`concat` where concatenating DataFrame and Series with ``ignore_index = True`` drops the series name (:issue:`60723`, :issue:`56257`)

Sparse
Expand Down
40 changes: 33 additions & 7 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,10 @@ def __init__(
self.removed_level_full = index.levels[self.level]
if not self.sort:
unique_codes = unique(self.index.codes[self.level])
# GH 61221
# Drop the -1 (NA) code from the unsorted unique codes, since
# passing it to ``take`` causes errors; the NA position is
# reinserted later in ``_repeater``.
unique_codes = unique_codes[unique_codes != -1]
self.removed_level = self.removed_level.take(unique_codes)
self.removed_level_full = self.removed_level_full.take(unique_codes)

Expand Down Expand Up @@ -170,7 +174,13 @@ def _indexer_and_to_sort(
codes = list(self.index.codes)
if not self.sort:
# Create new codes considering that labels are already sorted
codes = [factorize(code)[0] for code in codes]
# Make sure to preserve the -1 values before factorizing
codes = []
for code in self.index.codes:
mask = code != -1
factorized = np.full_like(code, -1)
factorized[mask] = factorize(code[mask])[0]
codes.append(factorized)
levs = list(self.index.levels)
to_sort = codes[:v] + codes[v + 1 :] + [codes[v]]
sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]])
Expand All @@ -189,9 +199,15 @@ def sorted_labels(self) -> list[np.ndarray]:
return to_sort

def _make_sorted_values(self, values: np.ndarray) -> np.ndarray:
indexer, _ = self._indexer_and_to_sort
sorted_values = algos.take_nd(values, indexer, axis=0)
return sorted_values
if self.sort:
indexer, _ = self._indexer_and_to_sort
sorted_values = algos.take_nd(values, indexer, axis=0)
return sorted_values
level_sizes = tuple(len(level) for level in self.new_index_levels)
group_ids = get_group_index(
self.sorted_labels[:-1], level_sizes, sort=False, xnull=False
)
return values[np.argsort(group_ids, kind="mergesort")]

def _make_selectors(self) -> None:
new_levels = self.new_index_levels
Expand Down Expand Up @@ -381,11 +397,22 @@ def _repeater(self) -> np.ndarray:
# In this case, we remap the new codes to the original level:
repeater = self.removed_level_full.get_indexer(self.removed_level)
if self.lift:
repeater = np.insert(repeater, 0, -1)
if not self.sort:
na_index = (self.index.codes[self.level] == -1).nonzero()[0][0]
repeater = np.insert(repeater, na_index, -1)
else:
repeater = np.insert(repeater, 0, -1)
else:
# Otherwise, we just use each level item exactly once:
stride = len(self.removed_level) + self.lift
repeater = np.arange(stride) - self.lift
if self.sort or not self.lift:
repeater = np.arange(stride) - self.lift
else:
na_index = (self.index.codes[self.level] == -1).nonzero()[0][0]
repeater = np.arange(stride) - self.lift
if na_index:
repeater[na_index] = -1
repeater[:na_index] += 1

return repeater

Expand Down Expand Up @@ -565,7 +592,6 @@ def _unstack_frame(
unstacker = _Unstacker(
obj.index, level=level, constructor=obj._constructor, sort=sort
)

if not obj._can_fast_transpose:
mgr = obj._mgr.unstack(unstacker, fill_value=fill_value)
return obj._constructor_from_mgr(mgr, axes=mgr.axes)
Expand Down
104 changes: 104 additions & 0 deletions pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -1605,6 +1605,110 @@ def test_stack_sort_false(future_stack):
tm.assert_frame_equal(result, expected)


def assert_na_safe_equal(left, right):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We shouldn't need this function. Ideally the same NA values should be in the input and output

Copy link
Author

@gsmll gsmll Apr 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, that's down to my own shortcomings: I couldn't figure out how to create a DataFrame with those values, as they just got converted into NaN. If you know how to create the proper DataFrame for the expected result, I can remove that.

Do you have any tips on how I can properly create the expected DataFrame?

"""Compare DataFrames ignoring NA type differences"""
left = left.rename(columns={pd.NA: np.nan}, level=1)
right = right.rename(columns={pd.NA: np.nan}, level=1)
tm.assert_frame_equal(left, right, check_dtype=False)


def test_unstack_sort_false_na():
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If there are multiple test cases here, it's best to split each assertion into its own test.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll do that later as well

# GH 61221
levels1 = ["b", "a"]
levels2 = Index([1, 2, 3, pd.NA], dtype=pd.Int64Dtype())
index = MultiIndex.from_product([levels1, levels2], names=["level1", "level2"])
df = DataFrame({"value": range(len(index))}, index=index)
result = df.unstack(level="level2", sort=False)
expected = DataFrame(
{
("value", 1): [0, 4],
("value", 2): [1, 5],
("value", 3): [2, 6],
("value", pd.Int64Dtype().na_value): [3, 7],
},
index=Index(["b", "a"], name="level1"),
columns=MultiIndex.from_tuples(
[
("value", 1),
("value", 2),
("value", 3),
("value", pd.Int64Dtype().na_value),
],
names=[None, "level2"],
),
)
assert_na_safe_equal(result, expected)
levels2 = Index([pd.NA, 1, 2, 3], dtype=pd.Int64Dtype())
index = MultiIndex.from_product([levels1, levels2], names=["level1", "level2"])
df = DataFrame({"value": range(len(index))}, index=index)
result = df.unstack(level="level2", sort=False)
expected = DataFrame(
{
("value", pd.Int64Dtype().na_value): [0, 4],
("value", 1): [1, 5],
("value", 2): [2, 6],
("value", 3): [3, 7],
},
index=Index(["b", "a"], name="level1"),
columns=MultiIndex.from_tuples(
[
("value", pd.Int64Dtype().na_value),
("value", 1),
("value", 2),
("value", 3),
],
names=[None, "level2"],
),
)
assert_na_safe_equal(result, expected)
levels2 = Index([1, pd.NA, 2, 3], dtype=pd.Int64Dtype())
index = MultiIndex.from_product([levels1, levels2], names=["level1", "level2"])
df = DataFrame({"value": range(len(index))}, index=index)
result = df.unstack(level="level2", sort=False)
expected = DataFrame(
{
("value", 1): [0, 4],
("value", pd.Int64Dtype().na_value): [1, 5],
("value", 2): [2, 6],
("value", 3): [3, 7],
},
index=Index(["b", "a"], name="level1"),
columns=MultiIndex.from_tuples(
[
("value", 1),
("value", pd.Int64Dtype().na_value),
("value", 2),
("value", 3),
],
names=[None, "level2"],
),
)
assert_na_safe_equal(result, expected)
levels2 = Index([3, pd.NA, 1, 2], dtype=pd.Int64Dtype())
index = MultiIndex.from_product([levels1, levels2], names=["level1", "level2"])
df = DataFrame({"value": range(len(index))}, index=index)
result = df.unstack(level="level2", sort=False)
expected = DataFrame(
{
("value", 3): [0, 4],
("value", pd.Int64Dtype().na_value): [1, 5],
("value", 1): [2, 6],
("value", 2): [3, 7], # Use actual pd.NA object
},
index=Index(["b", "a"], name="level1"),
columns=MultiIndex.from_tuples(
[
("value", 3),
("value", pd.Int64Dtype().na_value),
("value", 1),
("value", 2),
],
names=[None, "level2"],
),
)
assert_na_safe_equal(result, expected)


@pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated")
def test_stack_sort_false_multi_level(future_stack):
# GH 15105
Expand Down
Loading