|
12 | 12 | # See the License for the specific language governing permissions and
|
13 | 13 | # limitations under the License.
|
14 | 14 |
|
| 15 | +import numpy as np |
15 | 16 | import pandas as pd
|
16 | 17 |
|
17 | 18 | try:
|
|
27 | 28 | from ..arrays import ArrowListArray, ArrowListDtype
|
28 | 29 | from .core import DataFrameReductionOperand, DataFrameReductionMixin, CustomReduction
|
29 | 30 |
|
| 31 | +cp = lazy_import("cupy", globals=globals(), rename="cp") |
30 | 32 | cudf = lazy_import("cudf", globals=globals())
|
31 | 33 |
|
32 | 34 |
|
class NuniqueReduction(CustomReduction):
    """Staged reduction that counts the distinct values of a Series/DataFrame.

    ``pre`` and ``agg`` carry the distinct values of each column (``axis=0``)
    or row (otherwise) forward as a single list-like cell; ``post`` flattens
    those cells and turns them into a count.

    Parameters
    ----------
    name : str
        Name forwarded to ``CustomReduction.__init__``.
    axis : int
        ``0`` reduces column by column; any other value reduces row by row.
    dropna : bool
        When true, missing values are not counted by ``post``.
    use_arrow_dtype : bool
        When true (and not running on GPU), intermediate cells are wrapped in
        an ``ArrowListArray`` whenever pyarrow accepts the values.
    is_gpu : bool
        Forwarded to ``CustomReduction.__init__``; selects cupy/cudf over
        numpy/pandas via ``self.is_gpu()``.
    """

    # Flags read by the reduction framework; their exact semantics are defined
    # by CustomReduction, which is outside this file section.
    pre_with_agg = True
    post_with_agg = True

    def __init__(
        self, name="nunique", axis=0, dropna=True, use_arrow_dtype=False, is_gpu=False
    ):
        super().__init__(name, is_gpu=is_gpu)
        self._axis = axis
        self._dropna = dropna
        self._use_arrow_dtype = use_arrow_dtype

    def _get_modules(self):
        """Return the ``(array module, dataframe module)`` pair to compute with."""
        if not self.is_gpu():
            return np, pd
        else:  # pragma: no cover
            return cp, cudf

    def _drop_duplicates(self, value, explode=False, agg=False):
        """Deduplicate ``value`` and return either the values or their count.

        Parameters
        ----------
        value
            Series-like object exposing ``.values`` (and optionally
            ``.to_numpy()``).
        explode : bool
            When true, ``value`` is treated as a sequence of arrays that are
            concatenated before deduplication.
        agg : bool
            When false, return a one-element list-like holding the distinct
            values. When true, return their count, honouring ``dropna``.
        """
        xp, xdf = self._get_modules()
        if self._use_arrow_dtype and xp is not cp and hasattr(value, "to_numpy"):
            value = value.to_numpy()
        else:
            value = value.values

        if explode:
            # Each cell holds an array of distinct values; flatten them all.
            value = xp.concatenate(value)

        value = xdf.unique(value)

        if agg:
            if self._dropna:
                return xp.sum(xdf.notna(value))
            return len(value)

        if not self._use_arrow_dtype or xp is cp:
            return [value]
        try:
            return ArrowListArray([value])
        except pa.ArrowInvalid:
            # fallback due to diverse dtypes
            return [value]

    def _collect_unique(self, in_data, explode):
        """Shared body of ``pre`` and ``agg``: gather distinct values per slice."""
        _, xdf = self._get_modules()
        if isinstance(in_data, xdf.Series):
            unique_values = self._drop_duplicates(in_data, explode=explode)
            return xdf.Series(unique_values, name=in_data.name, dtype=object)

        if self._axis == 0:
            # ``items()`` replaces ``iteritems()``, which pandas 2.0 removed.
            data = {
                d: self._drop_duplicates(v, explode=explode)
                for d, v in in_data.items()
            }
            return xdf.DataFrame(data, copy=False, dtype=object)

        df = xdf.DataFrame(columns=[0])
        for d, v in in_data.iterrows():
            df.loc[d] = self._drop_duplicates(v, explode=explode)
        return df

    def pre(self, in_data):  # noqa: W0221  # pylint: disable=arguments-differ
        """Collect the distinct raw values of ``in_data`` into list-like cells."""
        return self._collect_unique(in_data, explode=False)

    def agg(self, in_data):  # noqa: W0221  # pylint: disable=arguments-differ
        """Merge list-like cells of distinct values into deduplicated cells."""
        return self._collect_unique(in_data, explode=True)

    def post(self, in_data):  # noqa: W0221  # pylint: disable=arguments-differ
        """Flatten the list-like cells of ``in_data`` and count distinct values.

        Returns a scalar count for a Series input, otherwise a Series of
        counts keyed by column label (``axis=0``) or row label.
        """
        _, xdf = self._get_modules()
        if isinstance(in_data, xdf.Series):
            return self._drop_duplicates(in_data, explode=True, agg=True)

        # ``items()`` replaces ``iteritems()``, which pandas 2.0 removed.
        in_data_iter = in_data.items() if self._axis == 0 else in_data.iterrows()
        data = {
            d: self._drop_duplicates(v, explode=True, agg=True)
            for d, v in in_data_iter
        }
        return xdf.Series(data)
|
107 | 125 |
|
108 | 126 |
|
|
0 commit comments