Adapt value_counts in the Beam DataFrame API to pandas 2 result naming and
fix NaN handling when a column subset is given.

* frames.py, first hunk: on pandas >= 2.0 the result is named "proportion"
  when normalize is set and "count" otherwise, and the index name is taken
  from self.name when present; older pandas keeps the unnamed index.
* frames.py, second hunk: dropna is restricted to the `subset` columns so
  NaNs in unrelated columns no longer drop rows, and the result gets the same
  pandas 2 naming.
* frames_test.py: adds a value_counts('num_wings') case covering NaNs in
  columns outside the subset.

NOTE(review): line breaks and indentation below were reconstructed from the
hunk line counts; confirm context lines against the target files before
applying.

diff --git a/sdks/python/apache_beam/dataframe/frames.py b/sdks/python/apache_beam/dataframe/frames.py
index 0d9b22ae3f9e..813ba8c50109 100644
--- a/sdks/python/apache_beam/dataframe/frames.py
+++ b/sdks/python/apache_beam/dataframe/frames.py
@@ -2341,8 +2341,13 @@ def value_counts(
 
     result = column.groupby(column, dropna=dropna).size()
 
-    # groupby.size() names the index, which we don't need
-    result.index.name = None
+    # Pandas 2 introduces new naming for the results.
+    if PD_VERSION >= (2, 0):
+      result.index.name = getattr(self, "name", None)
+      result.name = "proportion" if normalize else "count"
+    else:
+      # groupby.size() names the index, which we don't need
+      result.index.name = None
 
     if normalize:
       return result / column.length()
@@ -3994,12 +3999,18 @@ def value_counts(self, subset=None, sort=False, normalize=False,
     columns = subset or list(self.columns)
 
     if dropna:
-      dropped = self.dropna()
+      # Must include subset here because otherwise we spuriously drop NAs due
+      # to columns outside our subset.
+      dropped = self.dropna(subset=subset)
     else:
       dropped = self
 
     result = dropped.groupby(columns, dropna=dropna).size()
 
+    # Pandas 2 introduces new naming for the results.
+    if PD_VERSION >= (2, 0):
+      result.name = "proportion" if normalize else "count"
+
     if normalize:
       return result/dropped.length()
     else:
diff --git a/sdks/python/apache_beam/dataframe/frames_test.py b/sdks/python/apache_beam/dataframe/frames_test.py
index fa121aa85c30..5ef2af2892d2 100644
--- a/sdks/python/apache_beam/dataframe/frames_test.py
+++ b/sdks/python/apache_beam/dataframe/frames_test.py
@@ -688,6 +688,8 @@ def test_value_counts_with_nans(self):
 
     self._run_test(lambda df: df.value_counts(), df)
     self._run_test(lambda df: df.value_counts(normalize=True), df)
+    # Ensure we don't drop rows due to nan values in unused columns.
+    self._run_test(lambda df: df.value_counts('num_wings'), df)
 
     if PD_VERSION >= (1, 3):
       # dropna=False is new in pandas 1.3