From 8807d0ece4dda3d3832bbe130c520f498aae8718 Mon Sep 17 00:00:00 2001 From: caneff Date: Thu, 21 Sep 2023 12:05:27 -0400 Subject: [PATCH] Fix remaining tests for pandas 2 compatibility (#28524) --- .../apache_beam/dataframe/frames_test.py | 4 ++ .../dataframe/pandas_doctests_test.py | 61 ++++++++++++++----- .../dataframe/pandas_top_level_functions.py | 1 + .../interactive/interactive_runner_test.py | 2 +- 4 files changed, 51 insertions(+), 17 deletions(-) diff --git a/sdks/python/apache_beam/dataframe/frames_test.py b/sdks/python/apache_beam/dataframe/frames_test.py index e3555b50187b..6f7a63c29164 100644 --- a/sdks/python/apache_beam/dataframe/frames_test.py +++ b/sdks/python/apache_beam/dataframe/frames_test.py @@ -1936,6 +1936,8 @@ def test_groupby_sum_min_count(self): self._run_test(lambda df: df.groupby('group').sum(min_count=2), df) + @unittest.skipIf( + PD_VERSION >= (2, 0), "dtypes on groups is deprecated in Pandas 2.") def test_groupby_dtypes(self): self._run_test( lambda df: df.groupby('group').dtypes, GROUPBY_DF, check_proxy=False) @@ -2159,6 +2161,7 @@ def test_dataframe_agg_level(self): level=1, numeric_only=True), GROUPBY_DF) + @unittest.skipIf(PD_VERSION >= (2, 0), "level argument removed in Pandas 2") def test_series_agg_multifunc_level(self): # level= is ignored for multiple agg fns self._run_test( @@ -2181,6 +2184,7 @@ def test_series_mean_skipna(self): self._run_test(lambda df: df.two.mean(skipna=True), df) self._run_test(lambda df: df.three.mean(skipna=True), df) + @unittest.skipIf(PD_VERSION >= (2, 0), "level argument removed in Pandas 2") def test_dataframe_agg_multifunc_level(self): # level= is ignored for multiple agg fns self._run_test( diff --git a/sdks/python/apache_beam/dataframe/pandas_doctests_test.py b/sdks/python/apache_beam/dataframe/pandas_doctests_test.py index 56eddd3cfb92..4fb05780fbec 100644 --- a/sdks/python/apache_beam/dataframe/pandas_doctests_test.py +++ b/sdks/python/apache_beam/dataframe/pandas_doctests_test.py @@ -164,6 +164,9 @@ def test_ndframe_tests(self): ' key=lambda x: np.argsort(index_natsorted(df["time"]))\n' ')' ], + # TODO(https://github.com/apache/beam/issues/28559): Re-enable when + # bug is fixed. + 'pandas.core.generic.NDFrame.xs': ['*'], **skip_writes }) self.assertEqual(result.failed, 0) @@ -296,13 +299,19 @@ def test_dataframe_tests(self): 'pandas.core.frame.DataFrame.value_counts': [ 'df.value_counts(dropna=False)' ], + + 'pandas.core.frame.DataFrame.to_timestamp': ['*'] }, skip={ - # DataFrame construction from a dictionary and - # Series requires using the len() function, which - # is a non-deferred operation that we do not allow + # DataFrame construction from a dictionary, Series, or other + # DataFrame requires using the len() function, which is a + # non-deferred operation that we do not allow 'pandas.core.frame.DataFrame': [ 'pd.DataFrame(data=d, index=[0, 1, 2, 3])', + 'df = pd.DataFrame(data=ser, index=["a", "c"])', + 'df', + 'df2 = pd.DataFrame(data=df1, index=["a", "c"])', + 'df2', ], # s2 created with reindex 'pandas.core.frame.DataFrame.dot': [ @@ -361,15 +370,17 @@ def test_dataframe_tests(self): # actually raise NotImplementedError 'pandas.core.frame.DataFrame.pivot_table': ['*'], # Expected to raise a ValueError, but we raise NotImplementedError + # pylint: disable=line-too-long 'pandas.core.frame.DataFrame.pivot': [ "df.pivot(index='foo', columns='bar', values='baz')", "df.pivot(index='foo', columns='bar')['baz']", "df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])", - # pylint: disable=line-too-long 'df.pivot(index="lev1", columns=["lev2", "lev3"],values="values")', - # pylint: disable=line-too-long - 'df.pivot(index=["lev1", "lev2"], columns=["lev3"],values="values")' + 'df.pivot(index=["lev1", "lev2"], columns=["lev3"],values="values")', + 'df.pivot(index="lev1", columns=["lev2", "lev3"], values="values")', + 'df.pivot(index=["lev1", "lev2"], columns=["lev3"], values="values")', ], + # pylint: enable=line-too-long 'pandas.core.frame.DataFrame.append': [ 'df', # pylint: disable=line-too-long @@ -511,6 +522,8 @@ def test_series_tests(self): 'ser.groupby(["a", "b", "a", np.nan]).mean()', 'ser.groupby(["a", "b", "a", np.nan], dropna=False).mean()', ], + 'pandas.core.series.Series.to_period': ['*'], + 'pandas.core.series.Series.to_timestamp': ['*'], }, skip={ # Relies on setting values with iloc @@ -535,6 +548,8 @@ def test_series_tests(self): 'pandas.core.series.Series.idxmin': ['s.idxmin()'], 'pandas.core.series.Series.idxmax': ['s.idxmax()'], 'pandas.core.series.Series.duplicated': ['*'], + # Relies on setting index. + 'pandas.core.series.Series.rename_axis': ['*'], 'pandas.core.series.Series.set_axis': ['*'], 'pandas.core.series.Series.nonzero': ['*'], 'pandas.core.series.Series.pop': ['ser'], # testing side effect @@ -710,6 +725,7 @@ def test_groupby_tests(self): 'pandas.core.groupby.groupby.GroupBy.nth': ['*'], 'pandas.core.groupby.groupby.GroupBy.cumcount': ['*'], 'pandas.core.groupby.groupby.GroupBy.resample': ['*'], + 'pandas.core.groupby.groupby.GroupBy.rolling': ['*'], }, not_implemented_ok={ 'pandas.core.groupby.groupby.GroupBy.first': ['*'], @@ -764,16 +780,21 @@ def test_groupby_tests(self): 'df.fillna(method=\'ffill\')', 'df.fillna(method="ffill")', 'df.fillna(value=values, limit=1)', + 'df.groupby("key").fillna(method="ffill")', + 'df.groupby("key").fillna(method="bfill")', + 'df.groupby("key").fillna(method="ffill", limit=1)', ], 'pandas.core.groupby.generic.SeriesGroupBy.fillna': [ 'df.fillna(method=\'ffill\')', 'df.fillna(method="ffill")', 'df.fillna(value=values, limit=1)', ], + 'pandas.core.groupby.groupby.GroupBy.tail': ['*'], }, not_implemented_ok={ 'pandas.core.groupby.generic.DataFrameGroupBy.idxmax': ['*'], 'pandas.core.groupby.generic.DataFrameGroupBy.idxmin': ['*'], + 'pandas.core.groupby.generic.DataFrameGroupBy.transform': ['*'], 'pandas.core.groupby.generic.SeriesGroupBy.transform': ['*'], 'pandas.core.groupby.generic.SeriesGroupBy.idxmax': ['*'], 'pandas.core.groupby.generic.SeriesGroupBy.idxmin': ['*'], @@ -794,14 +815,6 @@ def test_groupby_tests(self): # These examples rely on grouping by a list 'pandas.core.groupby.generic.SeriesGroupBy.aggregate': ['*'], 'pandas.core.groupby.generic.DataFrameGroupBy.aggregate': ['*'], - 'pandas.core.groupby.generic.SeriesGroupBy.transform': [ - # Dropping invalid columns during a transform is unsupported. - 'grouped.transform(lambda x: (x - x.mean()) / x.std())' - ], - 'pandas.core.groupby.generic.DataFrameGroupBy.transform': [ - # Dropping invalid columns during a transform is unsupported. - 'grouped.transform(lambda x: (x - x.mean()) / x.std())' - ], # Skipped idxmax/idxmin due an issue with the test framework 'pandas.core.groupby.generic.SeriesGroupBy.idxmin': ['s.idxmin()'], 'pandas.core.groupby.generic.SeriesGroupBy.idxmax': ['s.idxmax()'], @@ -811,7 +824,24 @@ def test_groupby_tests(self): # pylint: disable=line-too-long "df.groupby('gender', as_index=False).value_counts(normalize=True)", ], - }) + # These examples rely on grouping by a list + 'pandas.core.groupby.generic.SeriesGroupBy.fillna': ['*'], + # These examples rely on grouping by a list + 'pandas.core.groupby.generic.DataFrameGroupBy.fillna': ['*'], + # These examples rely on grouping by a list + 'pandas.core.groupby.generic.SeriesGroupBy.take': ['*'], + # These examples rely on grouping by a list + 'pandas.core.groupby.generic.DataFrameGroupBy.take': ['*'], + # Named aggregation not supported yet. + 'pandas.core.groupby.generic.NamedAgg': [ + 'df.groupby("key").agg(result_a=agg_a, result_1=agg_1)' + ], + # These examples rely on grouping by a list + 'pandas.core.groupby.generic.DataFrameGroupBy.transform': ['*'], + # These examples rely on grouping by a list + 'pandas.core.groupby.generic.SeriesGroupBy.transform': ['*'], + }, + ) self.assertEqual(result.failed, 0) def test_top_level(self): @@ -843,7 +873,6 @@ def test_top_level(self): 'pivot_table': ['*'], 'qcut': ['*'], 'reset_option': ['*'], - 'set_eng_float_format': ['*'], 'set_option': ['*'], 'to_numeric': ['*'], 'to_timedelta': ['*'], diff --git a/sdks/python/apache_beam/dataframe/pandas_top_level_functions.py b/sdks/python/apache_beam/dataframe/pandas_top_level_functions.py index 39df3f25a2e8..ce36dbeb09ad 100644 --- a/sdks/python/apache_beam/dataframe/pandas_top_level_functions.py +++ b/sdks/python/apache_beam/dataframe/pandas_top_level_functions.py @@ -162,6 +162,7 @@ def concat( period_range = _defer_to_pandas('period_range') pivot = _call_on_first_arg('pivot') pivot_table = _call_on_first_arg('pivot_table') + set_eng_float_format = _defer_to_pandas('set_eng_float_format') show_versions = _defer_to_pandas('show_versions') test = frame_base.wont_implement_method( pd, diff --git a/sdks/python/apache_beam/runners/interactive/interactive_runner_test.py b/sdks/python/apache_beam/runners/interactive/interactive_runner_test.py index 5ffa6224edb0..1da20fb2dfa9 100644 --- a/sdks/python/apache_beam/runners/interactive/interactive_runner_test.py +++ b/sdks/python/apache_beam/runners/interactive/interactive_runner_test.py @@ -321,7 +321,7 @@ def test_dataframes_with_grouped_index(self): Record('c', 18, 150) ] - aggregate = lambda df: df.groupby('height').mean() + aggregate = lambda df: df.groupby('height').mean(numeric_only=True) deferred_df = aggregate(to_dataframe(p | beam.Create(data))) df_expected = aggregate(pd.DataFrame(data))