Skip to content

Commit

Permalink
Add function and unit tests to ensure that the altitude, latitude and…
Browse files Browse the repository at this point in the history
… longitude columns have the same, consistent value for a given site. This avoids problems when these columns are used to create a cube, because mismatching values would otherwise prevent cubes from being merged successfully. (#1951)
  • Loading branch information
gavinevans authored Oct 5, 2023
1 parent 64b8eab commit ac901c4
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 0 deletions.
40 changes: 40 additions & 0 deletions improver/calibration/dataframe_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,41 @@ def _fill_missing_entries(df, combi_cols, static_cols, site_id_col):
return df


def _ensure_consistent_static_cols(
forecast_df: DataFrame, static_cols: List[str], site_id_col: str
) -> DataFrame:
"""Ensure that the columns expected to have the same value for a given site,
actually have the same values. These "static" columns could change if,
for example, the altitude of a site is corrected.
Args:
forecast_df: Forecast DataFrame.
static_cols: List of columns that are expected to be "static".
site_id_col: The name of the column containing the site ID.
Returns:
Forecast DataFrame with the same value for a given site for the static columns
provided.
"""
# Check if any of the assumed static columns are actually not static when
# the DataFrame is grouped by the site_id_col.
if (forecast_df.groupby(site_id_col)[static_cols].nunique().nunique() > 1).any():

for static_col in static_cols:
# For each static column, find the last value from the list of unique
# values for each site. The last value corresponds to the most recent value
# present when using pd.unique.
temp_df = forecast_df.groupby(site_id_col)[static_col].apply(
lambda x: pd.unique(x)[-1]
)
# Drop the static column and then merge. The merge will recreate the static
# column using a constant value for each site.
forecast_df = forecast_df.drop(columns=static_col)
forecast_df = forecast_df.merge(temp_df, on=site_id_col)

return forecast_df


def _define_time_coord(
adate: pd.Timestamp, time_bounds: Optional[Sequence[pd.Timestamp]] = None,
) -> DimCoord:
Expand Down Expand Up @@ -513,10 +548,15 @@ def _prepare_dataframes(
# Add station_id as a static column, if it is only present in the
# forecast DataFrame.
static_cols.append("station_id")

forecast_df = _fill_missing_entries(
forecast_df, combi_cols, static_cols, site_id_col
)

forecast_df = _ensure_consistent_static_cols(
forecast_df, ["altitude", "latitude", "longitude"], site_id_col
)

combi_cols = [site_id_col, "time"]
static_cols = ["latitude", "longitude", "altitude", "diagnostic"]
if include_station_id:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -759,6 +759,61 @@ def test_no_forecasts_for_a_time(self):
self.assertCubeEqual(result[0], self.expected_period_forecast[:, 1:])
self.assertCubeEqual(result[1], self.expected_period_truth[1:])

def test_moving_forecast_site(self):
    """Check handling of a forecast site whose location changes over time.

    The altitude, latitude and longitude of site "03002" are altered at
    self.time3 within the forecasts (as might happen after a site update).
    The altered values are expected to be applied to that site in both the
    forecast and truth cubes."""
    moved_location = {"altitude": 5, "latitude": 40, "longitude": -15}

    forecast_df = self.forecast_df
    is_moved_row = (forecast_df["wmo_id"] == "03002") & (
        forecast_df["time"] == self.time3
    )
    for column, value in moved_location.items():
        forecast_df.loc[is_moved_row, column] = value

    # Build the expected cubes with the first site relocated.
    expected_cubes = (
        self.expected_period_forecast.copy(),
        self.expected_period_truth.copy(),
    )
    for cube in expected_cubes:
        for coord_name, value in moved_location.items():
            cube.coord(coord_name).points[0] = value

    result = forecast_and_truth_dataframes_to_cubes(
        forecast_df,
        self.truth_subset_df,
        self.cycletime,
        self.forecast_period,
        self.training_length,
    )

    self.assertEqual(len(result), 2)
    for actual, expected in zip(result, expected_cubes):
        self.assertCubeEqual(actual, expected)

def test_moving_truth_site(self):
    """Check that a location change within the truths is ignored.

    The altitude, latitude and longitude of site "03002" are altered at
    self.time3 within the truths only. The outputs are expected to match the
    unmodified expected cubes, as only the altitude, latitude and longitude
    from the forecasts are preserved."""
    truth_df = self.truth_subset_df
    is_moved_row = (truth_df["wmo_id"] == "03002") & (
        truth_df["time"] == self.time3
    )
    for column, value in (("altitude", 5), ("latitude", 40), ("longitude", -15)):
        truth_df.loc[is_moved_row, column] = value

    result = forecast_and_truth_dataframes_to_cubes(
        self.forecast_df,
        truth_df,
        self.cycletime,
        self.forecast_period,
        self.training_length,
    )

    self.assertEqual(len(result), 2)
    expected_cubes = (self.expected_period_forecast, self.expected_period_truth)
    for actual, expected in zip(result, expected_cubes):
        self.assertCubeEqual(actual, expected)

def test_new_site_with_only_one_forecast_and_truth(self):
"""Test for a site that has a forecast and truth data point for the most
recent time only. Other sites are present at all forecast and truth times.
Expand Down

0 comments on commit ac901c4

Please sign in to comment.