diff --git a/CHANGELOG.md b/CHANGELOG.md index 4db45f3a..0ac5b4c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed +- Fixed size mismatch `RuntimeError` in `transforms.CatToNumTransform` ([#446](https://github.com/pyg-team/pytorch-frame/pull/446)) - Removed CUDA synchronizations from `nn.LinearEmbeddingEncoder` ([#432](https://github.com/pyg-team/pytorch-frame/pull/432)) - Removed CUDA synchronizations from N/A imputation logic in `nn.StypeEncoder` ([#433](https://github.com/pyg-team/pytorch-frame/pull/433), [#434](https://github.com/pyg-team/pytorch-frame/pull/434)) diff --git a/test/transforms/test_cat_to_num_transform.py b/test/transforms/test_cat_to_num_transform.py index 9c53399c..e37e6f12 100644 --- a/test/transforms/test_cat_to_num_transform.py +++ b/test/transforms/test_cat_to_num_transform.py @@ -70,10 +70,16 @@ def test_cat_to_num_transform_on_categorical_only_dataset(with_nan): # Raise informative error when input tensor frame contains new category out = transform(tensor_frame) + # ensure different max value of y at test time works + tensor_frame.feat_dict[stype.categorical] = torch.zeros_like( + tensor_frame.feat_dict[stype.categorical]) + transform(tensor_frame) + @pytest.mark.parametrize('task_type', [ - TaskType.MULTICLASS_CLASSIFICATION, TaskType.REGRESSION, - TaskType.BINARY_CLASSIFICATION + TaskType.MULTICLASS_CLASSIFICATION, + TaskType.REGRESSION, + TaskType.BINARY_CLASSIFICATION, ]) def test_cat_to_num_transform_with_loading(task_type): num_rows = 10 diff --git a/torch_frame/transforms/cat_to_num_transform.py b/torch_frame/transforms/cat_to_num_transform.py index e1b4f008..09b8aa14 100644 --- a/torch_frame/transforms/cat_to_num_transform.py +++ b/torch_frame/transforms/cat_to_num_transform.py @@ -14,21 +14,27 @@ class CatToNumTransform(FittableBaseTransform): - r"""A transform that encodes the categorical features of - the :class:`TensorFrame` 
object using target statistics. - The original transform is explained in - https://dl.acm.org/doi/10.1145/507533.507538 - Specifically, each categorical feature is transformed - into numerical feature using m-probability estimate, - defined by (n_c + p * m)/ (n + m), where n_c is the - total count of the category, n is the total count, - p is the prior probability and m is a smoothing factor. + r"""Transforms categorical features in :class:`TensorFrame` using target + statistics. The original transform is explained in + `A preprocessing scheme for high-cardinality categorical attributes in + classification and prediction problems + <https://dl.acm.org/doi/10.1145/507533.507538>`_ paper. + + Specifically, each categorical feature is transformed into numerical + feature using m-probability estimate, defined by + + .. math:: + \frac{n_c + p \cdot m}{n + m} + + where :math:`n_c` is the count of the category, :math:`n` is the total + count, :math:`p` is the prior probability and :math:`m` is a smoothing + factor. """ def _fit( self, tf_train: TensorFrame, col_stats: dict[str, dict[StatType, Any]], - ): + ) -> None: if tf_train.y is None: raise RuntimeError( "'{self.__class__.__name__}' cannot be used when target column" @@ -39,6 +45,7 @@ def _fit( "columns. No fitting will be performed.") self._transformed_stats = col_stats return + tensor = self._replace_nans(tf_train.feat_dict[stype.categorical], NAStrategy.MOST_FREQUENT) self.col_stats = col_stats @@ -50,16 +57,16 @@ def _fit( # the number of columns to (num_target_classes - 1). 
More details can # be found in https://dl.acm.org/doi/10.1145/507533.507538 if not torch.is_floating_point(tf_train.y) and tf_train.y.max() > 1: - num_classes = tf_train.y.max() + 1 - target = F.one_hot(tf_train.y, num_classes)[:, :-1] + self.num_classes = tf_train.y.max() + 1 + target = F.one_hot(tf_train.y, self.num_classes)[:, :-1] self.target_mean = target.float().mean(dim=0) - shape = tf_train.feat_dict[stype.categorical].shape - transformed_tensor = torch.zeros(shape[0], - shape[1] * (num_classes - 1), + num_rows, num_cols = tf_train.feat_dict[stype.categorical].shape + transformed_tensor = torch.zeros(num_rows, + num_cols * (self.num_classes - 1), dtype=torch.float32, device=tf_train.device) else: - num_classes = 2 + self.num_classes = 2 target = tf_train.y.unsqueeze(1) mask = ~torch.isnan(target) if (~mask).any(): @@ -76,11 +83,12 @@ def _fit( device=tf_train.device) feat = tensor[:, i] v = torch.index_select(count, 0, feat).unsqueeze(1).repeat( - 1, num_classes - 1) - transformed_tensor[:, i * (num_classes - 1):(i + 1) * - (num_classes - 1)] = ((v + self.target_mean) / - (self.data_size + 1)) - columns += [col_name + f"_{i}" for i in range(num_classes - 1)] + 1, self.num_classes - 1) + start = i * (self.num_classes - 1) + end = (i + 1) * (self.num_classes - 1) + transformed_tensor[:, start:end] = ((v + self.target_mean) / + (self.data_size + 1)) + columns += [f"{col_name}_{i}" for i in range(self.num_classes - 1)] self.new_columns = columns transformed_df = pd.DataFrame(transformed_tensor.cpu().numpy(), @@ -104,34 +112,41 @@ def _forward(self, tf: TensorFrame) -> TensorFrame: "The input TensorFrame does not contain any categorical " "columns. 
The original TensorFrame will be returned.") return tf - tensor = self._replace_nans(tf.feat_dict[stype.categorical], - NAStrategy.MOST_FREQUENT) + tensor = self._replace_nans( + tf.feat_dict[stype.categorical], + NAStrategy.MOST_FREQUENT, + ) if not torch.is_floating_point(tf.y) and tf.y.max() > 1: - num_classes = tf.y.max() + 1 - shape = tf.feat_dict[stype.categorical].shape - transformed_tensor = torch.zeros(shape[0], - shape[1] * (num_classes - 1), - dtype=torch.float32, - device=tf.device) + num_rows, num_cols = tf.feat_dict[stype.categorical].shape + transformed_tensor = torch.zeros( + num_rows, + num_cols * (self.num_classes - 1), + dtype=torch.float32, + device=tf.device, + ) else: - num_classes = 2 transformed_tensor = torch.zeros_like( - tf.feat_dict[stype.categorical], dtype=torch.float32) + tf.feat_dict[stype.categorical], + dtype=torch.float32, + ) target_mean = self.target_mean.to(tf.device) for i in range(len(tf.col_names_dict[stype.categorical])): col_name = tf.col_names_dict[stype.categorical][i] - count = torch.tensor(self.col_stats[col_name][StatType.COUNT][1], - device=tf.device) + count = torch.tensor( + self.col_stats[col_name][StatType.COUNT][1], + device=tf.device, + ) feat = tensor[:, i] max_cat = feat.max() if max_cat >= len(count): raise RuntimeError( - f"{col_name} contains new category {max_cat} not seen " + f"'{col_name}' contains new category '{max_cat}' not seen " f"during fit stage.") - v = count[feat].unsqueeze(1).repeat(1, num_classes - 1) - transformed_tensor[:, i * (num_classes - 1):(i + 1) * - (num_classes - 1)] = ((v + target_mean) / - (self.data_size + 1)) + v = count[feat].unsqueeze(1).repeat(1, self.num_classes - 1) + start = i * (self.num_classes - 1) + end = (i + 1) * (self.num_classes - 1) + transformed_tensor[:, start:end] = ((v + target_mean) / + (self.data_size + 1)) # turn the categorical features into numerical features if stype.numerical in tf.feat_dict: