Skip to content

Commit

Permalink
feat: regularization for decision trees and random forests (#730)
Browse files Browse the repository at this point in the history
Closes #700

### Summary of Changes

Add regularization options for decision trees and random forests:
* maximum depth
* minimum number of samples in leaves
  • Loading branch information
lars-reimann authored May 5, 2024
1 parent 1cc14b1 commit 102de2d
Show file tree
Hide file tree
Showing 8 changed files with 420 additions and 51 deletions.
70 changes: 63 additions & 7 deletions src/safeds/ml/classical/classification/_decision_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import TYPE_CHECKING

from safeds._utils import _structural_hash
from safeds.exceptions import ClosedBound, OutOfBoundsError
from safeds.ml.classical._util_sklearn import fit, predict

from ._classifier import Classifier
Expand All @@ -16,17 +17,66 @@


class DecisionTreeClassifier(Classifier):
"""Decision tree classification."""
"""
Decision tree classification.
Parameters
----------
maximum_depth:
The maximum depth of the tree. If None, the depth is not limited. Has to be greater than 0.
minimum_number_of_samples_in_leaves:
The minimum number of samples that must remain in the leaves of the tree. Has to be greater than 0.
Raises
------
OutOfBoundsError
If `maximum_depth` is less than 1.
OutOfBoundsError
If `minimum_number_of_samples_in_leaves` is less than 1.
"""

def __init__(
self,
*,
maximum_depth: int | None = None,
minimum_number_of_samples_in_leaves: int = 1,
) -> None:
# Validation
if maximum_depth is not None and maximum_depth < 1:
raise OutOfBoundsError(maximum_depth, name="maximum_depth", lower_bound=ClosedBound(1))
if minimum_number_of_samples_in_leaves < 1:
raise OutOfBoundsError(
minimum_number_of_samples_in_leaves,
name="minimum_number_of_samples_in_leaves",
lower_bound=ClosedBound(1),
)

# Hyperparameters
self._maximum_depth: int | None = maximum_depth
self._minimum_number_of_samples_in_leaves: int = minimum_number_of_samples_in_leaves

def __hash__(self) -> int:
return _structural_hash(Classifier.__hash__(self), self._target_name, self._feature_names)

def __init__(self) -> None:
# Internal state
self._wrapped_classifier: sk_DecisionTreeClassifier | None = None
self._feature_names: list[str] | None = None
self._target_name: str | None = None

def __hash__(self) -> int:
    """
    Compute a structural hash of this classifier.

    The hash combines the hash of the `Classifier` base class, the feature and target names stored by `fit`
    (both None on an unfitted instance) and the regularization hyperparameters, so that trees configured with
    a different maximum depth or minimum leaf size do not share a hash.

    Returns
    -------
    hash:
        The structural hash.
    """
    return _structural_hash(
        Classifier.__hash__(self),
        self._feature_names,
        self._target_name,
        # Hyperparameters are fixed at construction, so including them keeps the hash stable per instance.
        self._maximum_depth,
        self._minimum_number_of_samples_in_leaves,
    )

@property
def maximum_depth(self) -> int | None:
    """The maximum depth of the tree. If None, the depth is not limited."""
    return self._maximum_depth

@property
def minimum_number_of_samples_in_leaves(self) -> int:
    """The minimum number of samples that must remain in the leaves of the tree. Always at least 1."""
    return self._minimum_number_of_samples_in_leaves

def fit(self, training_set: TabularDataset) -> DecisionTreeClassifier:
"""
Create a copy of this classifier and fit it with the given training data.
Expand Down Expand Up @@ -59,7 +109,10 @@ def fit(self, training_set: TabularDataset) -> DecisionTreeClassifier:
wrapped_classifier = self._get_sklearn_classifier()
fit(wrapped_classifier, training_set)

result = DecisionTreeClassifier()
result = DecisionTreeClassifier(
maximum_depth=self._maximum_depth,
minimum_number_of_samples_in_leaves=self._minimum_number_of_samples_in_leaves,
)
result._wrapped_classifier = wrapped_classifier
result._feature_names = training_set.features.column_names
result._target_name = training_set.target.name
Expand Down Expand Up @@ -105,4 +158,7 @@ def is_fitted(self) -> bool:
def _get_sklearn_classifier(self) -> ClassifierMixin:
from sklearn.tree import DecisionTreeClassifier as sk_DecisionTreeClassifier

return sk_DecisionTreeClassifier()
return sk_DecisionTreeClassifier(
max_depth=self._maximum_depth,
min_samples_leaf=self._minimum_number_of_samples_in_leaves,
)
81 changes: 60 additions & 21 deletions src/safeds/ml/classical/classification/_random_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,52 +17,82 @@


class RandomForestClassifier(Classifier):
"""Random forest classification.
"""
Random forest classification.
Parameters
----------
number_of_trees:
The number of trees to be used in the random forest. Has to be greater than 0.
maximum_depth:
The maximum depth of each tree. If None, the depth is not limited. Has to be greater than 0.
minimum_number_of_samples_in_leaves:
The minimum number of samples that must remain in the leaves of each tree. Has to be greater than 0.
Raises
------
OutOfBoundsError
If `number_of_trees` is less than 1.
OutOfBoundsError
If `maximum_depth` is less than 1.
OutOfBoundsError
If `minimum_number_of_samples_in_leaves` is less than 1.
"""

def __hash__(self) -> int:
return _structural_hash(
Classifier.__hash__(self),
self._target_name,
self._feature_names,
self._number_of_trees,
)

def __init__(self, *, number_of_trees: int = 100) -> None:
def __init__(
self,
*,
number_of_trees: int = 100,
maximum_depth: int | None = None,
minimum_number_of_samples_in_leaves: int = 1,
) -> None:
# Validation
if number_of_trees < 1:
raise OutOfBoundsError(number_of_trees, name="number_of_trees", lower_bound=ClosedBound(1))
if maximum_depth is not None and maximum_depth < 1:
raise OutOfBoundsError(maximum_depth, name="maximum_depth", lower_bound=ClosedBound(1))
if minimum_number_of_samples_in_leaves < 1:
raise OutOfBoundsError(
minimum_number_of_samples_in_leaves,
name="minimum_number_of_samples_in_leaves",
lower_bound=ClosedBound(1),
)

# Hyperparameters
self._number_of_trees = number_of_trees
self._number_of_trees: int = number_of_trees
self._maximum_depth: int | None = maximum_depth
self._minimum_number_of_samples_in_leaves: int = minimum_number_of_samples_in_leaves

# Internal state
self._wrapped_classifier: sk_RandomForestClassifier | None = None
self._feature_names: list[str] | None = None
self._target_name: str | None = None

def __hash__(self) -> int:
    """
    Compute a structural hash of this classifier.

    The hash is built from the hash of the `Classifier` base class, the feature and target names stored by
    `fit` (both None on an unfitted instance) and the three hyperparameters of the forest.

    Returns
    -------
    hash:
        The structural hash.
    """
    fitted_state = (self._feature_names, self._target_name)
    hyperparameters = (
        self._number_of_trees,
        self._maximum_depth,
        self._minimum_number_of_samples_in_leaves,
    )
    return _structural_hash(Classifier.__hash__(self), *fitted_state, *hyperparameters)

@property
def number_of_trees(self) -> int:
"""
Get the number of trees used in the random forest.
Returns
-------
result:
The number of trees.
"""
"""The number of trees used in the random forest."""
return self._number_of_trees

@property
def maximum_depth(self) -> int | None:
    """The maximum depth of each tree. If None, the depth is not limited."""
    return self._maximum_depth

@property
def minimum_number_of_samples_in_leaves(self) -> int:
    """The minimum number of samples that must remain in the leaves of each tree. Always at least 1."""
    return self._minimum_number_of_samples_in_leaves

def fit(self, training_set: TabularDataset) -> RandomForestClassifier:
"""
Create a copy of this classifier and fit it with the given training data.
Expand Down Expand Up @@ -95,7 +125,11 @@ def fit(self, training_set: TabularDataset) -> RandomForestClassifier:
wrapped_classifier = self._get_sklearn_classifier()
fit(wrapped_classifier, training_set)

result = RandomForestClassifier(number_of_trees=self._number_of_trees)
result = RandomForestClassifier(
number_of_trees=self._number_of_trees,
maximum_depth=self._maximum_depth,
minimum_number_of_samples_in_leaves=self._minimum_number_of_samples_in_leaves,
)
result._wrapped_classifier = wrapped_classifier
result._feature_names = training_set.features.column_names
result._target_name = training_set.target.name
Expand Down Expand Up @@ -149,4 +183,9 @@ def _get_sklearn_classifier(self) -> ClassifierMixin:
"""
from sklearn.ensemble import RandomForestClassifier as sk_RandomForestClassifier

return sk_RandomForestClassifier(self._number_of_trees, n_jobs=-1)
return sk_RandomForestClassifier(
n_estimators=self._number_of_trees,
max_depth=self._maximum_depth,
min_samples_leaf=self._minimum_number_of_samples_in_leaves,
n_jobs=-1,
)
70 changes: 63 additions & 7 deletions src/safeds/ml/classical/regression/_decision_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import TYPE_CHECKING

from safeds._utils import _structural_hash
from safeds.exceptions import ClosedBound, OutOfBoundsError
from safeds.ml.classical._util_sklearn import fit, predict

from ._regressor import Regressor
Expand All @@ -16,17 +17,66 @@


class DecisionTreeRegressor(Regressor):
"""Decision tree regression."""
"""
Decision tree regression.
Parameters
----------
maximum_depth:
The maximum depth of the tree. If None, the depth is not limited. Has to be greater than 0.
minimum_number_of_samples_in_leaves:
The minimum number of samples that must remain in the leaves of the tree. Has to be greater than 0.
Raises
------
OutOfBoundsError
If `maximum_depth` is less than 1.
OutOfBoundsError
If `minimum_number_of_samples_in_leaves` is less than 1.
"""

def __init__(
self,
*,
maximum_depth: int | None = None,
minimum_number_of_samples_in_leaves: int = 5,
) -> None:
# Validation
if maximum_depth is not None and maximum_depth < 1:
raise OutOfBoundsError(maximum_depth, name="maximum_depth", lower_bound=ClosedBound(1))
if minimum_number_of_samples_in_leaves < 1:
raise OutOfBoundsError(
minimum_number_of_samples_in_leaves,
name="minimum_number_of_samples_in_leaves",
lower_bound=ClosedBound(1),
)

# Hyperparameters
self._maximum_depth: int | None = maximum_depth
self._minimum_number_of_samples_in_leaves: int = minimum_number_of_samples_in_leaves

def __hash__(self) -> int:
return _structural_hash(Regressor.__hash__(self), self._target_name, self._feature_names)

def __init__(self) -> None:
# Internal state
self._wrapped_regressor: sk_DecisionTreeRegressor | None = None
self._feature_names: list[str] | None = None
self._target_name: str | None = None

def __hash__(self) -> int:
    """
    Compute a structural hash of this regressor.

    The hash combines the hash of the `Regressor` base class, the feature and target names stored by `fit`
    (both None on an unfitted instance) and the regularization hyperparameters, so that trees configured with
    a different maximum depth or minimum leaf size do not share a hash.

    Returns
    -------
    hash:
        The structural hash.
    """
    return _structural_hash(
        Regressor.__hash__(self),
        self._feature_names,
        self._target_name,
        # Hyperparameters are fixed at construction, so including them keeps the hash stable per instance.
        self._maximum_depth,
        self._minimum_number_of_samples_in_leaves,
    )

@property
def maximum_depth(self) -> int | None:
    """The maximum depth of the tree. If None, the depth is not limited."""
    return self._maximum_depth

@property
def minimum_number_of_samples_in_leaves(self) -> int:
    """The minimum number of samples that must remain in the leaves of the tree. Always at least 1."""
    return self._minimum_number_of_samples_in_leaves

def fit(self, training_set: TabularDataset) -> DecisionTreeRegressor:
"""
Create a copy of this regressor and fit it with the given training data.
Expand Down Expand Up @@ -59,7 +109,10 @@ def fit(self, training_set: TabularDataset) -> DecisionTreeRegressor:
wrapped_regressor = self._get_sklearn_regressor()
fit(wrapped_regressor, training_set)

result = DecisionTreeRegressor()
result = DecisionTreeRegressor(
maximum_depth=self._maximum_depth,
minimum_number_of_samples_in_leaves=self._minimum_number_of_samples_in_leaves,
)
result._wrapped_regressor = wrapped_regressor
result._feature_names = training_set.features.column_names
result._target_name = training_set.target.name
Expand Down Expand Up @@ -113,4 +166,7 @@ def _get_sklearn_regressor(self) -> RegressorMixin:
"""
from sklearn.tree import DecisionTreeRegressor as sk_DecisionTreeRegressor

return sk_DecisionTreeRegressor()
return sk_DecisionTreeRegressor(
max_depth=self._maximum_depth,
min_samples_leaf=self._minimum_number_of_samples_in_leaves,
)
Loading

0 comments on commit 102de2d

Please sign in to comment.