Module pairwiseprediction.optimized
Expand source code
from heapq import heapify, heappop
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, ParameterSampler
from pairwiseprediction.classifier import PairwiseClassifier
from pairwiseprediction.combination import pair_rows
# TODO: tune rejection band during optimization ("class unknown")
class OptimizedPairwiseClassifier(PairwiseClassifier):
    r"""
    Pairwise classifier with a built-in randomized hyperparameter search.

    Pairwise prediction is a specific setting that does not fit well within the
    scikit-learn framework: `RandomizedSearchCV` could be used, but each pair leaks
    information, resulting in a too optimistic nested cross-validation.
    This class therefore runs its own randomized search: each sampled parameter set
    is scored by balanced accuracy over `k` stratified folds, and the final model is
    refit on the whole training sample with the best-scoring parameters.

    :param search_space: dict like `{"param1": [val1, val2, val3], "param2": [val1, val2]}`;
        values may also be distributions, i.e. anything `ParameterSampler` accepts.
    :param n_iter: Number of parameter settings sampled from `search_space`.
    :param k: number of k-folds CV.
    :param seed: Random state for fold shuffling and parameter sampling.
    :param algorithm: Class of algorithm to predict pairs internally.
    :param pairwise: Type of combination: "difference" or "concatenation".
    :param threshold: How much difference between target values should be considered as relevant within a pair?
    :param proportion: Is the threshold an absolute value (difference) or relative value (proportion)?
    :param center: Default value is the mean of the training sample.
    :param only_relevant_pairs_on_prediction: Whether to keep only relevant differences during interpolation.
    :param kwargs: Arguments for user-provided `algorithm`.

    >>> import numpy as np
    >>> from sklearn.datasets import load_diabetes
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> a, b = load_diabetes(return_X_y=True)
    >>> me = np.mean(b)
    >>> # noinspection PyUnresolvedReferences
    >>> y = (b > me).astype(int)
    >>> alg = RandomForestClassifier(n_estimators=3, random_state=0, n_jobs=-1)
    >>> np.mean(cross_val_score(alg, a, y, cv=StratifiedKFold(n_splits=2)))  # doctest:+ELLIPSIS +NORMALIZE_WHITESPACE
    0.6...
    >>> c = b.reshape(len(b), 1)
    >>> X = np.hstack([a, c])
    >>> from scipy.stats import poisson, uniform
    >>> spc = {
    ...     'criterion': ['gini', 'entropy'],
    ...     'max_depth': poisson(mu=5, loc=2),
    ...     'min_impurity_decrease': uniform(0, 0.01),
    ...     'max_leaf_nodes': poisson(mu=20, loc=5),
    ...     'min_samples_split': [20, 30, 40],
    ...     'min_samples_leaf': [10, 20, 30]
    ... }
    >>> alg = OptimizedPairwiseClassifier(spc, 2, n_estimators=3, threshold=20, only_relevant_pairs_on_prediction=False, random_state=0, n_jobs=-1)
    >>> round(np.mean(cross_val_score(alg, X[:50], y[:50], cv=StratifiedKFold(n_splits=2))), 2)  # doctest:+ELLIPSIS
    0.6...
    >>> alg = alg.fit(X[:80])
    >>> alg.predict(X[:2])  # doctest:+ELLIPSIS
    array([...])
    >>> alg.best_score  # doctest:+ELLIPSIS +NORMALIZE_WHITESPACE
    0.6...
    >>> alg.best_params  # doctest:+ELLIPSIS +NORMALIZE_WHITESPACE
    {'criterion': 'gini', 'max_depth': 9, 'max_leaf_nodes': 22, 'min_impurity_decrease': 0.008..., 'min_samples_leaf': 30, 'min_samples_split': 20}
    >>> alg.opt_results  # doctest:+ELLIPSIS +NORMALIZE_WHITESPACE
    [(0.61..., {'criterion': 'gini', ...}), (0.60..., {'criterion': 'entropy', ...})]
    """

    def __init__(
        self,
        search_space,
        n_iter,
        k=5,
        seed=0,
        algorithm=RandomForestClassifier,
        pairwise="concatenation",
        threshold=0,
        proportion=False,
        center=None,
        only_relevant_pairs_on_prediction=False,
        **kwargs
    ):
        super().__init__(algorithm, pairwise, threshold, proportion, center, only_relevant_pairs_on_prediction, **kwargs)
        self.search_space = search_space
        self.n_iter = n_iter
        self.k = k
        self.seed = seed

    def fit(self, X, y=None):
        """
        Run the randomized search and refit on the best parameters found.

        :param X: Last column is the continuous target.
        :param y: Ignored (the binary class is derived internally from the last column of `X`).
        :return: `self`, with `best_score`, `best_params` and `opt_results` attributes set.
        """
        Xw = X if isinstance(X, np.ndarray) else np.array(X)
        if self.center is None:
            # NOTE(review): mutating a constructor parameter during fit is unusual for
            # sklearn-style estimators, but kept for backward compatibility (the clone
            # in `__sklearn_clone__` passes `self.center` along).
            self.center = np.mean(Xw[:, -1])
        w = Xw[:, -1]
        # Binary class used only to stratify the folds.
        # noinspection PyUnresolvedReferences
        y = (w >= self.center).astype(int)
        skf = StratifiedKFold(n_splits=self.k, random_state=self.seed, shuffle=True)
        sampler = ParameterSampler(self.search_space, self.n_iter, random_state=self.seed)
        results = []
        best_score = -1
        for params in sampler:
            ytss, ztss = [], []
            for train_index, test_index in skf.split(Xw, y):
                # Prepare data sets: pair only the test rows so pairs never straddle folds.
                Xwtr = Xw[train_index]
                Xwts = pair_rows(Xw[test_index], reflexive=True)
                # Ground truth per pair: does the first row's target exceed the second's?
                yts = (Xwts[:-1:2, -1] > Xwts[1::2, -1]).astype(int)
                ytss.extend(yts)
                # Train with the sampled arguments and predict the held-out pairs.
                super().fit_(Xwtr, extra_kwargs=params)
                zts = super().predict(Xwts, paired_rows=True)[::2]
                ztss.extend(zts)
            score = balanced_accuracy_score(ytss, ztss)
            results.append((score, params))
            if score > best_score:
                # BUGFIX: `best_score` was never updated here, so every sampled
                # parameter set overwrote `best_params` and the *last* one always won.
                best_score = score
                self.best_score = score
                self.best_params = params.copy()
        self.opt_results = results
        super().fit_(Xw, extra_kwargs=self.best_params)  # `_estimator` will contain the best estimator
        return self

    def __sklearn_clone__(self):
        # Recreate an unfitted copy carrying all constructor arguments.
        return OptimizedPairwiseClassifier(self.search_space, self.n_iter, self.k, self.seed, self.algorithm, self.pairwise, self.threshold, self.proportion, self.center, self.only_relevant_pairs_on_prediction, **self.kwargs)
Classes
class OptimizedPairwiseClassifier (search_space, n_iter, k=5, seed=0, algorithm=sklearn.ensemble._forest.RandomForestClassifier, pairwise='concatenation', threshold=0, proportion=False, center=None, only_relevant_pairs_on_prediction=False, **kwargs)
-
Optimized classifier through cross-validation suitable for pairwise prediction
Pairwise prediction is a specific setting that does not fit well within scikit framework. RandomizedSearchCV can be used, but each pair leaks information, resulting in a too optimistic nested cross-validation.
:param search_space: dict like
{"param1": [val1, val2, val3], "param2": [val1, val2]}
. :param k: number of k-folds CV. :param algorithm: Class of algorithm to predict pairs internally. :param pairwise: Type of combination: "difference" or "concatenation". :param threshold: How much difference between target values should be considered as relevant within a pair? :param proportion: Is the threshold an absolute value (difference) or relative value (proportion)? :param center: Default value is the mean of the training sample. :param only_relevant_pairs_on_prediction: Whether to keep only relevant differences during interpolation. :param kwargs: Arguments for user-provided `algorithm`
.>>> import numpy as np >>> from sklearn.datasets import load_diabetes >>> from sklearn.ensemble import RandomForestClassifier >>> a, b = load_diabetes(return_X_y=True) >>> me = np.mean(b) >>> # noinspection PyUnresolvedReferences >>> y = (b > me).astype(int) >>> alg = RandomForestClassifier(n_estimators=3, random_state=0, n_jobs=-1) >>> np.mean(cross_val_score(alg, a, y, cv=StratifiedKFold(n_splits=2))) # doctest:+ELLIPSIS +NORMALIZE_WHITESPACE 0.6... >>> c = b.reshape(len(b), 1) >>> X = np.hstack([a, c]) >>> from scipy.stats import poisson, uniform >>> spc = { ... 'criterion': ['gini', 'entropy'], ... 'max_depth': poisson(mu=5, loc=2), ... 'min_impurity_decrease': uniform(0, 0.01), ... 'max_leaf_nodes': poisson(mu=20, loc=5), ... 'min_samples_split': [20, 30, 40], ... 'min_samples_leaf': [10, 20, 30] ... } >>> alg = OptimizedPairwiseClassifier(spc, 2, n_estimators=3, threshold=20, only_relevant_pairs_on_prediction=False, random_state=0, n_jobs=-1) >>> round(np.mean(cross_val_score(alg, X[:50], y[:50], cv=StratifiedKFold(n_splits=2))), 2) 0.64 >>> alg = alg.fit(X[:80]) >>> alg.predict(X[:2]) array([1, 0]) >>> alg.best_score # doctest:+ELLIPSIS +NORMALIZE_WHITESPACE 0.6... >>> alg.best_params # doctest:+ELLIPSIS +NORMALIZE_WHITESPACE {'criterion': 'entropy', 'max_depth': 4, 'max_leaf_nodes': 24, 'min_impurity_decrease': 0.001..., 'min_samples_leaf': 10, 'min_samples_split': 30} >>> alg.opt_results # doctest:+ELLIPSIS +NORMALIZE_WHITESPACE [(0.61..., {'criterion': 'gini', 'max_depth': 9, 'max_leaf_nodes': 22, 'min_impurity_decrease': 0.008..., 'min_samples_leaf': 30, 'min_samples_split': 20}), (0.6023275581889547, {'criterion': 'entropy', 'max_depth': 4, 'max_leaf_nodes': 24, 'min_impurity_decrease': 0.0011827442586893322, 'min_samples_leaf': 10, 'min_samples_split': 30})]
Expand source code
class OptimizedPairwiseClassifier(PairwiseClassifier): r""" Optimized classifier through cross-validation suitable for pairwise prediction Pairwise prediction is a specific setting that does not fit well within scikit framework. RandomizedSearchCV can be used, but each pair leaks information, resulting in a too optimistic nested cross-validation. :param search_space: dict like `{"param1": [val1, val2, val3], "param2": [val1, val2]}`. :param k: number of k-folds CV. :param algorithm: Class of algorithm to predict pairs internally. :param pairwise: Type of combination: "difference" or "concatenation". :param threshold: How much difference between target values should be considered as relevant within a pair? :param proportion: Is the threshold an absolute value (difference) or relative value (proportion)? :param center: Default value is the mean of the training sample. :param only_relevant_pairs_on_prediction: Whether to keep only relevant differences during interpolation. :param kwargs: Arguments for user-provided `algorithm`. >>> import numpy as np >>> from sklearn.datasets import load_diabetes >>> from sklearn.ensemble import RandomForestClassifier >>> a, b = load_diabetes(return_X_y=True) >>> me = np.mean(b) >>> # noinspection PyUnresolvedReferences >>> y = (b > me).astype(int) >>> alg = RandomForestClassifier(n_estimators=3, random_state=0, n_jobs=-1) >>> np.mean(cross_val_score(alg, a, y, cv=StratifiedKFold(n_splits=2))) # doctest:+ELLIPSIS +NORMALIZE_WHITESPACE 0.6... >>> c = b.reshape(len(b), 1) >>> X = np.hstack([a, c]) >>> from scipy.stats import poisson, uniform >>> spc = { ... 'criterion': ['gini', 'entropy'], ... 'max_depth': poisson(mu=5, loc=2), ... 'min_impurity_decrease': uniform(0, 0.01), ... 'max_leaf_nodes': poisson(mu=20, loc=5), ... 'min_samples_split': [20, 30, 40], ... 'min_samples_leaf': [10, 20, 30] ... 
} >>> alg = OptimizedPairwiseClassifier(spc, 2, n_estimators=3, threshold=20, only_relevant_pairs_on_prediction=False, random_state=0, n_jobs=-1) >>> round(np.mean(cross_val_score(alg, X[:50], y[:50], cv=StratifiedKFold(n_splits=2))), 2) 0.64 >>> alg = alg.fit(X[:80]) >>> alg.predict(X[:2]) array([1, 0]) >>> alg.best_score # doctest:+ELLIPSIS +NORMALIZE_WHITESPACE 0.6... >>> alg.best_params # doctest:+ELLIPSIS +NORMALIZE_WHITESPACE {'criterion': 'entropy', 'max_depth': 4, 'max_leaf_nodes': 24, 'min_impurity_decrease': 0.001..., 'min_samples_leaf': 10, 'min_samples_split': 30} >>> alg.opt_results # doctest:+ELLIPSIS +NORMALIZE_WHITESPACE [(0.61..., {'criterion': 'gini', 'max_depth': 9, 'max_leaf_nodes': 22, 'min_impurity_decrease': 0.008..., 'min_samples_leaf': 30, 'min_samples_split': 20}), (0.6023275581889547, {'criterion': 'entropy', 'max_depth': 4, 'max_leaf_nodes': 24, 'min_impurity_decrease': 0.0011827442586893322, 'min_samples_leaf': 10, 'min_samples_split': 30})] """ def __init__( self, search_space, n_iter, k=5, seed=0, algorithm=RandomForestClassifier, pairwise="concatenation", threshold=0, proportion=False, center=None, only_relevant_pairs_on_prediction=False, **kwargs ): super().__init__(algorithm, pairwise, threshold, proportion, center, only_relevant_pairs_on_prediction, **kwargs) self.search_space = search_space self.n_iter = n_iter self.k = k self.seed = seed # self.njobs = njobs def fit(self, X, y=None): """ :param X: Last column is the continuous target. :param y: Ignored. 
:return: """ Xw = X if isinstance(X, np.ndarray) else np.array(X) X = y = None if self.center is None: self.center = np.mean(Xw[:, -1]) w = Xw[:, -1] # noinspection PyUnresolvedReferences y = (w >= self.center).astype(int) skf = StratifiedKFold(n_splits=self.k, random_state=self.seed, shuffle=True) sampler = ParameterSampler(self.search_space, self.n_iter, random_state=self.seed) lst = [] best_score = -1 for params in sampler: ytss, ztss = [], [] for train_index, test_index in skf.split(Xw, y): # prepare data sets Xwtr = Xw[train_index] Xwts = pair_rows(Xw[test_index], reflexive=True) yts = (Xwts[:-1:2, -1] > Xwts[1::2, -1]).astype(int) ytss.extend(yts) # train with sampled arguments super().fit_(Xwtr, extra_kwargs=params) zts = super().predict(Xwts, paired_rows=True)[::2] ztss.extend(zts) score = balanced_accuracy_score(ytss, ztss) lst.append((score, params)) if score > best_score: self.best_score = score self.best_params = params.copy() self.opt_results = lst.copy() super().fit_(Xw, extra_kwargs=self.best_params) # `_estimator` will contain the best_estimator return self def __sklearn_clone__(self): return OptimizedPairwiseClassifier(self.search_space, self.n_iter, self.k, self.seed, self.algorithm, self.pairwise, self.threshold, self.proportion, self.center, self.only_relevant_pairs_on_prediction, **self.kwargs)
Ancestors
- PairwiseClassifier
- sklearn.base.BaseEstimator
- sklearn.utils._estimator_html_repr._HTMLDocumentationLinkMixin
- sklearn.utils._metadata_requests._MetadataRequester
- sklearn.base.ClassifierMixin
Methods
def fit(self, X, y=None)
-
:param X: Last column is the continuous target. :param y: Ignored.
:return:
Expand source code
def fit(self, X, y=None): """ :param X: Last column is the continuous target. :param y: Ignored. :return: """ Xw = X if isinstance(X, np.ndarray) else np.array(X) X = y = None if self.center is None: self.center = np.mean(Xw[:, -1]) w = Xw[:, -1] # noinspection PyUnresolvedReferences y = (w >= self.center).astype(int) skf = StratifiedKFold(n_splits=self.k, random_state=self.seed, shuffle=True) sampler = ParameterSampler(self.search_space, self.n_iter, random_state=self.seed) lst = [] best_score = -1 for params in sampler: ytss, ztss = [], [] for train_index, test_index in skf.split(Xw, y): # prepare data sets Xwtr = Xw[train_index] Xwts = pair_rows(Xw[test_index], reflexive=True) yts = (Xwts[:-1:2, -1] > Xwts[1::2, -1]).astype(int) ytss.extend(yts) # train with sampled arguments super().fit_(Xwtr, extra_kwargs=params) zts = super().predict(Xwts, paired_rows=True)[::2] ztss.extend(zts) score = balanced_accuracy_score(ytss, ztss) lst.append((score, params)) if score > best_score: self.best_score = score self.best_params = params.copy() self.opt_results = lst.copy() super().fit_(Xw, extra_kwargs=self.best_params) # `_estimator` will contain the best_estimator return self
Inherited members