Note
Click here to download the full example code
Testing DuBE with different number of classes (3-15)
In this example, we compare duplebalance.DupleBalanceClassifier (DuBE)
with other ensemble-based class-imbalanced learning methods on multi-class
tasks (with the number of classes varying from 3 to 15).
# Echo this example's module docstring when the script is run.
print(__doc__)
# Seed shared by the ensemble classifiers (see ensemble_init_kwargs).
RANDOM_STATE = 42
Preparation
Import necessary packages.
from duplebalance import DupleBalanceClassifier
from duplebalance.base import sort_dict_by_key
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
Train All Ensemble Classifiers
Train all ensemble-based imbalanced learning (IL) classifiers (including DuBE) on multi-class datasets.
from imbalanced_ensemble.ensemble import *
# Constructor arguments shared by every ensemble classifier in the comparison.
ensemble_init_kwargs = {
    'base_estimator': DecisionTreeClassifier(),
    'n_estimators': 10,
    'random_state': RANDOM_STATE,
}

# Extra keyword arguments passed to fit() for the DuBE classifier only.
dube_fit_kwargs = {
    'resampling_target': 'hybrid',
    'resampling_strategy': 'shem',
    'perturb_alpha': .5,
}

# Keyword arguments forwarded to roc_auc_score when scoring each classifier.
eval_kwargs = {'average': 'macro', 'multi_class': 'ovo'}

# Display name -> classifier class. Insertion order is the order in which the
# methods are trained and reported. Each name must appear exactly once: a
# repeated key in a dict literal silently overwrites the earlier entry.
ensemble_clfs = {
    'DuBE': DupleBalanceClassifier,
    'RusBoost': RUSBoostClassifier,
    'OverBoost': OverBoostClassifier,
    'SmoteBoost': SMOTEBoostClassifier,
    'UnderBagging': UnderBaggingClassifier,
    'OverBagging': OverBaggingClassifier,
    'SmoteBagging': SMOTEBaggingClassifier,
    'Cascade': BalanceCascadeClassifier,
    'SelfPacedEns': SelfPacedEnsembleClassifier,
}
# One [method name, AUROC score, number of classes] row is appended per
# (classifier, class count) combination.
all_results = []

for n_class in range(3, 16):

    # Assign long-tail class weights: class i gets weight 0.8**i, and the
    # weights are then normalized to sum to 1.
    weights = np.array([np.power(.8, i) for i in range(n_class)])
    weights /= weights.sum()

    # Report every class weight relative to the smallest one.
    min_weight = weights.min()
    ratios = '/'.join(
        '{:.2f}'.format(weight / min_weight) for weight in weights
    )
    print("#Classes: {}\nImbalance Ratio: {}".format(n_class, ratios))

    # Generate a synthetic multi-class imbalanced dataset. The generation
    # seed is kept at its literal value so the dataset itself is unchanged.
    X, y = make_classification(
        n_classes=n_class, class_sep=1, weights=weights,
        n_informative=4, n_redundant=1, flip_y=0, n_features=20,
        n_clusters_per_class=1, n_samples=5000, random_state=0,
    )

    # Hold out half of the samples for evaluation, seeded with the shared
    # RANDOM_STATE constant instead of a repeated magic number.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=RANDOM_STATE,
    )

    # Train and score every ensemble classifier on the same split.
    for ens_name, clf_class in ensemble_clfs.items():
        clf = clf_class(**ensemble_init_kwargs)

        # Only DuBE receives the extra fit-time keyword arguments.
        fit_kwargs = dube_fit_kwargs if ens_name == 'DuBE' else {}
        clf.fit(X_train, y_train, **fit_kwargs)

        y_pred_proba = clf.predict_proba(X_test)
        score = roc_auc_score(y_test, y_pred_proba, **eval_kwargs)
        all_results.append([ens_name, score, n_class])
        print("{:<15s} | Balanced AUROC: {:.3f}".format(ens_name, score))
Out:
#Classes: 3
Imbalance Ratio: 1.56/1.25/1.00
DuBE | Balanced AUROC: 0.998
RusBoost | Balanced AUROC: 0.989
OverBoost | Balanced AUROC: 0.976
SmoteBoost | Balanced AUROC: 0.973
UnderBagging | Balanced AUROC: 0.996
OverBagging | Balanced AUROC: 0.995
SmoteBagging | Balanced AUROC: 0.995
Cascade | Balanced AUROC: 0.991
SelfPacedEns | Balanced AUROC: 0.992
#Classes: 4
Imbalance Ratio: 1.95/1.56/1.25/1.00
DuBE | Balanced AUROC: 0.984
RusBoost | Balanced AUROC: 0.898
OverBoost | Balanced AUROC: 0.894
SmoteBoost | Balanced AUROC: 0.896
UnderBagging | Balanced AUROC: 0.966
OverBagging | Balanced AUROC: 0.966
SmoteBagging | Balanced AUROC: 0.969
Cascade | Balanced AUROC: 0.964
SelfPacedEns | Balanced AUROC: 0.970
#Classes: 5
Imbalance Ratio: 2.44/1.95/1.56/1.25/1.00
DuBE | Balanced AUROC: 0.990
RusBoost | Balanced AUROC: 0.927
OverBoost | Balanced AUROC: 0.925
SmoteBoost | Balanced AUROC: 0.924
UnderBagging | Balanced AUROC: 0.974
OverBagging | Balanced AUROC: 0.978
SmoteBagging | Balanced AUROC: 0.978
Cascade | Balanced AUROC: 0.977
SelfPacedEns | Balanced AUROC: 0.981
#Classes: 6
Imbalance Ratio: 3.05/2.44/1.95/1.56/1.25/1.00
DuBE | Balanced AUROC: 0.982
RusBoost | Balanced AUROC: 0.890
OverBoost | Balanced AUROC: 0.877
SmoteBoost | Balanced AUROC: 0.863
UnderBagging | Balanced AUROC: 0.964
OverBagging | Balanced AUROC: 0.963
SmoteBagging | Balanced AUROC: 0.969
Cascade | Balanced AUROC: 0.966
SelfPacedEns | Balanced AUROC: 0.971
#Classes: 7
Imbalance Ratio: 3.81/3.05/2.44/1.95/1.56/1.25/1.00
DuBE | Balanced AUROC: 0.972
RusBoost | Balanced AUROC: 0.901
OverBoost | Balanced AUROC: 0.840
SmoteBoost | Balanced AUROC: 0.855
UnderBagging | Balanced AUROC: 0.953
OverBagging | Balanced AUROC: 0.955
SmoteBagging | Balanced AUROC: 0.954
Cascade | Balanced AUROC: 0.958
SelfPacedEns | Balanced AUROC: 0.954
#Classes: 8
Imbalance Ratio: 4.77/3.81/3.05/2.44/1.95/1.56/1.25/1.00
DuBE | Balanced AUROC: 0.967
RusBoost | Balanced AUROC: 0.863
OverBoost | Balanced AUROC: 0.819
SmoteBoost | Balanced AUROC: 0.841
UnderBagging | Balanced AUROC: 0.947
OverBagging | Balanced AUROC: 0.943
SmoteBagging | Balanced AUROC: 0.954
Cascade | Balanced AUROC: 0.954
SelfPacedEns | Balanced AUROC: 0.952
#Classes: 9
Imbalance Ratio: 5.96/4.77/3.81/3.05/2.44/1.95/1.56/1.25/1.00
DuBE | Balanced AUROC: 0.964
RusBoost | Balanced AUROC: 0.861
OverBoost | Balanced AUROC: 0.829
SmoteBoost | Balanced AUROC: 0.828
UnderBagging | Balanced AUROC: 0.939
OverBagging | Balanced AUROC: 0.939
SmoteBagging | Balanced AUROC: 0.944
Cascade | Balanced AUROC: 0.943
SelfPacedEns | Balanced AUROC: 0.942
#Classes: 10
Imbalance Ratio: 7.45/5.96/4.77/3.81/3.05/2.44/1.95/1.56/1.25/1.00
DuBE | Balanced AUROC: 0.968
RusBoost | Balanced AUROC: 0.856
OverBoost | Balanced AUROC: 0.834
SmoteBoost | Balanced AUROC: 0.830
UnderBagging | Balanced AUROC: 0.941
OverBagging | Balanced AUROC: 0.944
SmoteBagging | Balanced AUROC: 0.955
Cascade | Balanced AUROC: 0.943
SelfPacedEns | Balanced AUROC: 0.949
#Classes: 11
Imbalance Ratio: 9.31/7.45/5.96/4.77/3.81/3.05/2.44/1.95/1.56/1.25/1.00
DuBE | Balanced AUROC: 0.949
RusBoost | Balanced AUROC: 0.830
OverBoost | Balanced AUROC: 0.789
SmoteBoost | Balanced AUROC: 0.795
UnderBagging | Balanced AUROC: 0.916
OverBagging | Balanced AUROC: 0.914
SmoteBagging | Balanced AUROC: 0.925
Cascade | Balanced AUROC: 0.911
SelfPacedEns | Balanced AUROC: 0.915
#Classes: 12
Imbalance Ratio: 11.64/9.31/7.45/5.96/4.77/3.81/3.05/2.44/1.95/1.56/1.25/1.00
DuBE | Balanced AUROC: 0.935
RusBoost | Balanced AUROC: 0.796
OverBoost | Balanced AUROC: 0.770
SmoteBoost | Balanced AUROC: 0.783
UnderBagging | Balanced AUROC: 0.891
OverBagging | Balanced AUROC: 0.907
SmoteBagging | Balanced AUROC: 0.909
Cascade | Balanced AUROC: 0.895
SelfPacedEns | Balanced AUROC: 0.890
#Classes: 13
Imbalance Ratio: 14.55/11.64/9.31/7.45/5.96/4.77/3.81/3.05/2.44/1.95/1.56/1.25/1.00
DuBE | Balanced AUROC: 0.938
RusBoost | Balanced AUROC: 0.790
OverBoost | Balanced AUROC: 0.772
SmoteBoost | Balanced AUROC: 0.785
UnderBagging | Balanced AUROC: 0.895
OverBagging | Balanced AUROC: 0.902
SmoteBagging | Balanced AUROC: 0.924
Cascade | Balanced AUROC: 0.902
SelfPacedEns | Balanced AUROC: 0.903
#Classes: 14
Imbalance Ratio: 18.19/14.55/11.64/9.31/7.45/5.96/4.77/3.81/3.05/2.44/1.95/1.56/1.25/1.00
DuBE | Balanced AUROC: 0.934
RusBoost | Balanced AUROC: 0.768
OverBoost | Balanced AUROC: 0.740
SmoteBoost | Balanced AUROC: 0.765
UnderBagging | Balanced AUROC: 0.889
OverBagging | Balanced AUROC: 0.889
SmoteBagging | Balanced AUROC: 0.915
Cascade | Balanced AUROC: 0.896
SelfPacedEns | Balanced AUROC: 0.889
#Classes: 15
Imbalance Ratio: 22.74/18.19/14.55/11.64/9.31/7.45/5.96/4.77/3.81/3.05/2.44/1.95/1.56/1.25/1.00
DuBE | Balanced AUROC: 0.929
RusBoost | Balanced AUROC: 0.776
OverBoost | Balanced AUROC: 0.739
SmoteBoost | Balanced AUROC: 0.754
UnderBagging | Balanced AUROC: 0.871
OverBagging | Balanced AUROC: 0.867
SmoteBagging | Balanced AUROC: 0.890
Cascade | Balanced AUROC: 0.871
SelfPacedEns | Balanced AUROC: 0.850
Results Visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Select seaborn's 'talk' plotting-context preset for the figure below.
sns.set_context('talk')
# Column names, in the same order as the [name, score, n_class] rows that
# were appended to all_results.
all_results_columns = ['Method', 'AUROC (macro)', '#Classes']
# Tabular view of the collected results, used as the plotting data source.
data_vis = pd.DataFrame(all_results, columns=all_results_columns)
def plot_results_comp(data_vis, x, y, title, figsize=(8,6)):
    """Draw one line per method comparing scores across the values of ``x``.

    Parameters
    ----------
    data_vis : pandas.DataFrame
        Results table. It must contain a ``'Method'`` column (used for both
        line colour and line/marker style) plus the columns named by ``x``
        and ``y``.
    x : str
        Name of the column plotted on the horizontal axis.
    y : str
        Name of the column plotted on the vertical axis. It is also used as
        the y-axis label.
    title : str
        Title placed above the axes.
    figsize : tuple, default=(8, 6)
        Size passed to ``plt.figure`` for the newly created figure.

    Returns
    -------
    ax
        The axes object returned by ``sns.lineplot``, after styling.
    """
    # Open a fresh figure; no ``ax`` is passed to lineplot below, so it
    # draws on the current axes of this figure.
    plt.figure(figsize=figsize)
    ax = sns.lineplot(
        data=data_vis, x=x, y=y, hue='Method', style='Method',
        markers=True, err_style='bars', linewidth=4, markersize=20, alpha=0.9
    )

    # Give the plot a solid black frame.
    for spine in ax.spines.values():
        spine.set_color('black')
        spine.set_linewidth(2)

    ax.grid(color='black', linestyle='-.', alpha=0.3)
    # Label the axis with the plotted column rather than a fixed string so
    # the label stays correct when a different ``y`` is requested.
    ax.set_ylabel(y)
    ax.set_title(title)

    # Compact legend without a title so it covers as little of the plot as
    # possible.
    ax.legend(
        title='',
        borderpad=0.25,
        columnspacing=0.05,
        borderaxespad=0.15,
        handletextpad=0.05,
        labelspacing=0.05,
        handlelength=1.2,
    )
    return ax
# Plot AUROC against the number of classes for every method in data_vis.
plot_results_comp(data_vis, x='#Classes', y='AUROC (macro)',
title='DuBE versus Ensemble Baselines (#Classes 3-15)')
Out:
<AxesSubplot:title={'center':'DuBE versus Ensemble Baselines (#Classes 3-15)'}, xlabel='#Classes', ylabel='AUROC (macro)'>
Total running time of the script: ( 1 minutes 22.885 seconds)
Estimated memory usage: 37 MB