Compare DuBE with resampling-based IL methods (5 classes)

In this example, we compare duplebalance.DupleBalanceClassifier with other resampling-based class-imbalanced learning (IL) methods on a 5-class task.

print(__doc__)

RANDOM_STATE = 42

Preparation

First, we import the necessary packages and generate an example multi-class imbalanced dataset.

from duplebalance import DupleBalanceClassifier
from duplebalance.baselines import ResampleClassifier
from duplebalance.base import sort_dict_by_key

import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
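
The sort_dict_by_key helper imported above from duplebalance.base is used below to print the class distributions in label order. Judging from its usage, it presumably behaves along the lines of this sketch (a hypothetical equivalent, not the package's actual implementation):

def sort_dict_by_key_sketch(d):
    # Order a {label: count} mapping by class label (hypothetical equivalent).
    return dict(sorted(d.items(), key=lambda kv: kv[0]))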

Make a 5-class imbalanced classification task

X, y = make_classification(
    n_classes=5, class_sep=1,                 # 5 classes
    weights=[0.05, 0.05, 0.15, 0.25, 0.5],    # imbalanced class proportions
    n_informative=3, n_redundant=1, flip_y=0,
    n_features=20, n_clusters_per_class=1,
    n_samples=2000, random_state=0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

origin_distr = sort_dict_by_key(Counter(y_train))
test_distr = sort_dict_by_key(Counter(y_test))
print('Original training dataset class distribution %s' % origin_distr)
print('Original test dataset class distribution %s' % test_distr)

Out:

Original training dataset class distribution {0: 52, 1: 48, 2: 145, 3: 268, 4: 487}
Original test dataset class distribution {0: 48, 1: 52, 2: 155, 3: 232, 4: 513}
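
Note the gap between the largest training class (487 samples) and the smallest (48). A minimal sanity-check sketch that computes this imbalance ratio from the distribution above (the imbalance_ratio helper is ours, not part of duplebalance):

def imbalance_ratio(distr):
    # Majority-to-minority class size ratio of a {label: count} distribution.
    return max(distr.values()) / min(distr.values())

print('Training imbalance ratio: %.2f' % imbalance_ratio(origin_distr))  # ~10.15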

Train DuBE Classifiers with different ensemble sizes

n_estimators_list = [1, 3, 5, 10, 20]

ensemble_init_kwargs = {
    'random_state': RANDOM_STATE,
}

eval_kwargs = {'average': 'macro', 'multi_class': 'ovo'}

BASECLF = DecisionTreeClassifier()

# Initialize results list
all_results = []

for n_estimators in n_estimators_list:
    # Train DuBE classifiers
    clf = DupleBalanceClassifier(
        base_estimator=BASECLF,
        n_estimators=n_estimators,
        **ensemble_init_kwargs
    ).fit(
        X_train, y_train,
        resampling_target='under',
        resampling_strategy='shem',
        perturb_alpha=0.5,
        sample_weight=None,
        eval_datasets={'test': (X_test, y_test)},
        train_verbose=False,
    )
    y_pred_proba = clf.predict_proba(X_test)
    score = roc_auc_score(y_test, y_pred_proba, **eval_kwargs)
    print ("DuBE {:<2d} | Balanced AUROC: {:.3f} | #Training Samples: {:d}".format(
        n_estimators, score, sum(clf.estimators_n_training_samples_)
        ))
    all_results.append(
        ['DuBE', score, sum(clf.estimators_n_training_samples_)]
    )

Out:

DuBE 1  | Balanced AUROC: 0.869 | #Training Samples: 240
DuBE 3  | Balanced AUROC: 0.945 | #Training Samples: 720
DuBE 5  | Balanced AUROC: 0.965 | #Training Samples: 1200
DuBE 10 | Balanced AUROC: 0.977 | #Training Samples: 2400
DuBE 20 | Balanced AUROC: 0.984 | #Training Samples: 4800
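
The #Training Samples column follows directly from resampling_target='under': assuming each of the five classes is down-sampled to the minority class size (48), every base estimator sees 5 × 48 = 240 samples, so the total grows linearly with the ensemble size. A quick sketch checking this arithmetic against the printed counts (our inference from the output, not a duplebalance API):

# Expected per-estimator subset size under class-balanced under-sampling.
samples_per_estimator = min(origin_distr.values()) * len(origin_distr)  # 240
for n_estimators in n_estimators_list:
    print(n_estimators, n_estimators * samples_per_estimator)  # 240, 720, ..., 4800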

Train Resampling-based IL Classifiers

from imblearn.under_sampling import (
    RandomUnderSampler, TomekLinks, NearMiss, CondensedNearestNeighbour)
from imblearn.over_sampling import (
    RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE)

resamp_init_kwargs = {'random_state': RANDOM_STATE}

samplers = {
    'No-resampling': None,
    'RUS': RandomUnderSampler(**resamp_init_kwargs),
    'TomekLinks': TomekLinks(),
    'NearMiss': NearMiss(),
    'Condense': CondensedNearestNeighbour(**resamp_init_kwargs),
    'ROS': RandomOverSampler(**resamp_init_kwargs),
    'SMOTE': SMOTE(**resamp_init_kwargs),
    'ADASYN': ADASYN(**resamp_init_kwargs),
    'BorderSMOTE': BorderlineSMOTE(**resamp_init_kwargs),
}

# Train all resampling-based imbalanced learning methods
for sampler_name, sampler in samplers.items():
    clf = ResampleClassifier(
        base_estimator=BASECLF,
        sampler=sampler
    ).fit(X_train, y_train)
    y_pred_proba = clf.predict_proba(X_test)
    score = roc_auc_score(y_test, y_pred_proba, **eval_kwargs)
    print(sampler_name, score, clf.n_training_samples_)

    all_results.append(
        [sampler_name, score, clf.n_training_samples_]
    )

Out:

No-resampling 0.8828851091981544 1000
RUS 0.8727953972366624 240
TomekLinks 0.8804842564696351 920
NearMiss 0.8366551576942509 240
Condense 0.8081403968288517 227
ROS 0.8561541415871521 2435
SMOTE 0.8617858533127144 2435
ADASYN 0.8698936401528098 2390
BorderSMOTE 0.8392594246977406 2435
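
Every score reported in this example is the macro-averaged one-vs-one (OVO) AUROC specified by eval_kwargs. For reference, here is a self-contained sketch of that metric as an average of symmetric pairwise binary AUROCs, in the spirit of Hand & Till (2001); this is a simplified re-derivation, not the scikit-learn source:

from itertools import combinations
import numpy as np

def ovo_macro_auroc(y_true, y_proba, classes):
    # Macro OVO AUROC: average the symmetric binary AUROCs over all class pairs.
    pair_scores = []
    for i, j in combinations(range(len(classes)), 2):
        mask = np.isin(y_true, [classes[i], classes[j]])
        auc_ij = roc_auc_score(y_true[mask] == classes[i], y_proba[mask, i])
        auc_ji = roc_auc_score(y_true[mask] == classes[j], y_proba[mask, j])
        pair_scores.append((auc_ij + auc_ji) / 2)
    return np.mean(pair_scores)

# Should closely match roc_auc_score(y_test, y_pred_proba, **eval_kwargs).
print(ovo_macro_auroc(y_test, y_pred_proba, classes=np.unique(y_test)))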

Results Visualization

import seaborn as sns
sns.set_context('talk')

def plot_results_comp(data_vis, x, y, title, figsize=(8, 6)):
    plt.figure(figsize=figsize)
    ax = sns.lineplot(
        data=data_vis, x=x, y=y, hue='Method', style='Method',
        markers=True, err_style='bars', linewidth=4, markersize=20, alpha=0.9
    )
    for spine in ax.spines.values():
        spine.set_color('black')
        spine.set_linewidth(2)
    ax.grid(color='black', linestyle='-.', alpha=0.3)
    ax.set_ylabel('AUROC (macro)')
    ax.set_title(title)
    ax.legend(
        title='',
        borderpad=0.25,
        columnspacing=0.05,
        borderaxespad=0.15,
        handletextpad=0.05,
        labelspacing=0.05,
        handlelength=1.2,
        )
    return ax

all_results_columns = ['Method', 'AUROC (macro)', '#Training Samples']
data_vis = pd.DataFrame(all_results, columns=all_results_columns)
plot_results_comp(data_vis, x='#Training Samples', y='AUROC (macro)',
                  title='DuBE versus Resampling Baselines')
[Figure: DuBE versus Resampling Baselines, plotting AUROC (macro) against #Training Samples for each method]

Out:

<AxesSubplot:title={'center':'DuBE versus Resampling Baselines'}, xlabel='#Training Samples', ylabel='AUROC (macro)'>

Total running time of the script: ( 0 minutes 44.053 seconds)

Estimated memory usage: 15 MB

Gallery generated by Sphinx-Gallery