Structured RerF Demo: Impulse Response

We form a two-class classification problem in which each data point is a time series of 100 timesteps. Class 0 is white noise alone, while class 1 is white noise plus an exponentially decaying unit impulse beginning at timestep 20.
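Concretely, with i.i.d. noise \(\epsilon_t \sim \mathcal{N}(0,1)\) and onset \(t_0 = 20\), each length-100 series is generated as

\[
x_t = \begin{cases} \epsilon_t, & \text{class } 0,\\ \epsilon_t + e^{-0.1\,(t - t_0)}\,\mathbb{1}[t \ge t_0], & \text{class } 1, \end{cases} \qquad t = 0, \dots, 99,
\]

where the decay rate of 0.1 matches the sampling code below.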

We test the performance of S-RerF against a set of other classification algorithms, training each on samples of size \(n \in \{50,100,200,400,1000,2000\}\), each containing an equal number of data points from each class. The average 0-1 loss of each algorithm at each training size is evaluated on a single test set of size \(m=10000\), again with an equal number of data points from each class.
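Writing \(\hat{y}_i\) for a classifier's prediction on test point \((x_i, y_i)\), the reported quantity is the misclassification rate

\[
\hat{L} = \frac{1}{m} \sum_{i=1}^{m} \mathbb{1}\left[\hat{y}_i \neq y_i\right].
\]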

[156]:
import numpy as np
import pandas as pd
from scipy import stats

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
%matplotlib inline
import seaborn as sns

import pickle
import time
from tqdm import tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

from rerf.rerfClassifier import rerfClassifier

import warnings
warnings.simplefilter('ignore')
[157]:
def samp(n, cls=0):
    ## Sample n observations from class 0 or class 1
    M0 = 20        # Onset of the impulse response
    M1 = 100 - M0
    # Exponentially decaying impulse response, zero before the onset
    func = lambda x: [np.exp(-0.1*(j-M0)) if j >= M0 else 0.0 for j in x]
    if cls == 0:
        X = np.full((n, M0+M1), 0.0)
    elif cls == 1:
        X = np.tile(func(np.arange(0, M0+M1)), reps=(n, 1))
    Y = np.full(n, cls)

    # Add white noise to every observation
    X += np.random.normal(0, 1, size=(n, M0+M1))

    return X, Y

def mixed_samp(n0, n1, shuffle=True):
    ## Generate a shuffled collection of observations from both classes
    X0, Y0 = samp(n0, cls=0)
    X1, Y1 = samp(n1, cls=1)
    X = np.vstack([X0, X1])
    Y = np.hstack([Y0, Y1])
    if shuffle:
        shuffled_idx = np.random.permutation(n0 + n1)
        X = X[shuffled_idx]
        Y = Y[shuffled_idx]
    return X, Y
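As a quick sanity check (an illustrative sketch, not part of the experiment), mixed_samp can be exercised on a small draw to confirm the shapes and class balance:

# Illustrative: 5 observations per class
Xc, Yc = mixed_samp(5, 5)
print(Xc.shape, Yc.shape)            # (10, 100) (10,)
print(np.bincount(Yc.astype(int)))   # [5 5] -- balanced classes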
[181]:
np.random.seed(1234)
ns = np.array([50,100,200,400,1000,2000])
TRAIN_FRAC = 0.5
n0 = [int(i*TRAIN_FRAC) for i in ns]
n1 = n0

# Build the training set incrementally so that, for every n in ns, the
# first n rows form a balanced, internally shuffled training set.
X_train, Y_train = np.empty(shape=(0,100)), np.empty(shape=(0))
for l0, l1 in zip(np.append(n0[0], np.diff(a=n0)),
                  np.append(n1[0], np.diff(a=n1))):
    x, y = mixed_samp(l0, l1)
    X_train = np.vstack((X_train, x))
    Y_train = np.hstack((Y_train, y))

TEST_SIZE = 10000
X_test, Y_test = mixed_samp(int(TEST_SIZE*0.5),int(TEST_SIZE*0.5))
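Because each chunk returned by mixed_samp is internally shuffled and balanced, every prefix X_train[:n] with n in ns is itself a balanced training set. An illustrative check (not part of the benchmark):

# Each nested training prefix should contain exactly 50% class-1 points
for n in ns:
    print(n, Y_train[:n].mean())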
[182]:
## Setup for run
names = {"Log. Reg": "blue", "Lin. SVM": "firebrick", "SVM": "purple",
         "kNN": "black", "RF": "#f86000", "MLP": "green", "S-RerF": "red"}

ncores=1
num_runs=1
n_est=100

classifiers = [
    LogisticRegression(random_state=0, n_jobs=ncores, solver='liblinear'),
    LinearSVC(),
    SVC(C=1.0, kernel='rbf', gamma='auto',random_state=0),
    KNeighborsClassifier(3, n_jobs=ncores),
    RandomForestClassifier(n_estimators=n_est, max_features='auto', n_jobs=ncores),
    MLPClassifier(hidden_layer_sizes=(100, ), random_state=0, max_iter=1000),
    rerfClassifier(projection_matrix="S-RerF",
                   max_features='auto',
                   n_jobs=ncores,
                   n_estimators=n_est,
                   oob_score=False,
                   random_state=0,
                   image_height=1,
                   image_width=100,
                   patch_height_max=1,
                   patch_height_min=1,
                   patch_width_max=20,
                   patch_width_min=2)
    ]
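With image_height=1 and image_width=100, S-RerF treats each series as a 1×100 image, so the sampled patches (widths between patch_width_min=2 and patch_width_max=20) are contiguous windows of timesteps. A minimal standalone check that the configured classifier runs (illustrative only; it reuses the last entry of classifiers):

# Illustrative: fit the configured S-RerF classifier on a small sample
Xs, Ys = mixed_samp(50, 50)
srerf = classifiers[-1]
srerf.fit(Xs, Ys)
print(np.mean(srerf.predict(Xs) == Ys))   # training accuracy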
[183]:
# Train each classifier on each dataset size, then test
## Prep output file:
f = open('s-rerf_impulse_experiment.csv', 'w+')
f.write("classifier,n,Lhat,trainTime,testTime,iterate\n")
f.flush()

runList = [(n, clf, run) for n in ns
                         for clf in zip(classifiers, names)
                         for run in range(num_runs)]

for n, (clf, name), iteration in tqdm(runList):
    X = X_train[:n]
    y = Y_train[:n]

    trainStartTime = time.time()
    clf.fit(X, y)
    trainEndTime = time.time()
    trainTime = trainEndTime - trainStartTime

    testStartTime = time.time()
    out = clf.predict(X_test)
    testEndTime = time.time()
    testTime = testEndTime - testStartTime

    # Average 0-1 loss on the test set
    lhat = np.mean(np.not_equal(out, Y_test).astype(int))

    # Columns: classifier,n,Lhat,trainTime,testTime,iterate
    f.write(f"{name},{n},{lhat:2.9f},{trainTime:2.9f},{testTime:2.9f},{iteration}\n")
    f.flush()

f.close()
100%|██████████| 42/42 [01:38<00:00,  8.28s/it]
[170]:
dat = pd.read_csv("s-rerf_impulse_experiment.csv")

d1 = pd.DataFrame(columns = ['classifier', 'n', 'Lhat', 'color'])

# Average Lhat over runs for each (classifier, n) pair
k = 0
for ni in np.unique(dat['n']):
    for cl in np.unique(dat['classifier']):
        tmp = dat[np.logical_and(dat['classifier'] == cl, dat['n'] == ni)][['n', 'Lhat']]
        d1.loc[k] = [cl] + list(tmp.mean()) + [names[cl]]
        k += 1


sns.set(style="darkgrid", rc={'figure.figsize':[12,8], 'figure.dpi': 300})
fig, ax = plt.subplots(figsize = (8,6))

for key in names.keys():
    grp = d1[d1['classifier'] == key]
    ax = grp.plot(ax=ax, kind='line', x='n', y='Lhat', label=key,
                  c=names[key], alpha=0.65)
    #ax.set_yscale('log')

plt.legend(loc='best',title='Algorithm')
plt.title('Algorithm Comparison')
plt.ylabel('Mean Test Error')
plt.xlabel('Number of Training Samples')
#plt.savefig('./s-rerf_impulse_experiment.pdf',dpi=300,format='pdf')
plt.show()
[Figure: mean test error versus number of training samples for each algorithm]