import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import tensorflow.keras as kr
import seaborn as sns

# Load the raw data. read_csv uses the first line of the file as the header row.
# NOTE(review): the header names are overwritten right below -- confirm that
# Cancer.csv really starts with a header line, otherwise the first sample is
# consumed as column names and lost.
dataset = pd.read_csv('Cancer.csv')

#Rename Feature Names for Readability
# Every column (the trailing label column included) gets a positional name of
# the form $x$$i$, where i is the zero-based column position.
col_names = [f'$x$${i}$' for i in range(len(dataset.columns))]
dataset.columns = col_names

#Show Dataset
# Bare expression: only produces output when it is the last line of a notebook cell.
dataset

# Seed handed to the random_state arguments further down.
seed = 34

# Print index range, column count, dtypes and memory usage.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Columns: 4001 entries, $x$$0$ to $x$$4000$
dtypes: float64(4000), object(1)
memory usage: 6.6+ MB

# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns.
dataset.describe()

# Too Large!

# import warnings
# warnings.filterwarnings('ignore')
# with warnings.catch_warnings():      #Catch warnings in code section
#     warnings.simplefilter("ignore")
#     plt.subplots(figsize=(15,60));
#     ax = plt.gca();
#     dataset.hist(bins=30, figsize=(1,1), grid=False, layout=(90,90), sharex=False, ax=ax, alpha=0.5);
#     plt.tight_layout();

# Count missing entries over the whole frame (sum per column, then sum of sums).
# pandas documents isnull() as an alias of isna(), so both counts are always equal.
null_count = dataset.isnull().sum().sum()
nan_count = dataset.isna().sum().sum()
print(f'There are {null_count} null values in the dataset.')
print(f'There are {nan_count} NaN values in the dataset.')

#No need to impute.

There are 0 null values in the dataset.
There are 0 NaN values in the dataset.

# Pearson correlation between every pair of numeric columns.
# The label column has dtype object, so the numeric columns are selected
# explicitly: older pandas dropped non-numeric columns silently, newer releases
# raise when asked to correlate them.
corr_mat = dataset.select_dtypes(include='number').corr(method='pearson')
#mask = np.triu(np.ones_like(corr_mat, dtype=bool));

# Create ONE figure that carries both the size and the dpi. The previous
# plt.figure(dpi=300) followed by plt.subplots(...) opened two figures and left
# the first one empty (the stray "Figure ... with 0 Axes" output).
plt.subplots(figsize=(20,20), dpi=300);
plt.title("Pearson's R Correlation Matrix");
sns.heatmap(corr_mat, annot=False, lw=0, linecolor='white', cmap='inferno');
print('Too many features to visualize at once!')

Too many features to visualize at once!

<Figure size 1800x1200 with 0 Axes>

# Features: every column except the last one.
# The previous slice `1:-1` also skipped column 0. dataset.info() reports 4000
# float64 columns plus one object column, and the displayed rows show column 0
# holding ordinary float values like its neighbours, so that slice silently
# discarded one feature.
x = dataset.iloc[:, :-1].values
# Target: the last column (the only object-dtype column).
y = dataset.iloc[:, -1].values
# pd.DataFrame(x)

# Encode the class labels as integers. `le` is kept at module level because the
# plotting code later calls le.inverse_transform to recover the original labels.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

from sklearn.decomposition import PCA

# n_components: a value strictly between 0 and 1 is read as the explained-variance
# ratio to reach; otherwise it is a component count between 0 and
# min(n_samples, n_features).
n_comp = 2
pca = PCA(random_state=seed, n_components=n_comp)
pca.fit(x);

# Share of the total variance captured by the kept components, in percent.
retained_pct = round(pca.explained_variance_ratio_.sum() * 100, 3)
print(f'Using {n_comp} Principal Components retains {retained_pct}% of the data.')

Using 2 Principal Components retains 85.775% of the data.

# Project the samples onto the principal components fitted above.
xPC = pca.transform(x)
# Bare expression: shows the projected data when run as a notebook cell.
pd.DataFrame(xPC)

# Recover the original class labels so the legend shows them instead of integer codes.
y_labeled = le.inverse_transform(y)
first_pc = xPC[:, 0]
second_pc = xPC[:, 1]
ax1 = sns.scatterplot(x=first_pc, y=second_pc, hue=y_labeled)
ax1.set_title('Principal Components of the Cancer Dataset');
ax1.set_ylabel('Principal Component 2');
ax1.set_xlabel('Principal Component 1');

from sklearn.decomposition import PCA

# n_components: a value strictly between 0 and 1 is read as the explained-variance
# ratio to reach; otherwise it is a component count between 0 and
# min(n_samples, n_features).
n_comp = 10
pca2 = PCA(svd_solver='full', random_state=seed, n_components=n_comp)
pca2.fit(x);

# Share of the total variance captured by the kept components, in percent.
retained_pct = round(pca2.explained_variance_ratio_.sum() * 100, 3)
print(f'Using {n_comp} Principal Components retains {retained_pct}% of the data.')

Using 10 Principal Components retains 97.216% of the data.

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Project all samples onto the components of pca2.
# NOTE(review): pca2 was fitted on the full data set before this split, so the
# held-out samples influenced the projection -- consider fitting the PCA on the
# training portion only.
xPC = pca2.transform(x)

# Hold out 20% of the samples for testing; the split is seeded.
x_train, x_test, y_train, y_test = train_test_split(
    xPC, y, test_size=0.2, random_state=seed
)

# Learn mean/variance from the training data ONLY, then apply the same
# transformation to both partitions.
sc = StandardScaler()
sc.fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

import warnings

# One title per principal component actually present in xPC. This used to be a
# hard-coded 1..10 list that was built but never used; it now labels the panels.
pc_title = [f'Principal Component {i}' for i in range(1, xPC.shape[1] + 1)]

# Silence warnings for this plotting section only. The former module-wide
# warnings.filterwarnings('ignore') made the context manager below pointless and
# kept hiding every warning raised by later code.
with warnings.catch_warnings():      #Catch warnings in code section
    warnings.simplefilter("ignore")
    plt.subplots(figsize=(10,10));
    ax = plt.gca();
    # Two histograms per row; the row count follows the number of components
    # instead of assuming exactly ten.
    n_rows = (len(pc_title) + 1) // 2
    pd.DataFrame(xPC, columns=pc_title).hist(bins=30, figsize=(1,1), grid=False, layout=(n_rows,2), sharex=False, ax=ax, alpha=0.5);
    plt.tight_layout();

# Pearson correlation between the principal components.
corr_mat = pd.DataFrame(xPC).corr(method='pearson')
# Hide the upper triangle (diagonal included): the matrix is symmetric, so the
# lower triangle already shows every pair once.
mask = np.triu(np.ones_like(corr_mat, dtype=bool))

# Create ONE figure that carries both the size and the dpi. The previous
# plt.figure(dpi=300) followed by plt.subplots(...) opened two figures and left
# the first one empty (the stray "Figure ... with 0 Axes" output).
plt.subplots(figsize=(7,5), dpi=300);
plt.title("Pearson's R Correlation Matrix");
sns.heatmap(corr_mat, annot=False, lw=0.5, linecolor='white', cmap='inferno', mask=mask);

<Figure size 1800x1200 with 0 Axes>

# Feed-forward binary classifier:
# Dense(10, relu) -> Dropout(0.2) -> Dense(5, relu) -> Dense(1, sigmoid).
ANNmodel = kr.models.Sequential([
    kr.layers.Dense(10, activation='relu'),
    kr.layers.Dropout(0.2),
    kr.layers.Dense(5, activation='relu'),
    kr.layers.Dense(1, activation='sigmoid')
    ])

#Compile
ANNmodel.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy']);

#Callbacks
# Import EarlyStopping from the same tensorflow.keras package the model is built
# with; mixing in the standalone `keras` package can pair incompatible versions.
from tensorflow.keras.callbacks import EarlyStopping
# Stops training once the TRAINING accuracy has not improved for 50 epochs
# (fit() below receives no validation data, so there is no val_* metric to watch).
es = EarlyStopping(monitor='accuracy', mode='max', verbose=1, patience=50)

#Fit
# `callbacks` is documented as a list of callback instances, so wrap the single callback.
ANNmodel.fit(x_train, y_train, batch_size=50, epochs=100, verbose=0, callbacks=[es]);

# Score the fitted network on the held-out data; evaluate() returns one value
# per entry of metrics_names, in the same order.
loacc = ANNmodel.evaluate(x=x_test, y=y_test, verbose=0)
lab = ANNmodel.metrics_names
for metric_name, metric_value in zip(lab, loacc):
    print(f'Model\'s {metric_name} is {round(metric_value, 4)}.')

Model's loss is 0.412.
Model's accuracy is 0.9302.

# Each import is needed only once (both lines used to be repeated verbatim).
from sklearn.model_selection import GridSearchCV
# NOTE(review): this wrapper module is deprecated in newer TensorFlow/Keras
# releases -- confirm the pinned version still ships it.
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

#Suppress warnings for non-convergent ANN models
# Raises TensorFlow's log threshold so that only messages of level ERROR are emitted.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Function to create model, required for KerasClassifier
def create_model(optimizer='adam', epochs=100, batch_size=50, neurons=10):
    """Build and compile the binary-classification network.

    Architecture: Dense(neurons, relu) -> Dropout(0.2) ->
    Dense(neurons // 2, relu) -> Dense(1, sigmoid), compiled with
    binary cross-entropy loss and the accuracy metric.

    Args:
        optimizer: Optimizer handed to ``model.compile``.
        epochs: Not used while building the model; kept so existing callers
            that pass it keep working.
        batch_size: Not used while building the model; kept for the same reason.
        neurons: Width of the first hidden layer. The second hidden layer gets
            half of it (integer division, never fewer than one unit).

    Returns:
        The compiled, not yet fitted, ``Sequential`` model.
    """
    # Dense takes an integer unit count. `neurons/2` produced a float
    # (12.5 for neurons=25), which is at best truncated and may be rejected by
    # Keras. Integer division yields the same width truncation would give.
    half_width = max(1, int(neurons) // 2)

    # create model
    model = kr.models.Sequential([
        kr.layers.Dense(neurons, activation='relu'),
        kr.layers.Dropout(0.2),
        kr.layers.Dense(half_width, activation='relu'),
        kr.layers.Dense(1, activation='sigmoid')
    ])

    # Compile model
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Wrap the builder so scikit-learn can treat the network as an estimator; the
# keyword values given here are the wrapper's defaults for the search.
model = KerasClassifier(build_fn=create_model, verbose=0, batch_size=50, epochs=50)

# Candidate values for every hyper-parameter; the grid search tries all combinations.
param_grid = dict(
    epochs=[100, 75, 50, 25],
    batch_size=[25, 50, 100],
    optimizer=['adam', 'sgd'],
    neurons=[10, 15, 20, 25],
)

# Exhaustive search over the grid, using all available cores (n_jobs=-1).
grid = GridSearchCV(n_jobs=-1, param_grid=param_grid, estimator=model)

# Run the search on the training data.
grid_result = grid.fit(x_train, y_train)

# Report the best mean cross-validated score (as a percentage) and its parameters.
best_pct = round(grid_result.best_score_ * 100, 3)
print("Best Accuracy: %f using %s" % (best_pct, grid_result.best_params_));

# (name, estimator) pairs collected for the model comparison below.
models = []

# Same builder as the one declared for the grid search; re-declared here.
def create_model(optimizer='adam', epochs=100, batch_size=50, neurons=10):
    """Build and compile the binary-classification network.

    Architecture: Dense(neurons, relu) -> Dropout(0.2) ->
    Dense(neurons // 2, relu) -> Dense(1, sigmoid), compiled with
    binary cross-entropy loss and the accuracy metric.

    Args:
        optimizer: Optimizer handed to ``model.compile``.
        epochs: Not used while building the model; kept so existing callers
            that pass it keep working.
        batch_size: Not used while building the model; kept for the same reason.
        neurons: Width of the first hidden layer. The second hidden layer gets
            half of it (integer division, never fewer than one unit).

    Returns:
        The compiled, not yet fitted, ``Sequential`` model.
    """
    # Dense takes an integer unit count. `neurons/2` produced a float
    # (12.5 for neurons=25), which is at best truncated and may be rejected by
    # Keras. Integer division yields the same width truncation would give.
    half_width = max(1, int(neurons) // 2)

    # create model
    model = kr.models.Sequential([
        kr.layers.Dense(neurons, activation='relu'),
        kr.layers.Dropout(0.2),
        kr.layers.Dense(half_width, activation='relu'),
        kr.layers.Dense(1, activation='sigmoid')
    ])

    # Compile model
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Build and train the network with the chosen hyper-parameters
# (adam, 25 neurons, batch size 25, up to 100 epochs).
ANNmodelOptimized = create_model(optimizer='adam', epochs=100, batch_size=25, neurons=25)
# `callbacks` is documented as a list of callback instances, so wrap the single callback.
ANNmodelOptimized.fit(x_train, y_train, epochs=100, batch_size=25, callbacks=[es], verbose=0);

#Wrap model for use with SKLearn tools
# The list entry is an UNFITTED wrapper with the same settings, suitable for
# scikit-learn utilities that clone and refit their estimator. The fitted network
# stays available as ANNmodelOptimized. (The fitted model used to be appended
# first and then immediately overwritten at index 0; appending the wrapper
# directly leaves `models` in the same state.)
models.append(('ANN', KerasClassifier(build_fn=create_model, epochs=100, batch_size=25, verbose=0, neurons=25, optimizer='adam')))

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# Each baseline is fitted on the scaled training data and registered in `models`
# under a short display name.

LRmodel = LogisticRegression(solver='newton-cg') #Use this solver to avoid convergence issues with lbfgs
LRmodel.fit(x_train, y_train);
models.append(('LR', LRmodel))

# Seeded like the PCA and the train/test split; without random_state the forest
# (and therefore its reported accuracy) changes from run to run.
RFCmodel = RandomForestClassifier(n_estimators=100, random_state=seed) #N_estimators and criterion can be optimized.
RFCmodel.fit(x_train, y_train);
models.append(('RF', RFCmodel))

gaussNBmodel = GaussianNB() #Epsilon can be optimized?
gaussNBmodel.fit(x_train, y_train);
models.append(('NB', gaussNBmodel))

KNNmodel = KNeighborsClassifier(n_neighbors=5, p=2) #K can be optimized
KNNmodel.fit(x_train, y_train);
models.append(('KNN', KNNmodel))

from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay, accuracy_score

# Per-model accuracy (same order as `models`) and the full result tuples
# (name, accuracy, confusion matrix, display object, predictions).
allac = []
results = []
for name, model in models:
    # The 'ANN' entry in `models` is an unfitted scikit-learn wrapper, so its
    # predictions come from the separately fitted ANNmodelOptimized instead.
    predictor = ANNmodelOptimized if name == 'ANN' else model
    # Threshold at 0.5 to obtain boolean class predictions.
    y_pred = predictor.predict(x_test) > 0.5
    cm = confusion_matrix(y_test > 0.5, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix(y_test, y_pred))
    ac = accuracy_score(y_test, y_pred)
    allac.append(ac)
    results.append((name, ac, cm, disp, y_pred))
    

# One confusion-matrix figure per model, titled with its accuracy in percent.
for name, ac, cm, disp, yp in results:
    disp.plot();
    accuracy_pct = round(ac*100,2)
    plt.title(f'Confusion Matrix for ${name}$, Accuracy={accuracy_pct}%');

# Display names in the same order as the accuracies collected in `allac`.
names = [entry[0] for entry in models]

# Bar chart comparing the single-run test accuracy of all models.
sns.barplot(y=allac, x=names, palette='magma');
plt.title('ML Model Accuracy Comparison');
plt.xlabel('ML Algorithm');
plt.ylabel('Accuracy');

# Number of splits to make.
N = 3

import warnings
from sklearn import model_selection
from sklearn.model_selection import StratifiedKFold

# One array of N fold scores per model, in the order of `models`.
CV_results = []
scoring = 'accuracy'

# A single seeded splitter shared by every model. Previously an unseeded,
# shuffling splitter was rebuilt inside the loop, so each model was scored on
# different random folds and the results changed from run to run.
kfold = StratifiedKFold(n_splits=N, shuffle=True, random_state=seed)
#kfold = model_selection.KFold(n_splits=N);

# Silence warnings for the cross-validation only. The former module-wide
# warnings.filterwarnings('ignore') made this context manager pointless and kept
# hiding warnings raised by later code.
with warnings.catch_warnings():      #Catch warnings in code section
    warnings.simplefilter("ignore")
    for tp in models:
        # NOTE(review): the scores are computed on the raw feature matrix `x`,
        # not on the scaled principal components the models above were trained
        # on -- confirm this is intended.
        CVinternal_results = model_selection.cross_val_score(tp[1], x, y, cv=kfold, scoring=scoring)
        CV_results.append(CVinternal_results)

# Display names in the same order as the cross-validation scores.
names = [entry[0] for entry in models]

# Reshape to one column per model and one row per fold.
CV_results = pd.DataFrame(CV_results).transpose()
CV_results.columns = names

# Box plot of the per-fold accuracies of every model.
ax2 = sns.boxplot(palette='Spectral', data=CV_results)
ax2.set_xlabel("ML Algorithm");
ax2.set_ylabel('Accuracy');
ax2.set_title(f"ML Algorithm Accuracy Comparison \nfor Cross-Validation with {N} Splits");
sns.despine(ax=ax2, trim=False, offset=5)
ax2.plot();

# Same box plot again, with the y-axis restricted to accuracies from 0.90 to 1.
ax3 = sns.boxplot(palette='Spectral', data=CV_results)
ax3.set_xlabel("ML Algorithm");
ax3.set_ylabel('Accuracy');
ax3.set_title(f"ML Algorithm Accuracy Comparison\nunder Cross-Validation with {N} Splits\n(Top Performers)");
sns.despine(ax=ax3, trim=False, offset=5)
plt.ylim(0.90, 1)
ax3.plot();

# Kernel-density estimate of the fold accuracies, one curve per model,
# with the x-axis restricted to the same 0.90 to 1 range.
g = sns.displot(kind='kde', data=CV_results)
g.despine(offset=5);
g.set_xlabels('Accuracy');
plt.title('K-Fold Cross Validation Accuracy Distribution\n for High-Accuracy Algorithms');
plt.xlim(0.90, 1);

	$x$$0$	$x$$1$	$x$$2$	$x$$3$	$x$$4$	$x$$5$	$x$$6$	$x$$7$	$x$$8$	$x$$9$	...	$x$$3991$	$x$$3992$	$x$$3993$	$x$$3994$	$x$$3995$	$x$$3996$	$x$$3997$	$x$$3998$	$x$$3999$	$x$$4000$
0	0.025409	0.051085	0.056305	0.021738	0.027410	0.014914	0.022455	0.023957	0.060527	0.047382	...	0.055033	0.080864	0.053423	0.051942	0.013187	0.028573	0.020427	0.023261	0.019975	C
1	0.025536	0.036123	0.054195	0.009735	0.027521	0.052255	0.042812	0.069087	0.069873	0.066629	...	0.033783	0.029022	0.046397	0.033288	0.041889	0.019256	-0.009447	0.021481	0.025569	C
2	0.012817	0.029652	0.079290	0.050677	0.039737	0.057713	0.044492	0.034581	0.042587	0.034147	...	0.036083	0.038598	0.048881	0.025569	0.026710	0.025122	0.047466	0.046706	0.043482	C
3	0.019846	-0.010577	-0.007504	0.019042	0.068786	0.061764	0.039036	0.020445	0.025988	0.066716	...	0.032044	0.026320	0.072016	0.070145	0.055744	0.051084	0.036683	0.043729	0.040289	C
4	0.039048	0.039355	0.001343	0.026221	0.044091	0.043953	0.039629	0.047926	0.046892	0.030589	...	0.065494	0.030681	0.039686	0.037256	0.022888	0.056221	0.055819	0.010087	0.006004	C
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
210	0.019997	0.002927	0.006809	-0.003585	0.026362	0.026540	0.026112	0.026230	0.021676	0.024205	...	0.018509	0.013225	0.015765	0.018762	0.012212	0.009340	0.020955	0.010685	0.013423	N
211	0.042346	0.031884	0.049617	0.031419	0.042043	0.033383	0.054695	0.079029	0.063147	0.040817	...	0.019066	0.038845	0.035201	0.013012	0.032180	0.026465	0.017850	0.036014	0.018276	N
212	0.023558	0.021331	0.016210	0.012324	0.022074	0.029829	0.032624	0.022100	0.028950	0.037769	...	0.031909	0.019024	0.024298	0.032061	0.009901	0.011709	0.008274	0.004742	0.024756	N
213	0.028351	0.023266	0.004556	0.024095	0.018943	0.025935	0.019066	0.037213	0.041892	0.031092	...	0.012613	0.031370	0.030285	0.034522	0.024089	0.006737	0.010033	0.017391	0.031537	N
214	0.027428	0.027021	0.015273	0.026199	0.018178	0.022988	0.021955	0.044174	0.034616	0.030664	...	0.040424	0.011187	0.007513	0.020666	0.040298	0.021256	0.026642	0.027718	0.040418	N

	$x$$0$	$x$$1$	$x$$2$	$x$$3$	$x$$4$	$x$$5$	$x$$6$	$x$$7$	$x$$8$	$x$$9$	...	$x$$3990$	$x$$3991$	$x$$3992$	$x$$3993$	$x$$3994$	$x$$3995$	$x$$3996$	$x$$3997$	$x$$3998$	$x$$3999$
count	215.000000	215.000000	215.000000	215.000000	215.000000	215.000000	215.000000	215.000000	215.000000	215.000000	...	215.000000	215.000000	215.000000	215.000000	215.000000	215.000000	215.000000	215.000000	215.000000	215.000000
mean	0.035385	0.034011	0.031153	0.029453	0.030437	0.031630	0.037983	0.051983	0.053108	0.043813	...	0.031454	0.030414	0.026837	0.028083	0.030159	0.029051	0.030198	0.029265	0.028464	0.028291
std	0.020965	0.023742	0.020610	0.018792	0.017193	0.019584	0.020329	0.025393	0.025899	0.022097	...	0.015441	0.016721	0.017868	0.017953	0.016342	0.014756	0.016783	0.017017	0.016603	0.016924
min	-0.004115	-0.022688	-0.016533	-0.006113	-0.010742	-0.009920	-0.010414	0.006379	0.009280	0.007561	...	0.005392	0.000822	-0.013180	-0.000679	0.002361	-0.009846	-0.001082	-0.009447	-0.000229	-0.000048
25%	0.022663	0.018208	0.017315	0.016656	0.020933	0.018883	0.022636	0.034256	0.036876	0.030309	...	0.020895	0.018226	0.015529	0.016985	0.018969	0.018634	0.018542	0.019116	0.017654	0.017240
50%	0.032690	0.029299	0.028017	0.027144	0.027521	0.028011	0.035989	0.047926	0.048935	0.038779	...	0.028658	0.026214	0.023723	0.026230	0.027577	0.027533	0.027207	0.026541	0.025697	0.025569
75%	0.044659	0.044359	0.042087	0.038161	0.039281	0.041763	0.049378	0.064891	0.066106	0.055570	...	0.039772	0.039735	0.034826	0.036069	0.038600	0.037986	0.039789	0.036286	0.036866	0.036661
max	0.176470	0.132940	0.114660	0.123630	0.099995	0.120050	0.113420	0.142060	0.212770	0.167180	...	0.080878	0.086974	0.113020	0.123710	0.101980	0.091268	0.093152	0.107750	0.113910	0.101880

	0	1
0	9.266718	0.301442
1	19.951636	-0.488083
2	12.567393	-0.776992
3	29.648526	9.235698
4	22.292447	9.280581
...	...	...
210	-9.338249	-4.617745
211	-2.877763	-1.046576
212	-0.558223	-5.645360
213	-3.872328	-6.608229
214	-0.113340	2.167075

Cancer Classifier

Description

General Setup

Importing Libraries

Data Import

Random Seed

Data Exploration

General Information

Descriptive Statistics for each Feature

Data Distribution

Checking for Invalid Inputs

General Correlations

Data Preprocessing

I/O Separation

Encoding Dependent Variable

[Part A] Principal Component Analysis (PCA)

Get Top 2 Principal Components

Transform Input Data

Plot of Principal Components (PC1-PC2)

[Part B] Principal Component Analysis (PCA)

Get Top 10 Principal Components

Evaluate Explained Variance (Data Retention)

Train/Test Split on PCs

Feature Scaling

Reattempting Data Exploration on Principal Components

Data Distributions

General Correlations

[Part C] Artificial Neural Network (ANN) Model

Build and Compile the Model

Fit Model with Early Stopping

Evaluate the Model

Model Optimization

Comparing to Other Models

Optimized ANN Model

Additional Models

Logistic Regression (LR)

Random Forest

Gaussian Naive Bayes

KNN

Model Performance Comparison

Via Confusion Matrices

Single Run Accuracy

Via K-Fold Cross Validation

Conclusion