Semiconductor Manufacturing¶


Project Description¶


A complex modern semiconductor manufacturing process is normally under constant surveillance via the monitoring of signals/variables collected from sensors and/or process measurement points. However, not all of these signals are equally valuable in a specific monitoring system. The measured signals contain a combination of useful information, irrelevant information, and noise. Engineers typically have a much larger number of signals than are actually required. If we consider each type of signal as a feature, then feature selection may be applied to identify the most relevant signals. The Process Engineers may then use these signals to determine key factors contributing to yield excursions downstream in the process. This enables increased process throughput, decreased time to learning, and reduced per-unit production costs. These signals can be used as features to predict the yield type, and by analyzing different combinations of features, the essential signals that impact the yield type can be identified.

Context¶

Manufacturing process feature selection and categorization

Content¶

Abstract: Data from a semi-conductor manufacturing process

  • Data Set Characteristics: Multivariate
  • Number of Instances: 1567
  • Area: Computer
  • Attribute Characteristics: Real
  • Number of Attributes: 591
  • Date Donated: 2008-11-19
  • Associated Tasks: Classification, Causal-Discovery
  • Missing Values? Yes

A complex modern semi-conductor manufacturing process is normally under consistent surveillance via the monitoring of signals/variables collected from sensors and/or process measurement points. However, not all of these signals are equally valuable in a specific monitoring system. The measured signals contain a combination of useful information, irrelevant information, and noise. It is often the case that useful information is buried in the latter two. Engineers typically have a much larger number of signals than are actually required. If we consider each type of signal as a feature, then feature selection may be applied to identify the most relevant signals. The Process Engineers may then use these signals to determine key factors contributing to yield excursions downstream in the process. This will enable an increase in process throughput, decreased time to learning, and reduced per-unit production costs.

To enhance current business improvement techniques, the application of feature selection as an intelligent systems technique is being investigated.

The dataset presented in this case represents a selection of such features, where each example represents a single production entity with its associated measured features, and the labels represent a simple pass/fail yield for in-house line testing together with an associated date-time stamp. In the original UCI labels, -1 corresponds to a pass and 1 corresponds to a fail, and the date-time stamp is for that specific test point.
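For reference, a minimal illustrative sketch (the sample values below are made up) of how that -1/1 convention maps onto the Pass/Fail strings used in the CSV loaded later in this notebook:

import pandas as pd

# Hypothetical raw labels as they appear in the original UCI distribution:
# -1 marks a passing lot, 1 marks a failing lot.
raw_labels = pd.Series([-1, -1, 1, -1])

# Map the numeric convention onto the Pass/Fail strings used in this notebook's CSV.
pass_fail = raw_labels.map({-1: 'Pass', 1: 'Fail'})
print(pass_fail.tolist())   # ['Pass', 'Pass', 'Fail', 'Pass']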

Source: UCI SECOM Dataset via Kaggle

Acknowledgements¶

Dataset Authors: Michael McCann, Adrian Johnston


General Setup¶

Importing the Libraries¶

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import seaborn as sns

Importing the Dataset¶

In [2]:
dataset = pd.read_csv('SemiconductorManufacturingProcessDataset.csv')

Showing the Dataset in a Table¶

In [3]:
pd.DataFrame(dataset)
#dataset
Out[3]:
Time Sensor 1 Sensor 2 Sensor 3 Sensor 4 Sensor 5 Sensor 6 Sensor 7 Sensor 8 Sensor 9 ... Sensor 429 Sensor 430 Sensor 431 Sensor 432 Sensor 433 Sensor 434 Sensor 435 Sensor 436 Sensor 437 Pass/Fail
0 7/19/2008 11:55 3030.93 2564.00 2187.7333 1411.1265 1.3602 97.6133 0.1242 1.5005 0.0162 ... 14.9509 0.5005 0.0118 0.0035 2.3630 NaN NaN NaN NaN Pass
1 7/19/2008 12:32 3095.78 2465.14 2230.4222 1463.6606 0.8294 102.3433 0.1247 1.4966 -0.0005 ... 10.9003 0.5019 0.0223 0.0055 4.4447 0.0096 0.0201 0.0060 208.2045 Pass
2 7/19/2008 13:17 2932.61 2559.94 2186.4111 1698.0172 1.5102 95.4878 0.1241 1.4436 0.0041 ... 9.2721 0.4958 0.0157 0.0039 3.1745 0.0584 0.0484 0.0148 82.8602 Fail
3 7/19/2008 14:43 2988.72 2479.90 2199.0333 909.7926 1.3204 104.2367 0.1217 1.4882 -0.0124 ... 8.5831 0.4990 0.0103 0.0025 2.0544 0.0202 0.0149 0.0044 73.8432 Pass
4 7/19/2008 15:22 3032.24 2502.87 2233.3667 1326.5200 1.5334 100.3967 0.1235 1.5031 -0.0031 ... 10.9698 0.4800 0.4766 0.1045 99.3032 0.0202 0.0149 0.0044 73.8432 Pass
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1562 10/16/2008 15:13 2899.41 2464.36 2179.7333 3085.3781 1.4843 82.2467 0.1248 1.3424 -0.0045 ... 11.7256 0.4988 0.0143 0.0039 2.8669 0.0068 0.0138 0.0047 203.1720 Pass
1563 10/16/2008 20:49 3052.31 2522.55 2198.5667 1124.6595 0.8763 98.4689 0.1205 1.4333 -0.0061 ... 17.8379 0.4975 0.0131 0.0036 2.6238 0.0068 0.0138 0.0047 203.1720 Pass
1564 10/17/2008 5:26 2978.81 2379.78 2206.3000 1110.4967 0.8236 99.4122 0.1208 NaN NaN ... 17.7267 0.4987 0.0153 0.0041 3.0590 0.0197 0.0086 0.0025 43.5231 Pass
1565 10/17/2008 6:01 2894.92 2532.01 2177.0333 1183.7287 1.5726 98.7978 0.1213 1.4622 -0.0072 ... 19.2104 0.5004 0.0178 0.0038 3.5662 0.0262 0.0245 0.0075 93.4941 Pass
1566 10/17/2008 6:07 2944.92 2450.76 2195.4444 2914.1792 1.5978 85.1011 0.1235 NaN NaN ... 22.9183 0.4987 0.0181 0.0040 3.6275 0.0117 0.0162 0.0045 137.7844 Pass

1567 rows × 439 columns


Data Exploration¶

A Quick Review of the Data¶

In [4]:
dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1567 entries, 0 to 1566
Columns: 439 entries, Time to Pass/Fail
dtypes: float64(437), object(2)
memory usage: 5.2+ MB
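Since the dataset is documented as containing missing values, a quick sketch (using only the dataframe loaded above) can gauge how widespread they are before deciding on an imputation strategy:

# Count missing entries per column and list the worst offenders.
missing_per_column = dataset.isna().sum().sort_values(ascending=False)
print(missing_per_column.head(10))

# Overall fraction of missing cells across the whole table.
print(f"Total missing cells: {dataset.isna().sum().sum()} "
      f"({dataset.isna().mean().mean():.2%} of all values)")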

Feature Set: Center, Spread, and Range¶

In [5]:
dataset.describe()
Out[5]:
Sensor 1 Sensor 2 Sensor 3 Sensor 4 Sensor 5 Sensor 6 Sensor 7 Sensor 8 Sensor 9 Sensor 10 ... Sensor 428 Sensor 429 Sensor 430 Sensor 431 Sensor 432 Sensor 433 Sensor 434 Sensor 435 Sensor 436 Sensor 437
count 1561.000000 1560.000000 1553.000000 1553.000000 1553.000000 1553.000000 1558.000000 1565.000000 1565.000000 1565.000000 ... 1567.000000 1567.000000 1566.000000 1566.000000 1566.000000 1566.000000 1566.000000 1566.000000 1566.000000 1566.000000
mean 3014.452896 2495.850231 2200.547318 1396.376627 4.197013 101.112908 0.121822 1.462862 -0.000841 0.000146 ... 5.563747 16.642363 0.500096 0.015318 0.003847 3.067826 0.021458 0.016475 0.005283 99.670066
std 73.621787 80.407705 29.513152 441.691640 56.355540 6.237214 0.008961 0.073897 0.015116 0.009302 ... 16.921369 12.485267 0.003404 0.017180 0.003720 3.578033 0.012358 0.008808 0.002867 93.891919
min 2743.240000 2158.750000 2060.660000 0.000000 0.681500 82.131100 0.000000 1.191000 -0.053400 -0.034900 ... 0.663600 4.582000 0.477800 0.006000 0.001700 1.197500 -0.016900 0.003200 0.001000 0.000000
25% 2966.260000 2452.247500 2181.044400 1081.875800 1.017700 97.920000 0.121100 1.411200 -0.010800 -0.005600 ... 1.408450 11.501550 0.497900 0.011600 0.003100 2.306500 0.013425 0.010600 0.003300 44.368600
50% 3011.490000 2499.405000 2201.066700 1285.214400 1.316800 101.512200 0.122400 1.461600 -0.001300 0.000400 ... 1.624500 13.817900 0.500200 0.013800 0.003600 2.757650 0.020500 0.014800 0.004600 71.900500
75% 3056.650000 2538.822500 2218.055500 1591.223500 1.525700 104.586700 0.123800 1.516900 0.008400 0.005900 ... 1.902000 17.080900 0.502375 0.016500 0.004100 3.295175 0.027600 0.020300 0.006400 114.749700
max 3356.350000 2846.440000 2315.266700 3715.041700 1114.536600 129.252200 0.128600 1.656400 0.074900 0.053000 ... 90.423500 96.960100 0.509800 0.476600 0.104500 99.303200 0.102800 0.079900 0.028600 737.304800

8 rows × 437 columns
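The summary statistics suggest that some sensors barely vary. A small, hedged sketch for flagging (near-)constant columns, which are natural candidates to drop during feature selection; the variance threshold is an arbitrary assumption:

# Standard deviation per numeric column (NaNs are ignored by default).
stds = dataset.std(numeric_only=True)

# Columns with essentially zero spread carry no information for classification.
near_constant = stds[stds < 1e-6].index.tolist()
print(f"{len(near_constant)} near-constant sensors, e.g. {near_constant[:5]}")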

Plotting the Raw Data¶

In [6]:
import warnings
warnings.filterwarnings('ignore')   #Silence warnings for the remainder of the notebook

fig, ax = plt.subplots(figsize=(15,60))
dataset.hist(bins=30, grid=False, layout=(146,3), sharex=False, ax=ax, alpha=0.5);
plt.tight_layout();
[Figure: per-sensor histograms of the raw data]

General Correlations¶

In [7]:
corr_mat = dataset.select_dtypes(include=np.number).corr(method='pearson');
mask = np.triu(np.ones_like(corr_mat, dtype=bool));   #Hide the redundant upper triangle
plt.subplots(figsize=(20,15));
plt.title("Pearson's R Correlation Matrix");
sns.heatmap(corr_mat, mask=mask, annot=False, lw=0, linecolor='white', cmap='YlGnBu');
[Figure: Pearson's R correlation heatmap across all sensor features]
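As a follow-up to the heatmap, a short sketch that lists the most strongly correlated sensor pairs from the corr_mat computed above; the 0.95 cutoff is an arbitrary assumption:

# Keep only the strictly upper triangle so each sensor pair is counted once.
upper = corr_mat.where(np.triu(np.ones(corr_mat.shape, dtype=bool), k=1))
strong_pairs = upper.stack()                       # (sensor, sensor) -> r
strong_pairs = strong_pairs[strong_pairs.abs() > 0.95]
print(strong_pairs.sort_values(ascending=False).head(10))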

Data Preprocessing¶

Separate the Input and Output¶

Here, we put the independent variables in X and the dependent variable in y.

In [8]:
X = dataset.iloc[:, 1:438].values   #Sensor columns (column 0 is the timestamp)
y = dataset.iloc[:, -1].values      #Pass/Fail label

Showing the Input Data in a Table format¶

In [9]:
pd.DataFrame(X)
Out[9]:
0 1 2 3 4 5 6 7 8 9 ... 427 428 429 430 431 432 433 434 435 436
0 3030.93 2564.00 2187.7333 1411.1265 1.3602 97.6133 0.1242 1.5005 0.0162 -0.0034 ... 1.6765 14.9509 0.5005 0.0118 0.0035 2.3630 NaN NaN NaN NaN
1 3095.78 2465.14 2230.4222 1463.6606 0.8294 102.3433 0.1247 1.4966 -0.0005 -0.0148 ... 1.1065 10.9003 0.5019 0.0223 0.0055 4.4447 0.0096 0.0201 0.0060 208.2045
2 2932.61 2559.94 2186.4111 1698.0172 1.5102 95.4878 0.1241 1.4436 0.0041 0.0013 ... 2.0952 9.2721 0.4958 0.0157 0.0039 3.1745 0.0584 0.0484 0.0148 82.8602
3 2988.72 2479.90 2199.0333 909.7926 1.3204 104.2367 0.1217 1.4882 -0.0124 -0.0033 ... 1.7585 8.5831 0.4990 0.0103 0.0025 2.0544 0.0202 0.0149 0.0044 73.8432
4 3032.24 2502.87 2233.3667 1326.5200 1.5334 100.3967 0.1235 1.5031 -0.0031 -0.0072 ... 1.6597 10.9698 0.4800 0.4766 0.1045 99.3032 0.0202 0.0149 0.0044 73.8432
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1562 2899.41 2464.36 2179.7333 3085.3781 1.4843 82.2467 0.1248 1.3424 -0.0045 -0.0057 ... 1.4879 11.7256 0.4988 0.0143 0.0039 2.8669 0.0068 0.0138 0.0047 203.1720
1563 3052.31 2522.55 2198.5667 1124.6595 0.8763 98.4689 0.1205 1.4333 -0.0061 -0.0093 ... 1.0187 17.8379 0.4975 0.0131 0.0036 2.6238 0.0068 0.0138 0.0047 203.1720
1564 2978.81 2379.78 2206.3000 1110.4967 0.8236 99.4122 0.1208 NaN NaN NaN ... 1.2237 17.7267 0.4987 0.0153 0.0041 3.0590 0.0197 0.0086 0.0025 43.5231
1565 2894.92 2532.01 2177.0333 1183.7287 1.5726 98.7978 0.1213 1.4622 -0.0072 0.0032 ... 1.7085 19.2104 0.5004 0.0178 0.0038 3.5662 0.0262 0.0245 0.0075 93.4941
1566 2944.92 2450.76 2195.4444 2914.1792 1.5978 85.1011 0.1235 NaN NaN NaN ... 1.2878 22.9183 0.4987 0.0181 0.0040 3.6275 0.0117 0.0162 0.0045 137.7844

1567 rows × 437 columns

A Quick Check of the Output Data¶

In [10]:
pd.DataFrame(y).T
Out[10]:
0 1 2 3 4 5 6 7 8 9 ... 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566
0 Pass Pass Fail Pass Pass Pass Pass Pass Pass Pass ... Pass Pass Pass Pass Pass Pass Pass Pass Pass Pass

1 rows × 1567 columns

Taking care of missing data¶

In [11]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X)
X = imputer.transform(X)
In [12]:
# A quick check
print(X)
[[3.03093000e+03 2.56400000e+03 2.18773330e+03 ... 1.64749042e-02
  5.28333333e-03 9.96700663e+01]
 [3.09578000e+03 2.46514000e+03 2.23042220e+03 ... 2.01000000e-02
  6.00000000e-03 2.08204500e+02]
 [2.93261000e+03 2.55994000e+03 2.18641110e+03 ... 4.84000000e-02
  1.48000000e-02 8.28602000e+01]
 ...
 [2.97881000e+03 2.37978000e+03 2.20630000e+03 ... 8.60000000e-03
  2.50000000e-03 4.35231000e+01]
 [2.89492000e+03 2.53201000e+03 2.17703330e+03 ... 2.45000000e-02
  7.50000000e-03 9.34941000e+01]
 [2.94492000e+03 2.45076000e+03 2.19544440e+03 ... 1.62000000e-02
  4.50000000e-03 1.37784400e+02]]
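Mean imputation is only one option. As a hedged alternative (not used in the rest of this notebook), sensors that are mostly missing could be dropped before imputing the remainder with the median; the 40% threshold is an assumption:

# Work from the raw sensor columns again, before any imputation.
X_df = dataset.iloc[:, 1:438]
missing_frac = X_df.isna().mean()

# Drop sensors that are mostly missing, then impute what is left with the median.
kept = X_df.loc[:, missing_frac < 0.40]
X_alt = SimpleImputer(missing_values=np.nan, strategy='median').fit_transform(kept)
print(X_alt.shape)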

Encoding the Dependent Variable¶

In [13]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
In [14]:
# a quick check
print(y)
[1 1 0 ... 1 1 1]
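Since several of the metrics used later are accuracy-based, it is worth checking how balanced the two classes actually are; a quick sketch using the encoder and labels defined above:

# Count how many examples fall into each encoded class.
# LabelEncoder orders labels alphabetically, so 0 = Fail and 1 = Pass.
classes, counts = np.unique(y, return_counts=True)
for cls, cnt in zip(le.inverse_transform(classes), counts):
    print(f"{cls}: {cnt} ({cnt / len(y):.1%})")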

Splitting the Dataset into the Training set and Test set¶

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 34)
In [16]:
#print(X_train)
#print(X_test)
#print(y_train)
#print(y_test)
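Because passes heavily outnumber fails, a hedged variant of the same split with stratify=y would keep the class ratio identical in both subsets (shown for comparison only; the rest of the notebook keeps the split above):

# Stratified split: Pass/Fail proportions are preserved in the train and test sets.
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.2, random_state=34, stratify=y)
print(np.bincount(y_train_s), np.bincount(y_test_s))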

Feature Scaling¶

In [17]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train) #Fit the scaler ONLY to training data
X_test = sc.transform(X_test) #Transform (w/o fitting) the testing data

Model Training¶

Required Models¶

Logistic Regression (LR)¶

In [18]:
#Model Database (Array)
models = [];
In [19]:
from sklearn.linear_model import LogisticRegression
LRmodel = LogisticRegression(solver='newton-cg'); #Use this solver to avoid convergence issues with lbfgs
LRmodel.fit(X_train, y_train);
models.append(('LR',LRmodel));

Artificial Neural Network (ANN)¶

In [20]:
n = X.shape[1]; #Number of input features (sensor columns)

#Model
ANNmodel = tf.keras.models.Sequential();

#Layers
ANNmodel.add(tf.keras.layers.Dense(units=n, activation='relu'));
ANNmodel.add(tf.keras.layers.Dense(units=n//2, activation='relu')); #Integer division keeps the unit count whole
ANNmodel.add(tf.keras.layers.Dense(units=1, activation='sigmoid'));

#Compile
ANNmodel.compile(optimizer = 'sgd', loss = 'binary_crossentropy', metrics = ['accuracy']);

#Fit
ANNmodel.fit(X_train, y_train, batch_size = 100, epochs = 50, verbose=0);

models.append(('ANN', ANNmodel));
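As a hedged variation (not part of the model list used below), the same network could be fit with Keras' class_weight argument so that rare failures contribute more to the loss; the balanced-style weights here are illustrative, not tuned:

# Weight each class inversely to its frequency (0 = Fail is the rare class).
n_fail = int(np.sum(y_train == 0));
n_pass = int(np.sum(y_train == 1));
class_weight = {0: len(y_train) / (2 * n_fail), 1: len(y_train) / (2 * n_pass)};

ANNmodel_weighted = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=n, activation='relu'),
    tf.keras.layers.Dense(units=n//2, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
]);
ANNmodel_weighted.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy']);
ANNmodel_weighted.fit(X_train, y_train, batch_size=100, epochs=50, verbose=0,
                      class_weight=class_weight);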

Additional Models¶

Random Forest¶

In [21]:
from sklearn.ensemble import RandomForestClassifier
RFCmodel = RandomForestClassifier(n_estimators=100); #N_estimators and criterion can be optimized.
RFCmodel.fit(X_train, y_train);
models.append(('RF', RFCmodel));

Gaussian Naive Bayes¶

In [22]:
from sklearn.naive_bayes import GaussianNB
gaussNBmodel = GaussianNB(); #var_smoothing could be tuned
gaussNBmodel.fit(X_train, y_train);
models.append(('NB', gaussNBmodel));

KNN¶

In [23]:
from sklearn.neighbors import KNeighborsClassifier
KNNmodel = KNeighborsClassifier(n_neighbors=5, p=2); #K can be optimized
KNNmodel.fit(X_train, y_train);
models.append(('KNN', KNNmodel));

Model Performance¶

Confusion Matrices¶

In [24]:
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay, accuracy_score
  
allac = [];
results = [];
for (name, model) in models:
    y_pred = (model.predict(X_test) > 0.5);   #Thresholding works for sklearn labels and ANN probabilities alike
    cm = confusion_matrix(y_test, y_pred);
    disp = ConfusionMatrixDisplay(cm);        #Reuse the matrix instead of recomputing the predictions
    ac = accuracy_score(y_test, y_pred);
    results.append((name, ac, cm, disp, y_pred));
    allac.append(ac);


for (name, ac, cm, disp, yp) in results:
    disp.plot();
    plt.title(f'Confusion Matrix for Model: {name}')
[Figures: confusion matrices for the LR, ANN, RF, NB, and KNN models]
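Because passing lots dominate the test set, overall accuracy can mask poor recall on failures. A hedged supplement that prints per-class precision, recall, and F1 for each model, reusing the predictions stored above:

from sklearn.metrics import classification_report

# Per-class precision/recall/F1 for every model, from the stored predictions.
for (name, ac, cm, disp, yp) in results:
    print(f'--- {name} ---');
    print(classification_report(y_test, np.ravel(yp).astype(int),
                                target_names=le.classes_, zero_division=0));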

Accuracy¶

In [25]:
names = [];
for tp in models:
    names.append(tp[0]);
    
sns.barplot(x=names,y=allac, palette='magma');
plt.title('Model Accuracy Comparison');
plt.ylabel('Accuracy');
plt.xlabel('ML Algorithm');
[Figure: bar chart comparing model accuracies]

RMSE¶

In [26]:
from sklearn.metrics import mean_squared_error
RMSE_results = [];
for tp in results:
    RMSE_results.append(np.sqrt(mean_squared_error(y_test, tp[-1])));   #Square root so this is RMSE, not MSE
sns.barplot(x=names, y=RMSE_results, palette='magma');
plt.title('RMSE Across all Models');
plt.ylabel('RMSE (Lower is Better)');
plt.xlabel('ML Algorithm');
[Figure: bar chart of RMSE for each model]
In [27]:
sns.barplot(x=names, y=np.max(RMSE_results)/np.array(RMSE_results), palette='magma');   #Inverse of the max-normalized RMSE
plt.title('Proportionally Scaled RMSE Across all Models');
plt.ylabel('Scaled RMSE (Higher is Better)');
plt.xlabel('ML Algorithm');
[Figure: bar chart of proportionally scaled (inverse) RMSE for each model]

Cross Validation¶

This section borrows the methodology from Jason Brownlee at machinelearningmastery.com

In [28]:
#Wrap the ANN for use with scikit-learn's cross-validation utilities.
#Note: this legacy wrapper has been removed from recent TensorFlow releases;
#the scikeras package provides an equivalent KerasClassifier if needed.
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier


# Function to create the model, required by KerasClassifier
def create_model():
    # create model
    model = tf.keras.models.Sequential();
    model.add(tf.keras.layers.Dense(n, activation='relu'));
    model.add(tf.keras.layers.Dense(n//2, activation='relu'));
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'));
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model;

#Replace the already-fitted ANN in the model list with a freshly wrapped, unfitted one.
models[1] = ('ANN', KerasClassifier(build_fn=create_model, epochs=100, batch_size=150, verbose=0));



# Number of splits to make.
N = 3;

from sklearn import model_selection
from sklearn.model_selection import StratifiedKFold

CV_results = [];
scoring = 'accuracy';

for tp in models:
    kfold = StratifiedKFold(n_splits=N, shuffle=True)
    #kfold = model_selection.KFold(n_splits=N);
    CVinternal_results = model_selection.cross_val_score(tp[1], X, y, cv=kfold, scoring=scoring);
    CV_results.append((CVinternal_results));
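One caveat with the loop above is that X was imputed on the full dataset and is not re-scaled within each fold. A hedged sketch of how the scikit-learn models could instead be cross-validated through a Pipeline, so imputation and scaling are re-fit on each training fold (shown for RF only):

from sklearn.pipeline import Pipeline

# Re-fit the imputer and scaler inside every fold so no statistics leak
# from the validation fold into preprocessing.
pipe = Pipeline([
    ('impute', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scale', StandardScaler()),
    ('clf', RandomForestClassifier(n_estimators=100)),
]);
X_raw = dataset.iloc[:, 1:438].values;   # un-imputed, un-scaled sensor readings
pipe_scores = model_selection.cross_val_score(
    pipe, X_raw, y, cv=StratifiedKFold(n_splits=N, shuffle=True), scoring=scoring);
print(pipe_scores.mean(), pipe_scores.std());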
In [29]:
CV_results = pd.DataFrame(CV_results).T;
CV_results.columns = names;
In [38]:
ax2 = sns.boxplot(data=CV_results, palette='Spectral')
ax2.set(xlabel = "ML Algorithm",
       ylabel = 'Accuracy',
       title = f"ML Algorithm Accuracy Comparison over {N}-fold Cross Validation");
sns.despine(ax=ax2,offset=5, trim=True)
ax2.plot();
[Figure: box plots of cross-validation accuracy per algorithm]
In [39]:
ax2 = sns.boxplot(data=CV_results, palette='Spectral')
ax2.set(xlabel = "ML Algorithm",
       ylabel = 'Accuracy',
       title = f"ML Algorithm Accuracy Comparison over {N}-fold Cross Validation \n(zoomed-in)");
ax2.set(ylim=(0.85, 0.95))
sns.despine(ax=ax2,offset=5, trim=False)
ax2.plot();
[Figure: zoomed-in box plots of cross-validation accuracy (0.85 to 0.95)]

Performance Summary¶

In [31]:
g=sns.displot(data=CV_results, kind='kde');
g.despine(offset=5);
g.set_xlabels('Accuracy');
plt.title('K-Fold Cross Validation Accuracy Distribution for all Algorithms');
plt.xlim(0.0,1);
[Figure: accuracy distributions (KDE) across folds for all algorithms]
In [32]:
g=sns.displot(data=CV_results, kind='kde');
g.despine(offset=5);
g.set_xlabels('Accuracy');
plt.title('K-Fold Cross Validation Accuracy Distribution for High-Accuracy Algorithms');
plt.xlim(0.85,1);
[Figure: accuracy distributions (KDE) across folds, zoomed to the high-accuracy algorithms]

Conclusion¶

Several Machine Learning (ML) algorithms can be trained on the Semiconductor Manufacturing Dataset. Many of them produce good results in fitting and predicting the data. Of particular importance are the K-Nearest Neighbors (KNN), Random Forest (RF), and Artificial Neural Network (ANN) algorithms. These three ML algorithms score very similarly when tested. Even though the run above seems to score RF above ANN and KNN, all three scores are within run-to-run variance. Thus, we conclude that any of these ML algorithms would be a good performer if deployed for this particular application.

With regard to the K-Fold Cross Validation procedure conducted above specifically, the following observations can be made:

  • Both RF and KNN seem to have a bimodal accuracy distribution behaviour, with the main peak being around 93% accuracy.
  • The peak for ANN is roughly in the middle of the peaks for RF and KNN.
  • KNN has a much higher density at 93% accuracy than RF, making it the preferred choice, since statistically KNN will perform at 93% accuracy more often than RF.

Given that the computational expense of ANN is much greater than that of RF, and its accuracy is on average lower, RF is recommended for deployment.