Skip to content

Commit d24cab7

Browse files
Creating file
1 parent 96d5cd7 commit d24cab7

File tree

1 file changed

+150
-0
lines changed

1 file changed

+150
-0
lines changed

ensemble-classification-tutorial

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
import pandas as pd
2+
import numpy as np
3+
import warnings
4+
5+
from sklearn.preprocessing import LabelEncoder, StandardScaler
6+
from sklearn.metrics import accuracy_score, f1_score, log_loss
7+
from sklearn.model_selection import train_test_split, KFold, cross_val_score
8+
9+
from sklearn.svm import SVC
10+
from sklearn.linear_model import LogisticRegression
11+
from sklearn.tree import DecisionTreeClassifier
12+
from sklearn.ensemble import VotingClassifier
13+
from sklearn.ensemble import BaggingClassifier
14+
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier
15+
warnings.filterwarnings('ignore')
16+
17+
# Load the Titanic train/test splits from the current working directory.
training_data = pd.read_csv("train.csv")
testing_data = pd.read_csv("test.csv")

# Inspect the column dtypes so we know which features are numerical
# and which will need encoding.
train_datatypes = training_data.dtypes
print(train_datatypes)
23+
24+
def get_nulls(training, testing):
    """Print the per-column null counts for the train and test frames."""
    for label, frame in (("Training Data:", training), ("Testing Data:", testing)):
        print(label)
        print(pd.isnull(frame).sum())
29+
30+
# Report missing values before cleaning.
get_nulls(training_data, testing_data)

# Drop 'Cabin' (too many missing values) and 'Ticket' (too many
# distinct categories to encode usefully).
training_data.drop(labels=['Cabin', 'Ticket'], axis=1, inplace=True)
testing_data.drop(labels=['Cabin', 'Ticket'], axis=1, inplace=True)

# Age is slightly right-skewed (younger ages are more prominent), so
# the median is a more robust imputation value than the mean.
# Assign the filled column back rather than calling
# fillna(..., inplace=True) on a column selection: the inplace form
# operates on a possible intermediate copy (chained assignment) and is
# deprecated in pandas 2.x.
training_data["Age"] = training_data["Age"].fillna(training_data["Age"].median())
testing_data["Age"] = testing_data["Age"].fillna(testing_data["Age"].median())
# "S" (Southampton) is the most common embarkation port in the data.
training_data["Embarked"] = training_data["Embarked"].fillna("S")
testing_data["Fare"] = testing_data["Fare"].fillna(testing_data["Fare"].median())

# Confirm no missing values remain in the columns we kept.
get_nulls(training_data, testing_data)

print(training_data.head(10))
print(testing_data.head(10))
52+
53+
54+
# Encode the categorical columns as integers.  Each encoder is fit on
# the training column only and then applied to both splits, so the
# train/test label mappings agree.
for column in ("Sex", "Embarked"):
    encoder = LabelEncoder()
    encoder.fit(training_data[column])
    training_data[column] = encoder.transform(training_data[column])
    testing_data[column] = encoder.transform(testing_data[column])

# Passenger names are (near-)unique identifiers with no predictive
# signal in this setup, so drop them.
training_data.drop("Name", axis=1, inplace=True)
testing_data.drop("Name", axis=1, inplace=True)
75+
76+
# StandardScaler expects 2-D input, so reshape each column into an
# (n_samples, 1) array first.
ages_train = np.array(training_data["Age"]).reshape(-1, 1)
fares_train = np.array(training_data["Fare"]).reshape(-1, 1)
ages_test = np.array(testing_data["Age"]).reshape(-1, 1)
fares_test = np.array(testing_data["Fare"]).reshape(-1, 1)

# Fit the scalers on the TRAINING data only and reuse the fitted
# mean/std on the test split.  The original code called fit_transform
# on the test columns as well, which re-estimates the statistics from
# the test set — that leaks test-set information and scales the two
# splits inconsistently.  One scaler per column so each keeps its own
# fitted parameters.
age_scaler = StandardScaler()
fare_scaler = StandardScaler()

training_data["Age"] = age_scaler.fit_transform(ages_train)
training_data["Fare"] = fare_scaler.fit_transform(fares_train)
testing_data["Age"] = age_scaler.transform(ages_test)
testing_data["Fare"] = fare_scaler.transform(fares_test)
89+
90+
# Build the model matrix: every cleaned column except the target and
# the PassengerId bookkeeping column.
y_labels = training_data['Survived']
X_features = training_data.drop(labels=['PassengerId', 'Survived'], axis=1)

print(X_features.head(5))

# Hold out 10% of the labelled rows as a validation split.
X_train, X_val, y_train, y_val = train_test_split(
    X_features, y_labels, test_size=0.1, random_state=27
)
99+
100+
# Three heterogeneous base classifiers for the hand-rolled ensemble.
LogReg_clf = LogisticRegression()
DTree_clf = DecisionTreeClassifier()
SVC_clf = SVC()

LogReg_clf.fit(X_train, y_train)
DTree_clf.fit(X_train, y_train)
SVC_clf.fit(X_train, y_train)

LogReg_pred = LogReg_clf.predict(X_val)
DTree_pred = DTree_clf.predict(X_val)
SVC_pred = SVC_clf.predict(X_val)

# Majority vote over the three 0/1 predictions.  The original
# expression, (sum // 3), floors the vote total, so it predicted 1
# only when ALL THREE models agreed — a unanimous vote, not the
# intended rounded average.  At least 2 of 3 votes is the correct
# majority / rounded-mean rule for binary labels.
vote_totals = LogReg_pred + DTree_pred + SVC_pred
averaged_preds = (vote_totals >= 2).astype(int)
acc = accuracy_score(y_val, averaged_preds)
print(acc)
115+
116+
# Hard-voting ensemble over the same three (already constructed)
# classifiers; VotingClassifier clones and refits them internally.
estimators = [('SVC', SVC_clf), ('DTree', DTree_clf), ('LogReg', LogReg_clf)]
voting_clf = VotingClassifier(estimators=estimators, voting='hard')
voting_clf.fit(X_train, y_train)

preds = voting_clf.predict(X_val)
acc = accuracy_score(y_val, preds)
# NOTE(review): log_loss is computed on hard 0/1 predictions rather
# than probabilities (hard voting has no predict_proba), which yields
# extreme values — kept as-is to preserve the tutorial's output.
l_loss = log_loss(y_val, preds)
f1 = f1_score(y_val, preds)

print("Accuracy is: " + str(acc))
print("Log Loss is: " + str(l_loss))
print("F1 Score is: " + str(f1))
126+
127+
# Bagging ensembles wrapping the two earlier base learners, plus two
# tree ensembles with built-in bootstrap/feature randomization.
# Shared random_state keeps the comparison repeatable.
# NOTE(review): `base_estimator` was renamed to `estimator` in
# scikit-learn 1.2 and removed in 1.4 — rename if running on a modern
# sklearn.
logreg_bagging_model = BaggingClassifier(base_estimator=LogReg_clf, n_estimators=50, random_state=12)
dtree_bagging_model = BaggingClassifier(base_estimator=DTree_clf, n_estimators=50, random_state=12)
random_forest = RandomForestClassifier(n_estimators=100, random_state=12)
extra_trees = ExtraTreesClassifier(n_estimators=100, random_state=12)
131+
132+
def bagging_ensemble(model):
    """Print the mean 20-fold cross-validation accuracy of *model*.

    Evaluates against the module-level X_train / y_train split.
    """
    # shuffle=True is required for random_state to have any effect;
    # passing random_state to an unshuffled KFold raises a ValueError
    # on scikit-learn >= 0.24.
    k_folds = KFold(n_splits=20, shuffle=True, random_state=12)
    results = cross_val_score(model, X_train, y_train, cv=k_folds)
    print(results.mean())
136+
137+
# Cross-validate each of the four ensembles in turn.
for ensemble_model in (logreg_bagging_model, dtree_bagging_model,
                       random_forest, extra_trees):
    bagging_ensemble(ensemble_model)
141+
142+
# shuffle=True is required for random_state to have any effect;
# passing random_state to an unshuffled KFold raises a ValueError on
# scikit-learn >= 0.24.
k_folds = KFold(n_splits=20, shuffle=True, random_state=12)

num_estimators = [20, 40, 60, 80, 100]

# Sweep the AdaBoost ensemble size and report mean CV accuracy for
# each setting.
for i in num_estimators:
    ada_boost = AdaBoostClassifier(n_estimators=i, random_state=12)
    results = cross_val_score(ada_boost, X_train, y_train, cv=k_folds)
    print("Results for {} estimators:".format(i))
    print(results.mean())

0 commit comments

Comments
 (0)