import pandas as pd
import numpy as np
import warnings

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score, log_loss
from sklearn.model_selection import train_test_split, KFold, cross_val_score

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (VotingClassifier, BaggingClassifier,
                              AdaBoostClassifier, RandomForestClassifier,
                              ExtraTreesClassifier)

warnings.filterwarnings('ignore')

training_data = pd.read_csv("train.csv")
testing_data = pd.read_csv("test.csv")

# find out what the numerical features are
train_datatypes = training_data.dtypes
print(train_datatypes)

def get_nulls(training, testing):
    print("Training Data:")
    print(pd.isnull(training).sum())
    print("Testing Data:")
    print(pd.isnull(testing).sum())

get_nulls(training_data, testing_data)

# drop the Cabin column, as there are too many missing values,
# and the Ticket column, as there are too many categories
training_data.drop(labels=['Cabin', 'Ticket'], axis=1, inplace=True)
testing_data.drop(labels=['Cabin', 'Ticket'], axis=1, inplace=True)

# the Age data is slightly right-skewed (younger ages are more prominent),
# and the mean would be pulled upward by that skew,
# so we should use the median value to impute missing values
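
# optional sanity check (an addition, not in the original script):
# confirm the right skew by comparing skewness, mean, and median of Age
print("Age skew:", training_data["Age"].skew())
print("Age mean:", training_data["Age"].mean())
print("Age median:", training_data["Age"].median())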

training_data["Age"] = training_data["Age"].fillna(training_data["Age"].median())
testing_data["Age"] = testing_data["Age"].fillna(testing_data["Age"].median())
# "S" is by far the most common port of embarkation, so use it as the fill value
training_data["Embarked"] = training_data["Embarked"].fillna("S")
testing_data["Fare"] = testing_data["Fare"].fillna(testing_data["Fare"].median())

get_nulls(training_data, testing_data)

# no more missing values
print(training_data.head(10))
print(testing_data.head(10))

encoder_1 = LabelEncoder()
# fit the encoder on the training data
encoder_1.fit(training_data["Sex"])

# transform and replace the training and testing data
training_sex_encoded = encoder_1.transform(training_data["Sex"])
training_data["Sex"] = training_sex_encoded
test_sex_encoded = encoder_1.transform(testing_data["Sex"])
testing_data["Sex"] = test_sex_encoded

encoder_2 = LabelEncoder()
encoder_2.fit(training_data["Embarked"])

training_embarked_encoded = encoder_2.transform(training_data["Embarked"])
training_data["Embarked"] = training_embarked_encoded
testing_embarked_encoded = encoder_2.transform(testing_data["Embarked"])
testing_data["Embarked"] = testing_embarked_encoded
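
# optional (an added check): inspect the mapping each encoder learned;
# classes_ holds the original categories in encoded order
print(dict(zip(encoder_1.classes_, encoder_1.transform(encoder_1.classes_))))
print(dict(zip(encoder_2.classes_, encoder_2.transform(encoder_2.classes_))))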

# drop names
training_data.drop("Name", axis=1, inplace=True)
testing_data.drop("Name", axis=1, inplace=True)

# any value we want to reshape needs to be turned into an array first
ages_train = np.array(training_data["Age"]).reshape(-1, 1)
fares_train = np.array(training_data["Fare"]).reshape(-1, 1)
ages_test = np.array(testing_data["Age"]).reshape(-1, 1)
fares_test = np.array(testing_data["Fare"]).reshape(-1, 1)

# the scalers take arrays; fit them on the training data only, then
# reuse the training statistics to transform the test data, so no
# information leaks from the test set into the scaling
age_scaler = StandardScaler()
fare_scaler = StandardScaler()

training_data["Age"] = age_scaler.fit_transform(ages_train)
training_data["Fare"] = fare_scaler.fit_transform(fares_train)
testing_data["Age"] = age_scaler.transform(ages_test)
testing_data["Fare"] = fare_scaler.transform(fares_test)
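
# quick check (an addition to the original): the scaled training columns
# should now have mean close to 0 and standard deviation close to 1
print(training_data[["Age", "Fare"]].describe().loc[["mean", "std"]])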

# now to select our features and labels, dropping the ID column
X_features = training_data.drop(labels=['PassengerId', 'Survived'], axis=1)
y_labels = training_data['Survived']

print(X_features.head(5))

# carve a validation set out of the training data
X_train, X_val, y_train, y_val = train_test_split(X_features, y_labels, test_size=0.1, random_state=27)

LogReg_clf = LogisticRegression()
DTree_clf = DecisionTreeClassifier()
SVC_clf = SVC()

LogReg_clf.fit(X_train, y_train)
DTree_clf.fit(X_train, y_train)
SVC_clf.fit(X_train, y_train)

LogReg_pred = LogReg_clf.predict(X_val)
DTree_pred = DTree_clf.predict(X_val)
SVC_pred = SVC_clf.predict(X_val)

# simple majority vote: predict survival when at least two of the three
# models agree (integer-dividing the summed predictions by 3, as the
# original did, only predicts 1 when all three models agree)
majority_preds = ((LogReg_pred + DTree_pred + SVC_pred) >= 2).astype(int)
acc = accuracy_score(y_val, majority_preds)
print(acc)
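
# for comparison (an added check): each base model's accuracy on its own,
# so the effect of combining them is visible
print("LogReg:", accuracy_score(y_val, LogReg_pred))
print("DTree:", accuracy_score(y_val, DTree_pred))
print("SVC:", accuracy_score(y_val, SVC_pred))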

voting_clf = VotingClassifier(
    estimators=[('SVC', SVC_clf), ('DTree', DTree_clf), ('LogReg', LogReg_clf)],
    voting='hard')
voting_clf.fit(X_train, y_train)
preds = voting_clf.predict(X_val)
acc = accuracy_score(y_val, preds)
# note: hard voting yields 0/1 labels, so this log loss is computed on
# hard predictions rather than the probabilities the metric expects
l_loss = log_loss(y_val, preds)
f1 = f1_score(y_val, preds)

print("Accuracy is: " + str(acc))
print("Log Loss is: " + str(l_loss))
print("F1 Score is: " + str(f1))
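
# a possible alternative (a sketch, not part of the original script):
# soft voting averages predicted class probabilities instead of counting
# hard votes; SVC needs probability=True to expose predict_proba, and
# log loss can then be computed on probabilities as the metric expects
soft_voting_clf = VotingClassifier(
    estimators=[('SVC', SVC(probability=True)),
                ('DTree', DecisionTreeClassifier()),
                ('LogReg', LogisticRegression())],
    voting='soft')
soft_voting_clf.fit(X_train, y_train)
soft_probs = soft_voting_clf.predict_proba(X_val)
print("Soft voting accuracy: " + str(accuracy_score(y_val, soft_voting_clf.predict(X_val))))
print("Soft voting log loss: " + str(log_loss(y_val, soft_probs)))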

# sklearn 1.2+ takes the base model via the `estimator` keyword
logreg_bagging_model = BaggingClassifier(estimator=LogReg_clf, n_estimators=50, random_state=12)
dtree_bagging_model = BaggingClassifier(estimator=DTree_clf, n_estimators=50, random_state=12)
random_forest = RandomForestClassifier(n_estimators=100, random_state=12)
extra_trees = ExtraTreesClassifier(n_estimators=100, random_state=12)

def bagging_ensemble(model):
    # KFold requires shuffle=True when a random_state is given
    k_folds = KFold(n_splits=20, shuffle=True, random_state=12)
    results = cross_val_score(model, X_train, y_train, cv=k_folds)
    print(results.mean())

bagging_ensemble(logreg_bagging_model)
bagging_ensemble(dtree_bagging_model)
bagging_ensemble(random_forest)
bagging_ensemble(extra_trees)

k_folds = KFold(n_splits=20, shuffle=True, random_state=12)

num_estimators = [20, 40, 60, 80, 100]

for i in num_estimators:
    ada_boost = AdaBoostClassifier(n_estimators=i, random_state=12)
    results = cross_val_score(ada_boost, X_train, y_train, cv=k_folds)
    print("Results for {} estimators:".format(i))
    print(results.mean())
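
# one possible final step (an assumption; the original diff stops here):
# refit a model on the full training set and write the usual Kaggle-style
# submission file with PassengerId and Survived columns
final_model = RandomForestClassifier(n_estimators=100, random_state=12)
final_model.fit(X_features, y_labels)
X_test_features = testing_data.drop(labels=['PassengerId'], axis=1)
submission = pd.DataFrame({
    "PassengerId": testing_data["PassengerId"],
    "Survived": final_model.predict(X_test_features)
})
submission.to_csv("submission.csv", index=False)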