import math

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

%matplotlib inline

# Load the Wine dataset (column 0: class label, columns 1-13: features)
df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/rasbt/pattern_classification/master/data/wine_data.csv',
    header=None,
    sep=',',
)
df.tail()

X = df.values[:, 1:]  # feature vectors
y = df.values[:, 0]   # class labels

# Plot one histogram per feature, with the three classes overlaid
fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(12, 10))

for ax, cnt in zip(axes.ravel(), range(13)):

    # set bin sizes
    min_b = math.floor(np.min(X[:, cnt]))
    max_b = math.ceil(np.max(X[:, cnt]))
    bins = np.linspace(min_b, max_b, 25)

    # plotting the histograms
    for lab, col in zip(range(1, 4), ('blue', 'red', 'green')):
        ax.hist(X[y == lab, cnt],
                color=col,
                label='class %s' % lab,
                bins=bins,
                alpha=0.5)
    ylims = ax.get_ylim()

    # plot annotation
    ax.set_ylim([0, max(ylims) + 2])
    ax.set_xlabel('feature column %s' % cnt)
    ax.set_title('Wine histogram #%s' % str(cnt + 1))

    # hide axis ticks
    ax.tick_params(axis='both', which='both',
                   bottom=False, top=False, left=False, right=False,
                   labelbottom=True, labelleft=True)

    # remove axis spines
    for spine in ('top', 'right', 'bottom', 'left'):
        ax.spines[spine].set_visible(False)

# set y-axis labels on the first column of subplots
for row in axes:
    row[0].set_ylabel('count')

# add a legend to the last used subplot and hide the unused ones
for i, ax in enumerate(axes.ravel()):
    if i == 12:
        leg = ax.legend(loc='upper right', fancybox=True, fontsize=8)
        leg.get_frame().set_alpha(0.5)
    if i > 12:
        ax.axis('off')

fig.tight_layout()
plt.show()

# Split the data into 70% training and 30% test samples
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.values[:, 1:], df.values[:, 0],
                                                    test_size=0.30, random_state=123)

# Sanity checks: slicing with a range keeps a single column 2-dimensional,
# and such columns can be stitched back together with np.concatenate
X_train[:, 0:1].shape
np.concatenate((X_train[:, 0].reshape(X_train.shape[0], 1),
                X_train[:, 0].reshape(X_train.shape[0], 1)), axis=1).shape

from sklearn.base import BaseEstimator, TransformerMixin


class ColumnExtractor(BaseEstimator, TransformerMixin):
    """Selects the given feature columns; deriving from BaseEstimator and
    TransformerMixin makes it a well-behaved Pipeline step."""

    def __init__(self, cols):
        self.cols = cols

    def transform(self, X):
        col_list = []
        for c in self.cols:
            col_list.append(X[:, c:c + 1])
        return np.concatenate(col_list, axis=1)

    def fit(self, X, y=None):
        return self

# ColumnExtractor(cols=(1, 6)).transform(X_train)

from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA

# Three pipelines that differ only in their dimensionality-reduction step
clf_all = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('reduce_dim', ColumnExtractor(cols=(0, 5))),
    ('classification', GaussianNB())
])

clf_pca = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('reduce_dim', PCA(n_components=2)),
    ('classification', GaussianNB())
])

clf_lda = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('reduce_dim', LDA(n_components=2)),
    ('classification', GaussianNB())
])

# Constructing the k-fold cross-validation iterator (k=10)
cv = KFold(n_splits=10,  # number of folds the dataset is divided into
           shuffle=True,
           random_state=123)

scores = [
    cross_val_score(clf, X_train, y_train, cv=cv, scoring='accuracy')
    for clf in [clf_all, clf_pca, clf_lda]
]

print('Scores (feature cols. 0 & 5):', scores[0])
print('Accuracy: {:.2%} (+/- {:.2%})'.format(scores[0].mean(), scores[0].std()))

for score, label in zip(scores, ['feature cols. 0 & 5',
                                 'PCA dim. red. (n=2)',
                                 'LDA dim. red. (n=2)']):
    print('Accuracy: {:.2%} (+/- {:.2%}), {:}'.format(score.mean(), score.std(), label))
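# Optional sketch (not part of the original analysis): visualize the spread of
# the 10-fold CV accuracies of the three pipelines as a boxplot. It uses only
# `scores` and `plt` from above; the tick labels are my own shorthand for the
# three pipelines.
plt.figure(figsize=(6, 4))
plt.boxplot(scores)
plt.xticks([1, 2, 3], ['cols. 0 & 5', 'PCA (n=2)', 'LDA (n=2)'])
plt.ylabel('cross-validation accuracy')
plt.title('10-fold CV accuracy per pipeline')
plt.show()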
# Fit the final model on the training data: standardize, project onto the
# first two linear discriminants, then train a Gaussian naive Bayes classifier
std_scale = StandardScaler().fit(X_train)
X_train = std_scale.transform(X_train)
X_test = std_scale.transform(X_test)

sklearn_lda = LDA(n_components=2).fit(X_train, y_train)
X_train = sklearn_lda.transform(X_train)
X_test = sklearn_lda.transform(X_test)

gnb_clf = GaussianNB()
gnb_clf.fit(X_train, y_train)

# Evaluate the classifier on the held-out test set
from sklearn import metrics

pred_test = gnb_clf.predict(X_test)

print('Prediction accuracy for the test dataset')
print('{:.2%}'.format(metrics.accuracy_score(y_test, pred_test)))

print('Confusion Matrix of the GNB-classifier')
print(metrics.confusion_matrix(y_test, pred_test))
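# Optional sketch (an assumption, not from the original): scatter-plot of the
# training data after the LDA projection. At this point X_train holds the
# two-column output of sklearn_lda.transform() from the code above, so the
# class separation that the classifier exploits can be inspected directly.
for lab, col in zip(range(1, 4), ('blue', 'red', 'green')):
    plt.scatter(X_train[y_train == lab, 0],
                X_train[y_train == lab, 1],
                color=col, alpha=0.5, label='class %s' % lab)
plt.xlabel('LD1')
plt.ylabel('LD2')
plt.legend(loc='best')
plt.title('Wine training data projected onto the first two linear discriminants')
plt.show()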