import math

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

%matplotlib inline

# Load the Wine dataset (column 0: class label, columns 1-13: features)
df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/rasbt/pattern_classification/master/data/wine_data.csv',
    header=None,
    sep=',',
)
df.tail()

X = df.values[:, 1:]  # feature vectors
y = df.values[:, 0]   # class labels

# Plot one histogram per feature, with the three classes overlaid
fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(12, 10))

for ax, cnt in zip(axes.ravel(), range(13)):

    # set bin sizes
    min_b = math.floor(np.min(X[:, cnt]))
    max_b = math.ceil(np.max(X[:, cnt]))
    bins = np.linspace(min_b, max_b, 25)

    # plotting the histograms
    for lab, col in zip(range(1, 4), ('blue', 'red', 'green')):
        ax.hist(X[y == lab, cnt],
                color=col,
                label='class %s' % lab,
                bins=bins,
                alpha=0.5)
    ylims = ax.get_ylim()

    # plot annotation
    ax.set_ylim([0, max(ylims) + 2])
    ax.set_xlabel('feature column %s' % cnt)
    ax.set_title('Wine histogram #%s' % str(cnt + 1))

    # hide axis ticks
    ax.tick_params(axis='both', which='both',
                   bottom=False, top=False, left=False, right=False,
                   labelbottom=True, labelleft=True)

    # remove axis spines
    for spine in ('top', 'right', 'bottom', 'left'):
        ax.spines[spine].set_visible(False)

# set y-axis labels on the first column of subplots
for row in axes:
    row[0].set_ylabel('count')

# add a legend to the last used subplot and hide the unused ones
for i, ax in enumerate(axes.ravel()):
    if i == 12:
        leg = ax.legend(loc='upper right', fancybox=True, fontsize=8)
        leg.get_frame().set_alpha(0.5)
    if i > 12:
        ax.axis('off')

fig.tight_layout()
plt.show()

# Split the data into 70% training and 30% test samples
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.values[:, 1:], df.values[:, 0],
                                                    test_size=0.30, random_state=123)

# Sanity checks: slicing with a range keeps a single column 2-dimensional,
# and such columns can be stitched back together with np.concatenate
X_train[:, 0:1].shape
np.concatenate((X_train[:, 0].reshape(X_train.shape[0], 1),
                X_train[:, 0].reshape(X_train.shape[0], 1)), axis=1).shape

from sklearn.base import BaseEstimator, TransformerMixin


class ColumnExtractor(BaseEstimator, TransformerMixin):
    """Selects the given feature columns; deriving from BaseEstimator and
    TransformerMixin makes it a well-behaved Pipeline step."""

    def __init__(self, cols):
        self.cols = cols

    def transform(self, X):
        col_list = []
        for c in self.cols:
            col_list.append(X[:, c:c + 1])
        return np.concatenate(col_list, axis=1)

    def fit(self, X, y=None):
        return self

# ColumnExtractor(cols=(1, 6)).transform(X_train)

from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA

# Three pipelines that differ only in their dimensionality-reduction step
clf_all = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('reduce_dim', ColumnExtractor(cols=(0, 5))),
    ('classification', GaussianNB())
])

clf_pca = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('reduce_dim', PCA(n_components=2)),
    ('classification', GaussianNB())
])

clf_lda = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('reduce_dim', LDA(n_components=2)),
    ('classification', GaussianNB())
])

# Constructing the k-fold cross-validation iterator (k=10)
cv = KFold(n_splits=10,  # number of folds the dataset is divided into
           shuffle=True,
           random_state=123)

scores = [
    cross_val_score(clf, X_train, y_train, cv=cv, scoring='accuracy')
    for clf in [clf_all, clf_pca, clf_lda]
]

print('Scores (feature cols. 0 & 5):', scores[0])
print('Accuracy: {:.2%} (+/- {:.2%})'.format(scores[0].mean(), scores[0].std()))

for score, label in zip(scores, ['feature cols. 0 & 5',
                                 'PCA dim. red. (n=2)',
                                 'LDA dim. red. (n=2)']):
    print('Accuracy: {:.2%} (+/- {:.2%}), {:}'.format(score.mean(), score.std(), label))
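# Optional sketch (not part of the original analysis): visualize the spread of
# the 10-fold CV accuracies of the three pipelines as a boxplot. It uses only
# `scores` and `plt` from above; the tick labels are my own shorthand for the
# three pipelines.
plt.figure(figsize=(6, 4))
plt.boxplot(scores)
plt.xticks([1, 2, 3], ['cols. 0 & 5', 'PCA (n=2)', 'LDA (n=2)'])
plt.ylabel('cross-validation accuracy')
plt.title('10-fold CV accuracy per pipeline')
plt.show()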
# Fit the final model on the training data: standardize, project onto the
# first two linear discriminants, then train a Gaussian naive Bayes classifier
std_scale = StandardScaler().fit(X_train)
X_train = std_scale.transform(X_train)
X_test = std_scale.transform(X_test)

sklearn_lda = LDA(n_components=2).fit(X_train, y_train)
X_train = sklearn_lda.transform(X_train)
X_test = sklearn_lda.transform(X_test)

gnb_clf = GaussianNB()
gnb_clf.fit(X_train, y_train)

# Evaluate the classifier on the held-out test set
from sklearn import metrics

pred_test = gnb_clf.predict(X_test)

print('Prediction accuracy for the test dataset')
print('{:.2%}'.format(metrics.accuracy_score(y_test, pred_test)))

print('Confusion Matrix of the GNB-classifier')
print(metrics.confusion_matrix(y_test, pred_test))
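# Optional sketch (an assumption, not from the original): scatter-plot of the
# training data after the LDA projection. At this point X_train holds the
# two-column output of sklearn_lda.transform() from the code above, so the
# class separation that the classifier exploits can be inspected directly.
for lab, col in zip(range(1, 4), ('blue', 'red', 'green')):
    plt.scatter(X_train[y_train == lab, 0],
                X_train[y_train == lab, 1],
                color=col, alpha=0.5, label='class %s' % lab)
plt.xlabel('LD1')
plt.ylabel('LD2')
plt.legend(loc='best')
plt.title('Wine training data projected onto the first two linear discriminants')
plt.show()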