#!/usr/bin/env python
# coding: utf-8

# ## Let's play with the Reuters collection in NLTK

# In[2]:

from nltk.corpus import reuters

# List of document ids
documents = reuters.fileids()
print("Documents: {}".format(len(documents)))

# Train documents
train_docs_id = list(filter(lambda doc: doc.startswith("train"), documents))
print("Total train documents: {}".format(len(train_docs_id)))

# Test documents
test_docs_id = list(filter(lambda doc: doc.startswith("test"), documents))
print("Total test documents: {}".format(len(test_docs_id)))

# In[6]:

# Let's get a document with multiple labels
doc = 'training/9865'
print(reuters.raw(doc))

# In[7]:

print(reuters.categories(doc))

# In[10]:

from operator import itemgetter
from pprint import pprint

# List categories
categories = reuters.categories()
print("Number of categories: {}".format(len(categories)))

# In[15]:

# Documents per category, sorted from most to least frequent
category_dist = [(category, len(reuters.fileids(category))) for category in categories]
category_dist = sorted(category_dist, key=itemgetter(1), reverse=True)
print("Most common categories: ")
pprint(category_dist[:5])

# In[17]:

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

stop_words = stopwords.words("english")

train_docs_id = list(filter(lambda doc: doc.startswith("train"), documents))
test_docs_id = list(filter(lambda doc: doc.startswith("test"), documents))

train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]

# Tokenize and weight terms with TF-IDF, dropping English stop words
vectorizer = TfidfVectorizer(stop_words=stop_words)

# Learn the vocabulary on the train documents, then reuse it for the test documents
vectorized_train_docs = vectorizer.fit_transform(train_docs)
vectorized_test_docs = vectorizer.transform(test_docs)

# Turn the per-document category lists into binary indicator matrices
multilabelbin = MultiLabelBinarizer()
train_labels = multilabelbin.fit_transform([reuters.categories(doc_id) for doc_id in train_docs_id])
test_labels = multilabelbin.transform([reuters.categories(doc_id) for doc_id in test_docs_id])

# Classification: one binary LinearSVC per category.
# random_state just seeds the solver so runs are reproducible;
# 52 is an arbitrary choice, any fixed value would do.
classifier = OneVsRestClassifier(LinearSVC(random_state=52))
classifier.fit(vectorized_train_docs, train_labels)

# Predict
predictions = classifier.predict(vectorized_test_docs)

# Each row of `predictions` is a 0/1 indicator vector, so the total sum
# is the number of (document, label) assignments
print("Number of labels assigned: {}".format(predictions.sum()))

# In[25]:

# Let's check out some metrics
from sklearn.metrics import f1_score, precision_score, recall_score

# How's the quality?
precision = precision_score(test_labels, predictions, average='micro')
recall = recall_score(test_labels, predictions, average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
print("Micro-average quality metrics")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
print("Macro-average quality metrics")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

# ### More fun facts about f1_score etc.
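# Before the toy example below, here is a minimal sketch (added; not part of
# the original run) of what 'micro' and 'macro' averaging compute under the
# hood. The indicator matrices are made-up toy data, not Reuters output.

# In[ ]:

import numpy as np
from sklearn.metrics import f1_score

toy_true = np.array([[1, 0, 1],
                     [0, 1, 0],
                     [1, 1, 0]])
toy_pred = np.array([[1, 0, 0],
                     [0, 1, 1],
                     [1, 0, 0]])

# Micro: pool true positives, false positives and false negatives over all
# labels first, then compute a single precision/recall/F1
tp = np.logical_and(toy_true == 1, toy_pred == 1).sum()
fp = np.logical_and(toy_true == 0, toy_pred == 1).sum()
fn = np.logical_and(toy_true == 1, toy_pred == 0).sum()
micro_p = tp / (tp + fp)
micro_r = tp / (tp + fn)
print("Micro F1 by hand: {:.4f}".format(2 * micro_p * micro_r / (micro_p + micro_r)))
print("Micro F1 sklearn: {:.4f}".format(f1_score(toy_true, toy_pred, average='micro')))

# Macro: compute F1 per label, then take the unweighted mean, so rare labels
# weigh as much as frequent ones; this is why macro scores on Reuters, with
# its many rare categories, come out lower than micro scores
per_label = [f1_score(toy_true[:, j], toy_pred[:, j]) for j in range(toy_true.shape[1])]
print("Macro F1 by hand: {:.4f}".format(np.mean(per_label)))
print("Macro F1 sklearn: {:.4f}".format(f1_score(toy_true, toy_pred, average='macro')))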
# In[21]:

from sklearn.metrics import f1_score

# A toy multiclass example: only the two 0s are predicted correctly
y_true = [0, 1, 2, 0, 1, 2]
y_pred = [0, 2, 1, 0, 0, 1]

# Outside a notebook, bare expressions print nothing, so wrap each call in print()
print(f1_score(y_true, y_pred, average='macro'))
print(f1_score(y_true, y_pred, average='micro'))
print(f1_score(y_true, y_pred, average='weighted'))
print(f1_score(y_true, y_pred, average=None))  # one score per class

# In[24]:

help(f1_score)
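# As a hedged follow-up sketch (not in the original notebook): average=None
# returns one score per label, which pairs naturally with
# multilabelbin.classes_ from the classification cell above. This assumes
# test_labels, predictions and multilabelbin are still in scope.

# In[ ]:

from pprint import pprint
from sklearn.metrics import f1_score

per_category_f1 = f1_score(test_labels, predictions, average=None)
by_score = sorted(zip(multilabelbin.classes_, per_category_f1),
                  key=lambda pair: pair[1], reverse=True)
print("Categories with the highest F1:")
pprint(by_score[:5])
print("Categories with the lowest F1:")
pprint(by_score[-5:])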