{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Lets play with Reuters collection in NLTK\n", "\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Documents: 10788\n", "Total train documents: 7769\n", "Total test documents: 3019\n" ] } ], "source": [ "from nltk.corpus import reuters\n", "\n", "# List of document ids\n", "documents = reuters.fileids()\n", "print(\"Documents: {}\".format(len(documents)))\n", "\n", "# Train documents\n", "train_docs_id = list(filter(lambda doc: doc.startswith(\"train\"), documents))\n", "print(\"Total train documents: {}\".format(len(train_docs_id)))\n", "\n", "# Test documents\n", "test_docs_id = list(filter(lambda doc: doc.startswith(\"test\"), documents))\n", "print(\"Total test documents: {}\".format(len(test_docs_id)))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "FRENCH FREE MARKET CEREAL EXPORT BIDS DETAILED\n", " French operators have requested licences\n", " to export 675,500 tonnes of maize, 245,000 tonnes of barley,\n", " 22,000 tonnes of soft bread wheat and 20,000 tonnes of feed\n", " wheat at today's European Community tender, traders said.\n", " Rebates requested ranged from 127.75 to 132.50 European\n", " Currency Units a tonne for maize, 136.00 to 141.00 Ecus a tonne\n", " for barley and 134.25 to 141.81 Ecus for bread wheat, while\n", " rebates requested for feed wheat were 137.65 Ecus, they said.\n", " \n", "\n", "\n" ] } ], "source": [ "# Let's get a document with multiple labels\n", "doc = 'training/9865'\n", "print(reuters.raw(doc))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['barley', 'corn', 'grain', 'wheat']\n" ] } ], "source": [ "print(reuters.categories(doc))" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of categories: 90\n" ] } ], "source": [ "from operator import itemgetter\n", "from pprint import pprint\n", "\n", "# List categories\n", "categories = reuters.categories()\n", "print(\"Number of categories: \", len(categories))" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Most common categories: \n", "[('castor-oil', 2),\n", " ('groundnut-oil', 2),\n", " ('lin-oil', 2),\n", " ('rye', 2),\n", " ('sun-meal', 2)]\n" ] } ], "source": [ "# Document per category\n", "category_dist = [(category, len(reuters.fileids(category))) for category in categories]\n", "category_dist = sorted(category_dist, key=itemgetter(1), reverse=True)\n", "\n", "print(\"Most common categories: \")\n", "pprint(category_dist[-5:])" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of labels assigned: 3126\n" ] } ], "source": [ "from nltk.corpus import stopwords\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.preprocessing import MultiLabelBinarizer\n", "from sklearn.svm import LinearSVC\n", "from sklearn.multiclass import OneVsRestClassifier\n", "\n", "stop_words = stopwords.words(\"english\")\n", "\n", "train_docs_id = list(filter(lambda doc: doc.startswith(\"train\"), documents))\n", "test_doc_id = list(filter(lambda doc: doc.startswith(\"test\"), 
documents))\n", "\n", "train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]\n", "test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]\n", "\n", "# Tokenize\n", "vectorizer = TfidfVectorizer(stop_words = stop_words)\n", "\n", "# Learn and transform train documents\n", "vectorized_train_docs = vectorizer.fit_transform(train_docs)\n", "vectorized_test_docs = vectorizer.transform(test_docs)\n", "\n", "# Transform multi-labels labels\n", "multilabelbin = MultiLabelBinarizer()\n", "train_labels = multilabelbin.fit_transform([reuters.categories(doc_id) for doc_id in train_docs_id])\n", "test_labels = multilabelbin.transform([reuters.categories(doc_id) for doc_id in test_docs_id])\n", "\n", "# Classification\n", "classifier = OneVsRestClassifier(LinearSVC(random_state=52)) #why this random state?\n", "classifier.fit(vectorized_train_docs, train_labels)\n", "\n", "# Predict\n", "predictions = classifier.predict(vectorized_test_docs)\n", "\n", "# Print\n", "print(\"Number of labels assigned: {}\".format(sum([sum(prediction) for prediction in predictions])))" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Micro average quality metrics\n", "Precision: 0.9517, Recall: 0.7946, F1-measure: 0.8661\n", "Macro-average quality numbers\n", "Precision: 0.6305, Recall: 0.3715, F1-measure: 0.4451\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/tarrysingh/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples.\n", " 'precision', 'predicted', average, warn_for)\n", "/Users/tarrysingh/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.\n", " 'precision', 'predicted', average, warn_for)\n" ] } ], "source": [ "# Lets check ou some metrics\n", "from sklearn.metrics import f1_score, precision_score, recall_score\n", "\n", "# How's the quality?\n", "precision = precision_score(test_labels, predictions, average='micro')\n", "recall = recall_score(test_labels, predictions, average='micro') \n", "f1 = f1_score(test_labels, predictions, average='micro')\n", "print(\"Micro average quality metrics\")\n", "print(\"Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}\".format(precision, \n", " recall, \n", " f1))\n", "\n", "precision = precision_score(test_labels, predictions, average='macro')\n", "recall = recall_score(test_labels, predictions, average='macro')\n", "f1 = f1_score(test_labels, predictions, average='macro')\n", "print(\"Macro-average quality numbers\")\n", "print(\"Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}\".format(precision, \n", " recall, \n", " f1))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "### More fun facts about f1_score etc. " ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 0.8, 0. , 0. 
])" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.metrics import f1_score\n", "y_true = [0, 1, 2, 0, 1, 2]\n", "y_pred = [0, 2, 1, 0, 0, 1]\n", "f1_score(y_true, y_pred, average='macro') \n", "\n", "f1_score(y_true, y_pred, average='micro') \n", "\n", "f1_score(y_true, y_pred, average='weighted') \n", "\n", "f1_score(y_true, y_pred, average=None)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Help on function f1_score in module sklearn.metrics.classification:\n", "\n", "f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary', sample_weight=None)\n", " Compute the F1 score, also known as balanced F-score or F-measure\n", " \n", " The F1 score can be interpreted as a weighted average of the precision and\n", " recall, where an F1 score reaches its best value at 1 and worst score at 0.\n", " The relative contribution of precision and recall to the F1 score are\n", " equal. The formula for the F1 score is::\n", " \n", " F1 = 2 * (precision * recall) / (precision + recall)\n", " \n", " In the multi-class and multi-label case, this is the weighted average of\n", " the F1 score of each class.\n", " \n", " Read more in the :ref:`User Guide `.\n", " \n", " Parameters\n", " ----------\n", " y_true : 1d array-like, or label indicator array / sparse matrix\n", " Ground truth (correct) target values.\n", " \n", " y_pred : 1d array-like, or label indicator array / sparse matrix\n", " Estimated targets as returned by a classifier.\n", " \n", " labels : list, optional\n", " The set of labels to include when ``average != 'binary'``, and their\n", " order if ``average is None``. Labels present in the data can be\n", " excluded, for example to calculate a multiclass average ignoring a\n", " majority negative class, while labels not present in the data will\n", " result in 0 components in a macro average. For multilabel targets,\n", " labels are column indices. By default, all labels in ``y_true`` and\n", " ``y_pred`` are used in sorted order.\n", " \n", " .. versionchanged:: 0.17\n", " parameter *labels* improved for multiclass problem.\n", " \n", " pos_label : str or int, 1 by default\n", " The class to report if ``average='binary'`` and the data is binary.\n", " If the data are multiclass or multilabel, this will be ignored;\n", " setting ``labels=[pos_label]`` and ``average != 'binary'`` will report\n", " scores for that label only.\n", " \n", " average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', 'weighted']\n", " This parameter is required for multiclass/multilabel targets.\n", " If ``None``, the scores for each class are returned. Otherwise, this\n", " determines the type of averaging performed on the data:\n", " \n", " ``'binary'``:\n", " Only report results for the class specified by ``pos_label``.\n", " This is applicable only if targets (``y_{true,pred}``) are binary.\n", " ``'micro'``:\n", " Calculate metrics globally by counting the total true positives,\n", " false negatives and false positives.\n", " ``'macro'``:\n", " Calculate metrics for each label, and find their unweighted\n", " mean. This does not take label imbalance into account.\n", " ``'weighted'``:\n", " Calculate metrics for each label, and find their average, weighted\n", " by support (the number of true instances for each label). 
This\n", " alters 'macro' to account for label imbalance; it can result in an\n", " F-score that is not between precision and recall.\n", " ``'samples'``:\n", " Calculate metrics for each instance, and find their average (only\n", " meaningful for multilabel classification where this differs from\n", " :func:`accuracy_score`).\n", " \n", " sample_weight : array-like of shape = [n_samples], optional\n", " Sample weights.\n", " \n", " Returns\n", " -------\n", " f1_score : float or array of float, shape = [n_unique_labels]\n", " F1 score of the positive class in binary classification or weighted\n", " average of the F1 scores of each class for the multiclass task.\n", " \n", " References\n", " ----------\n", " .. [1] `Wikipedia entry for the F1-score\n", " `_\n", " \n", " Examples\n", " --------\n", " >>> from sklearn.metrics import f1_score\n", " >>> y_true = [0, 1, 2, 0, 1, 2]\n", " >>> y_pred = [0, 2, 1, 0, 0, 1]\n", " >>> f1_score(y_true, y_pred, average='macro') # doctest: +ELLIPSIS\n", " 0.26...\n", " >>> f1_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS\n", " 0.33...\n", " >>> f1_score(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS\n", " 0.26...\n", " >>> f1_score(y_true, y_pred, average=None)\n", " array([ 0.8, 0. , 0. ])\n", "\n" ] } ], "source": [ "help(f1_score)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 2 }