{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Lets play with Reuters collection in NLTK\n", "\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Documents: 10788\n", "Total train documents: 7769\n", "Total test documents: 3019\n" ] } ], "source": [ "from nltk.corpus import reuters\n", "\n", "# List of document ids\n", "documents = reuters.fileids()\n", "print(\"Documents: {}\".format(len(documents)))\n", "\n", "# Train documents\n", "train_docs_id = list(filter(lambda doc: doc.startswith(\"train\"), documents))\n", "print(\"Total train documents: {}\".format(len(train_docs_id)))\n", "\n", "# Test documents\n", "test_docs_id = list(filter(lambda doc: doc.startswith(\"test\"), documents))\n", "print(\"Total test documents: {}\".format(len(test_docs_id)))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "FRENCH FREE MARKET CEREAL EXPORT BIDS DETAILED\n", " French operators have requested licences\n", " to export 675,500 tonnes of maize, 245,000 tonnes of barley,\n", " 22,000 tonnes of soft bread wheat and 20,000 tonnes of feed\n", " wheat at today's European Community tender, traders said.\n", " Rebates requested ranged from 127.75 to 132.50 European\n", " Currency Units a tonne for maize, 136.00 to 141.00 Ecus a tonne\n", " for barley and 134.25 to 141.81 Ecus for bread wheat, while\n", " rebates requested for feed wheat were 137.65 Ecus, they said.\n", " \n", "\n", "\n" ] } ], "source": [ "# Let's get a document with multiple labels\n", "doc = 'training/9865'\n", "print(reuters.raw(doc))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['barley', 'corn', 'grain', 'wheat']\n" ] } ], "source": [ "print(reuters.categories(doc))" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of categories: 90\n" ] } ], "source": [ "from operator import itemgetter\n", "from pprint import pprint\n", "\n", "# List categories\n", "categories = reuters.categories()\n", "print(\"Number of categories: \", len(categories))" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Most common categories: \n", "[('castor-oil', 2),\n", " ('groundnut-oil', 2),\n", " ('lin-oil', 2),\n", " ('rye', 2),\n", " ('sun-meal', 2)]\n" ] } ], "source": [ "# Document per category\n", "category_dist = [(category, len(reuters.fileids(category))) for category in categories]\n", "category_dist = sorted(category_dist, key=itemgetter(1), reverse=True)\n", "\n", "print(\"Most common categories: \")\n", "pprint(category_dist[-5:])" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of labels assigned: 3126\n" ] } ], "source": [ "from nltk.corpus import stopwords\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.preprocessing import MultiLabelBinarizer\n", "from sklearn.svm import LinearSVC\n", "from sklearn.multiclass import OneVsRestClassifier\n", "\n", "stop_words = stopwords.words(\"english\")\n", "\n", "train_docs_id = list(filter(lambda doc: doc.startswith(\"train\"), documents))\n", "test_doc_id = list(filter(lambda doc: doc.startswith(\"test\"), 
documents))\n", "\n", "train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]\n", "test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]\n", "\n", "# Tokenize\n", "vectorizer = TfidfVectorizer(stop_words = stop_words)\n", "\n", "# Learn and transform train documents\n", "vectorized_train_docs = vectorizer.fit_transform(train_docs)\n", "vectorized_test_docs = vectorizer.transform(test_docs)\n", "\n", "# Transform multi-labels labels\n", "multilabelbin = MultiLabelBinarizer()\n", "train_labels = multilabelbin.fit_transform([reuters.categories(doc_id) for doc_id in train_docs_id])\n", "test_labels = multilabelbin.transform([reuters.categories(doc_id) for doc_id in test_docs_id])\n", "\n", "# Classification\n", "classifier = OneVsRestClassifier(LinearSVC(random_state=52)) #why this random state?\n", "classifier.fit(vectorized_train_docs, train_labels)\n", "\n", "# Predict\n", "predictions = classifier.predict(vectorized_test_docs)\n", "\n", "# Print\n", "print(\"Number of labels assigned: {}\".format(sum([sum(prediction) for prediction in predictions])))" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Micro average quality metrics\n", "Precision: 0.9517, Recall: 0.7946, F1-measure: 0.8661\n", "Macro-average quality numbers\n", "Precision: 0.6305, Recall: 0.3715, F1-measure: 0.4451\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/tarrysingh/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples.\n", " 'precision', 'predicted', average, warn_for)\n", "/Users/tarrysingh/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.\n", " 'precision', 'predicted', average, warn_for)\n" ] } ], "source": [ "# Lets check ou some metrics\n", "from sklearn.metrics import f1_score, precision_score, recall_score\n", "\n", "# How's the quality?\n", "precision = precision_score(test_labels, predictions, average='micro')\n", "recall = recall_score(test_labels, predictions, average='micro') \n", "f1 = f1_score(test_labels, predictions, average='micro')\n", "print(\"Micro average quality metrics\")\n", "print(\"Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}\".format(precision, \n", " recall, \n", " f1))\n", "\n", "precision = precision_score(test_labels, predictions, average='macro')\n", "recall = recall_score(test_labels, predictions, average='macro')\n", "f1 = f1_score(test_labels, predictions, average='macro')\n", "print(\"Macro-average quality numbers\")\n", "print(\"Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}\".format(precision, \n", " recall, \n", " f1))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "### More fun facts about f1_score etc. " ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 0.8, 0. , 0. 
])" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.metrics import f1_score\n", "y_true = [0, 1, 2, 0, 1, 2]\n", "y_pred = [0, 2, 1, 0, 0, 1]\n", "f1_score(y_true, y_pred, average='macro') \n", "\n", "f1_score(y_true, y_pred, average='micro') \n", "\n", "f1_score(y_true, y_pred, average='weighted') \n", "\n", "f1_score(y_true, y_pred, average=None)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Help on function f1_score in module sklearn.metrics.classification:\n", "\n", "f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary', sample_weight=None)\n", " Compute the F1 score, also known as balanced F-score or F-measure\n", " \n", " The F1 score can be interpreted as a weighted average of the precision and\n", " recall, where an F1 score reaches its best value at 1 and worst score at 0.\n", " The relative contribution of precision and recall to the F1 score are\n", " equal. The formula for the F1 score is::\n", " \n", " F1 = 2 * (precision * recall) / (precision + recall)\n", " \n", " In the multi-class and multi-label case, this is the weighted average of\n", " the F1 score of each class.\n", " \n", " Read more in the :ref:`User Guide `.\n", " \n", " Parameters\n", " ----------\n", " y_true : 1d array-like, or label indicator array / sparse matrix\n", " Ground truth (correct) target values.\n", " \n", " y_pred : 1d array-like, or label indicator array / sparse matrix\n", " Estimated targets as returned by a classifier.\n", " \n", " labels : list, optional\n", " The set of labels to include when ``average != 'binary'``, and their\n", " order if ``average is None``. Labels present in the data can be\n", " excluded, for example to calculate a multiclass average ignoring a\n", " majority negative class, while labels not present in the data will\n", " result in 0 components in a macro average. For multilabel targets,\n", " labels are column indices. By default, all labels in ``y_true`` and\n", " ``y_pred`` are used in sorted order.\n", " \n", " .. versionchanged:: 0.17\n", " parameter *labels* improved for multiclass problem.\n", " \n", " pos_label : str or int, 1 by default\n", " The class to report if ``average='binary'`` and the data is binary.\n", " If the data are multiclass or multilabel, this will be ignored;\n", " setting ``labels=[pos_label]`` and ``average != 'binary'`` will report\n", " scores for that label only.\n", " \n", " average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', 'weighted']\n", " This parameter is required for multiclass/multilabel targets.\n", " If ``None``, the scores for each class are returned. Otherwise, this\n", " determines the type of averaging performed on the data:\n", " \n", " ``'binary'``:\n", " Only report results for the class specified by ``pos_label``.\n", " This is applicable only if targets (``y_{true,pred}``) are binary.\n", " ``'micro'``:\n", " Calculate metrics globally by counting the total true positives,\n", " false negatives and false positives.\n", " ``'macro'``:\n", " Calculate metrics for each label, and find their unweighted\n", " mean. This does not take label imbalance into account.\n", " ``'weighted'``:\n", " Calculate metrics for each label, and find their average, weighted\n", " by support (the number of true instances for each label). 
This\n", " alters 'macro' to account for label imbalance; it can result in an\n", " F-score that is not between precision and recall.\n", " ``'samples'``:\n", " Calculate metrics for each instance, and find their average (only\n", " meaningful for multilabel classification where this differs from\n", " :func:`accuracy_score`).\n", " \n", " sample_weight : array-like of shape = [n_samples], optional\n", " Sample weights.\n", " \n", " Returns\n", " -------\n", " f1_score : float or array of float, shape = [n_unique_labels]\n", " F1 score of the positive class in binary classification or weighted\n", " average of the F1 scores of each class for the multiclass task.\n", " \n", " References\n", " ----------\n", " .. [1] `Wikipedia entry for the F1-score\n", " `_\n", " \n", " Examples\n", " --------\n", " >>> from sklearn.metrics import f1_score\n", " >>> y_true = [0, 1, 2, 0, 1, 2]\n", " >>> y_pred = [0, 2, 1, 0, 0, 1]\n", " >>> f1_score(y_true, y_pred, average='macro') # doctest: +ELLIPSIS\n", " 0.26...\n", " >>> f1_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS\n", " 0.33...\n", " >>> f1_score(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS\n", " 0.26...\n", " >>> f1_score(y_true, y_pred, average=None)\n", " array([ 0.8, 0. , 0. ])\n", "\n" ] } ], "source": [ "help(f1_score)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 2 }