#!/usr/bin/env python
# coding: utf-8

# [Sebastian Raschka](http://www.sebastianraschka.com)
#
# [back](https://github.com/rasbt/matplotlib-gallery) to the `matplotlib-gallery` at [https://github.com/rasbt/matplotlib-gallery](https://github.com/rasbt/matplotlib-gallery)

# In[1]: / In[2]: / In[3]:

# Notebook setup: run the three line magics of the original cells in their
# original order (`%load_ext watermark`, `%watermark ...`, `%matplotlib inline`).
#
# [More info](http://nbviewer.ipython.org/github/rasbt/python_reference/blob/master/ipython_magic/watermark.ipynb) about the `%watermark` extension
_shell = get_ipython()
for _magic_name, _magic_args in (
    ('load_ext', 'watermark'),
    ('watermark', '-u -v -d -p matplotlib,numpy'),
    ('matplotlib', 'inline'),
):
    _shell.run_line_magic(_magic_name, _magic_args)

#
#
# # Histograms in matplotlib

# # Sections

# - [Simple histograms](#Simple-histograms)
#
#     - [Fixed bin size](#Fixed-bin-size)
#
#     - [Fixed number of bins](#Fixed-number-of-bins)
#
# - [Histogram of 2 overlapping data sets](#Histogram-of-2-overlapping-data-sets)
#
# - [Histogram showing bar heights but without area under the bars](#Histogram-showing-bar-heights-but-without-area-under-the-bars)
#
#
#
#
# # Simple histograms
# [[back to top](#Sections)]
#
#
# ### Fixed bin size
# [[back to top](#Sections)]

# In[26]:

import numpy as np
import random
from matplotlib import pyplot as plt

# 1000 draws from a normal distribution with mean 0 and standard deviation 20.
data = np.random.normal(loc=0, scale=20, size=1000)

# Bin edges every 5 units starting at -100; `np.arange` excludes the stop
# value, so the last edge is 95.
step = 5
bins = np.arange(-100, 100, step)

# Leave a margin of 5 units on either side of the observed data range.
x_lower = min(data) - 5
x_upper = max(data) + 5
plt.xlim([x_lower, x_upper])

plt.hist(data, bins=bins, alpha=0.5)
plt.title('Random Gaussian data (fixed bin size)')
plt.xlabel('variable X (bin size = 5)')
plt.ylabel('count')

plt.show()

#
#
# ### Fixed number of bins
# [[back to top](#Sections)]

# In[30]:

import numpy as np
import random
import math
from matplotlib import pyplot as plt

data = np.random.normal(0, 20, 1000)

# fixed number of bins
# N bins need N + 1 edges, so 21 edges are generated for the 20 bins the
# labels advertise (20 edges would only give 19 bins).
# The range is rounded outward (floor of the minimum, ceil of the maximum) so
# that the smallest and largest samples fall inside the first and last bin
# instead of outside the binned range.
num_bins = 20
bins = np.linspace(math.floor(min(data)),
                   math.ceil(max(data)),
                   num_bins + 1)

plt.xlim([min(data)-5, max(data)+5])

plt.hist(data, bins=bins, alpha=0.5)
plt.title('Random Gaussian data (fixed number of bins)')
plt.xlabel('variable X (20 evenly spaced bins)')
plt.ylabel('count')

plt.show()

#
#
# # Histogram of 2 overlapping data sets
# [[back to top](#Sections)]

# In[4]:

import numpy as np
import random
from matplotlib import pyplot as plt

data1 = [random.gauss(15,10) for i in range(500)]
data2 = [random.gauss(5,5) for i in range(500)]

# Concatenate once; the combined range is needed for both axis limits.
combined = data1 + data2

bins = np.arange(-60, 60, 2.5)
plt.xlim([min(combined)-5, max(combined)+5])

plt.hist(data1, bins=bins, alpha=0.3, label='class 1')
plt.hist(data2, bins=bins, alpha=0.3, label='class 2')
plt.title('Random Gaussian data')
plt.xlabel('variable X')
plt.ylabel('count')
plt.legend(loc='upper right')

plt.show()


# In[35]:

import numpy as np
import random
import math
from matplotlib import pyplot as plt
import matplotlib.mlab as mlab
from scipy.stats import norm
from scipy.interpolate import interp1d

data = np.random.normal(0, 20, 10000)

# plotting the histogram
# `density=True` replaces the removed `normed` keyword: bar heights are
# normalised so the fitted PDF can be drawn on the same scale.
n, bins, patches = plt.hist(data, bins=20, density=True, alpha=0.5, color='lightblue')

# fitting the data
mu, sigma = norm.fit(data)

# adding the fitted line
# `scipy.stats.norm.pdf` replaces `matplotlib.mlab.normpdf`, which is no
# longer available in matplotlib.
y = norm.pdf(bins, mu, sigma)

# In[32]: / In[33]:
# These cells originally appeared before `interp1d`, `bins` and `y` were
# defined (out-of-order notebook execution) and raised NameError when the
# file was run as a script; the interpolant is built here instead.
smooth = interp1d(bins, y, kind='cubic')
interp = smooth  # name used by the original In[35] cell

# Evaluate the interpolant on a dense grid of x positions spanning the bin
# edges. (The original called `interp(y)`, i.e. it evaluated the curve at the
# PDF values rather than at x positions.)
x_fine = np.linspace(bins[0], bins[-1], 200)
plt.plot(x_fine, smooth(x_fine), linewidth=2, color='blue')

plt.xlim([min(data)-5, max(data)+5])

plt.title('Random Gaussian data (fixed number of bins)')
plt.xlabel('variable X (20 evenly spaced bins)')
# The histogram is density-normalised, so the axis shows density, not counts.
plt.ylabel('probability density')

plt.show()

#
#
# # Histogram showing bar heights but without area under the bars
# [[back to top](#Sections)]

# The line plot below is using bins of a histogram and is particularly useful if you are working with many different overlapping data sets.

# In[33]:

# Generate a random Gaussian dataset with different means
# 5 rows with 30 columns, where every row represents 1 sample.

import numpy as np

data = np.ones((5,30))
for i in range(5):
    data[i,:] = np.random.normal(loc=i/2, scale=1.0, size=30)


# Via the `numpy.histogram` function, we can categorize our data into distinct bins.

# In[34]:

from math import floor, ceil # for rounding up and down

data_min = floor(data.min()) # minimum val. of the dataset rounded down
data_max = ceil(data.max()) # maximum val. of the dataset rounded up

bin_size = 0.5
bins_size = bin_size  # original (misspelled) name, kept as an alias

# `np.arange` excludes its stop value, so extend the stop by one bin width to
# make `data_max` itself the last edge; otherwise the largest samples would
# lie outside every bin.
bins = np.arange(data_min, data_max + bin_size, bin_size)
np.histogram(data[0,:], bins=bins)


# The [`numpy.histogram`](http://docs.scipy.org/doc/numpy/reference/generated/numpy.histogram.html) function returns a tuple, where the first value is an array of how many samples fall into the first bin, the second bin, and so forth.
# The second value is another NumPy array; it contains the specified bins. Note that all bins but the last one are half open intervals, e.g., the first bin would be `[-2, -1.5)` (including -2, but not including -1.5), and the second bin would be `[-1.5, -1.)` (including -1.5, but not including -1.0). But the last bin is defined as `[2., 2.5]` (including 2 and including 2.5).
# In[57]:

from matplotlib import pyplot as plt

# NOTE: `markers` is not referenced by the plotting calls below.
markers = ['^', 'v', 'o', 'p', 'x', 's', 'p', ',']

plt.figure(figsize=(13,8))

# One set of horizontal bars per row of `data`: each bar is centred on its
# bin and spans the bin width via `xerr`, at the height of the bin count.
for row in range(data.shape[0]):
    counts, edges = np.histogram(data[row,:], bins=bins)
    plt.errorbar(edges[:-1] + bin_size/2,
                 counts,
                 alpha=0.3,
                 xerr=bin_size/2,
                 capsize=0,
                 fmt='none',  # the string 'none' draws error bars only; `fmt=None` is the deprecated spelling
                 linewidth=8,
                 )

# One legend entry per plotted row (the original hard-coded 5 entries).
plt.legend(['sample %s'%i for i in range(1, data.shape[0] + 1)])
plt.grid()
plt.title('Histogram showing bar heights but without area under the bars', fontsize=18)
plt.ylabel('count', fontsize=14)
plt.xlabel('X value (bin size = %s)'%bin_size, fontsize=14)
plt.xticks(bins + bin_size)
plt.show()


# In[ ]: