#!/usr/bin/env python
# coding: utf-8

# # Autograd

# Autograd can automatically differentiate native Python and Numpy code. It can handle a large subset of Python's features, including loops, ifs, recursion and closures, and it can even take derivatives of derivatives of derivatives. It uses reverse-mode differentiation (a.k.a. backpropagation), which means it can efficiently take gradients of scalar-valued functions with respect to array-valued arguments. There's also a forward-mode extension, which lets you arbitrarily mix forward- and reverse-mode accumulation. The main intended application of autograd is gradient-based optimization.

# In[24]:


# Use `pip install autograd` OR `conda install autograd` (if you have Anaconda)
import autograd.numpy as np
from autograd import grad
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')


# In[25]:


# Define a simple tanh function
def tanh(x):
    y = np.exp(-2.0 * x)
    return (1.0 - y) / (1.0 + y)

grad_tanh = grad(tanh)  # get its gradient function


# In[26]:


# Calculate the gradient at x = 1.0
grad_tanh(1.0)


# In[27]:


# Compare to a central finite-difference estimate
(tanh(1.0001) - tanh(0.9999)) / 0.0002


# #### Differentiate as many times as you like

# We can continue to differentiate as many times as we like, broadcasting across many different input values. Since `grad` only handles scalar-valued outputs, we use `elementwise_grad` for array inputs:

# In[ ]:


from autograd import elementwise_grad as egrad  # for functions that vectorize over inputs

x = np.linspace(-7, 7, 200)
plt.plot(x, tanh(x),
         x, egrad(tanh)(x),                                      # first derivative
         x, egrad(egrad(tanh))(x),                               # second derivative
         x, egrad(egrad(egrad(tanh)))(x),                        # third derivative
         x, egrad(egrad(egrad(egrad(tanh))))(x),                 # fourth derivative
         x, egrad(egrad(egrad(egrad(egrad(tanh)))))(x),          # fifth derivative
         x, egrad(egrad(egrad(egrad(egrad(egrad(tanh))))))(x))   # sixth derivative
plt.axis('off')
plt.savefig("tanh.png")
plt.show()


# ## Another simpler example

# In[37]:


def taylor_sine(x):
    # Taylor approximation to the sine function
    ans = currterm = x
    i = 0
    while np.abs(currterm) > 0.001:
        currterm = -currterm * x**2 / ((2 * i + 3) * (2 * i + 2))
        ans = ans + currterm
        i += 1
    return ans

grad_sine = grad(taylor_sine)
print("Gradient of sin(pi) is: ", grad_sine(np.pi))


# ## Complete Example: Logistic Regression

# A common use case for automatic differentiation is to train a probabilistic model. Here we present a very simple (but complete) example of specifying and training a logistic regression model for binary classification:

# In[39]:


def sigmoid(x):
    return 0.5 * (np.tanh(x) + 1)

def logistic_predictions(weights, inputs):
    # Outputs probability of a label being true according to logistic model.
    return sigmoid(np.dot(inputs, weights))

def training_loss(weights):
    # Training loss is the negative log-likelihood of the training labels.
    preds = logistic_predictions(weights, inputs)
    label_probabilities = preds * targets + (1 - preds) * (1 - targets)
    return -np.sum(np.log(label_probabilities))

# Build a toy dataset.
inputs = np.array([[0.52,  1.12,  0.77],
                   [0.88, -1.08,  0.15],
                   [0.52,  0.06, -1.30],
                   [0.74, -2.49,  1.39]])
targets = np.array([True, True, False, True])

# Define a function that returns gradients of training loss using autograd.
training_gradient_fun = grad(training_loss)
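# Before optimizing, we can sanity-check the gradient function. The next cell is an added
# check that is not part of the original example: it compares one component of autograd's
# gradient against a central finite difference at an arbitrary probe point (the all-zeros
# weight vector), with an arbitrary step size `eps`.

# In[ ]:


# Added sanity check (probe point and step size are assumptions, not from the original notebook).
eps = 1e-4
w0 = np.array([0.0, 0.0, 0.0])
autograd_grad0 = training_gradient_fun(w0)[0]   # d(loss)/d(weights[0]) via autograd
w_plus, w_minus = w0.copy(), w0.copy()
w_plus[0] += eps
w_minus[0] -= eps
numeric_grad0 = (training_loss(w_plus) - training_loss(w_minus)) / (2 * eps)
print("autograd:", autograd_grad0, "  finite difference:", numeric_grad0)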
# Optimize weights using gradient descent.
weights = np.array([0.0, 0.0, 0.0])
print("Initial loss:", training_loss(weights))
for i in range(100):
    weights -= training_gradient_fun(weights) * 0.01

print("Trained loss:", training_loss(weights))


# ## A Simple Neural Network

# A multi-layer perceptron for classification of MNIST handwritten digits.

# In[43]:


from __future__ import absolute_import, division
from __future__ import print_function
import autograd.numpy as np
import autograd.numpy.random as npr
# Note: newer autograd releases move these helpers to autograd.scipy.special.logsumexp,
# autograd.misc.flatten and autograd.misc.optimizers.adam; the paths below are for the
# version this notebook was written against.
from autograd.scipy.misc import logsumexp
from autograd import grad
from autograd.util import flatten
from autograd.optimizers import adam
from data import load_mnist  # MNIST loading helper shipped with autograd's examples


# In[44]:


def init_random_params(scale, layer_sizes, rs=npr.RandomState(0)):
    """Build a list of (weights, biases) tuples, one for each layer in the net."""
    return [(scale * rs.randn(m, n),   # weight matrix
             scale * rs.randn(n))      # bias vector
            for m, n in zip(layer_sizes[:-1], layer_sizes[1:])]


# In[45]:


def neural_net_predict(params, inputs):
    """Implements a deep neural network for classification.
       params is a list of (weights, bias) tuples.
       inputs is an (N x D) matrix.
       returns normalized class log-probabilities."""
    for W, b in params:
        outputs = np.dot(inputs, W) + b
        inputs = np.tanh(outputs)
    return outputs - logsumexp(outputs, axis=1, keepdims=True)


# In[46]:


def l2_norm(params):
    """Computes l2 norm of params by flattening them into a vector."""
    flattened, _ = flatten(params)
    return np.dot(flattened, flattened)


# In[47]:


def log_posterior(params, inputs, targets, L2_reg):
    log_prior = -L2_reg * l2_norm(params)
    log_lik = np.sum(neural_net_predict(params, inputs) * targets)
    return log_prior + log_lik


# In[48]:


def accuracy(params, inputs, targets):
    target_class = np.argmax(targets, axis=1)
    predicted_class = np.argmax(neural_net_predict(params, inputs), axis=1)
    return np.mean(predicted_class == target_class)


# In[50]:


if __name__ == '__main__':
    # Model parameters
    layer_sizes = [784, 200, 100, 10]
    L2_reg = 1.0

    # Training parameters
    param_scale = 0.1
    batch_size = 256
    num_epochs = 10
    step_size = 0.001

    print("Loading training data...")
    N, train_images, train_labels, test_images, test_labels = load_mnist()

    init_params = init_random_params(param_scale, layer_sizes)

    num_batches = int(np.ceil(len(train_images) / batch_size))

    def batch_indices(iter):
        idx = iter % num_batches
        return slice(idx * batch_size, (idx + 1) * batch_size)

    # Define training objective
    def objective(params, iter):
        idx = batch_indices(iter)
        return -log_posterior(params, train_images[idx], train_labels[idx], L2_reg)

    # Get gradient of objective using autograd.
    objective_grad = grad(objective)

    print("     Epoch     |    Train accuracy  |    Test accuracy  ")
    def print_perf(params, iter, gradient):
        if iter % num_batches == 0:
            train_acc = accuracy(params, train_images, train_labels)
            test_acc = accuracy(params, test_images, test_labels)
            print("{:15}|{:20}|{:20}".format(iter // num_batches, train_acc, test_acc))

    # The optimizers provided can optimize lists, tuples, or dicts of parameters.
    optimized_params = adam(objective_grad, init_params, step_size=step_size,
                            num_iters=num_epochs * num_batches, callback=print_perf)


# ### Coming up...
# 
# 1. **Convolutional Neural Net** Example
# 2. **RNN - Recurrent Neural Net** Example
# 3. **LSTM - Long Short-Term Memory** Example
# 4. **Backprop - Back Propagation with some fluid simulation** Example
# 5. **GAN - Generative Adversarial Net** Example
# 6. **Gaussian & Deep Gaussian** Example
# 7. **Bayesian Neural Net** Example
# 8. And much more...
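# ### Aside: the optimizer interface, in miniature

# The `adam` call above takes a gradient function of the form `grad_fn(params, iteration)`
# and works on nested containers (lists, tuples or dicts) of parameters. The cell below is
# a small self-contained sketch of that interface on a toy quadratic; it is not part of the
# original notebook, and the names (`toy_objective`, `init`) are made up for illustration.
# The import path `autograd.misc.optimizers` follows newer autograd releases; older ones
# use `from autograd.optimizers import adam` as in the MNIST cell above.

# In[ ]:


import autograd.numpy as anp
from autograd import grad as ag_grad
from autograd.misc.optimizers import adam as adam_opt

def toy_objective(params, iteration):
    # A simple quadratic bowl over a dict of parameters; `iteration` is unused here,
    # but the optimizer interface requires the gradient function to accept it.
    return anp.sum(params['w'] ** 2) + anp.sum((params['b'] - 3.0) ** 2)

toy_grad = ag_grad(toy_objective)                # gradient with respect to the params dict
init = {'w': anp.array([1.0, -2.0]), 'b': anp.array([0.0])}
result = adam_opt(toy_grad, init, step_size=0.1, num_iters=200)
print("Optimized toy parameters:", result)       # expect w near 0 and b near 3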