# -*- coding: utf-8 -*-
"""Untitled2.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1ERPT573YXenYO4d-XY_q5dm6ffWei6xQ
"""

import numpy as np
import matplotlib.pyplot as plt


def init_params(layer_dims):
    """Initialise weights and biases for every layer.

    W[l] has shape (layer_dims[l], layer_dims[l-1]); b[l] has shape (layer_dims[l], 1).
    """
    np.random.seed(3)
    params = {}
    L = len(layer_dims)

    for l in range(1, L):
        params['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l - 1]) * 0.01
        params['b' + str(l)] = np.zeros((layer_dims[l], 1))

    return params


# Z (linear hypothesis) - Z = W*X + b,
# W - weight matrix, b - bias vector, X - input
def sigmoid(Z):
    A = 1 / (1 + np.exp(-Z))
    cache = Z

    return A, cache


def forward_prop(X, params):
    A = X  # input to the first layer, i.e. the training data
    caches = []
    L = len(params) // 2

    for l in range(1, L + 1):
        A_prev = A

        # Linear hypothesis
        Z = np.dot(params['W' + str(l)], A_prev) + params['b' + str(l)]

        # Storing the linear cache
        linear_cache = (A_prev, params['W' + str(l)], params['b' + str(l)])

        # Applying sigmoid to the linear hypothesis
        A, activation_cache = sigmoid(Z)

        # Storing both the linear and activation caches
        cache = (linear_cache, activation_cache)
        caches.append(cache)

    return A, caches


def cost_function(A, Y):
    # Binary cross-entropy cost averaged over the m examples
    m = Y.shape[1]
    cost = (-1 / m) * (np.dot(np.log(A), Y.T) + np.dot(np.log(1 - A), (1 - Y).T))

    return cost


def one_layer_backward(dA, cache):
    linear_cache, activation_cache = cache

    Z = activation_cache
    s, _ = sigmoid(Z)
    dZ = dA * s * (1 - s)  # the derivative of the sigmoid function

    A_prev, W, b = linear_cache
    m = A_prev.shape[1]

    dW = (1 / m) * np.dot(dZ, A_prev.T)
    db = (1 / m) * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)

    return dA_prev, dW, db


def backprop(AL, Y, caches):
    grads = {}
    L = len(caches)
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)

    # Derivative of the cross-entropy cost with respect to the output AL
    dAL = -(np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))

    # Gradients for the last layer
    current_cache = caches[L - 1]
    grads['dA' + str(L - 1)], grads['dW' + str(L)], grads['db' + str(L)] = one_layer_backward(dAL, current_cache)

    # Walk backwards through the remaining layers
    for l in reversed(range(L - 1)):
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = one_layer_backward(grads['dA' + str(l + 1)], current_cache)
        grads['dA' + str(l)] = dA_prev_temp
        grads['dW' + str(l + 1)] = dW_temp
        grads['db' + str(l + 1)] = db_temp

    return grads


def update_parameters(parameters, grads, learning_rate):
    L = len(parameters) // 2

    for l in range(L):
        parameters['W' + str(l + 1)] = parameters['W' + str(l + 1)] - learning_rate * grads['dW' + str(l + 1)]
        parameters['b' + str(l + 1)] = parameters['b' + str(l + 1)] - learning_rate * grads['db' + str(l + 1)]

    return parameters


def train(X, Y, layer_dims, epochs, lr):
    params = init_params(layer_dims)
    cost_history = []

    for i in range(epochs):
        Y_hat, caches = forward_prop(X, params)
        cost = cost_function(Y_hat, Y)
        cost_history.append(cost)
        grads = backprop(Y_hat, Y, caches)

        params = update_parameters(params, grads, lr)

    return params, cost_history
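

# --- Minimal usage sketch (not part of the original notebook) ---
# A quick smoke test of the training loop on synthetic data. The dataset,
# layer sizes, epoch count, and learning rate below are illustrative
# assumptions, not values taken from the original notebook.
if __name__ == '__main__':
    np.random.seed(1)
    X = np.random.randn(2, 100)                     # 2 features, 100 examples
    Y = (X[0:1, :] + X[1:2, :] > 0).astype(float)   # toy binary labels, shape (1, 100)

    layer_dims = [2, 4, 1]                          # input, one hidden layer, output
    params, cost_history = train(X, Y, layer_dims, epochs=1000, lr=0.1)

    # Plot how the cross-entropy cost evolves over training
    plt.plot([c.item() for c in cost_history])
    plt.xlabel('epoch')
    plt.ylabel('cost')
    plt.title('Training cost (toy data)')
    plt.show()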