This is an implementation of a basic RNN (not bi-directional, not an LSTM) from scratch, using only numpy. The program does character-level text prediction.

Thanks to Siraj Raval for the walkthrough here, which was immensely helpful.

Import libraries and data

import numpy as np
import random
def import_data(file):
    # read the corpus and build character <-> index lookup tables
    with open(file, 'r') as f:
        data = f.read()
    chars = list(set(data))
    char_to_ix = {ch:i for i,ch in enumerate(chars)}
    ix_to_char = {i:ch for i,ch in enumerate(chars)}
    return data, chars, char_to_ix, ix_to_char
data, chars, char_to_ix, ix_to_char = import_data('kafka.txt')
def encode_one_hot(text, char_to_ix):
    # map each character to its index, then build one one-hot row per character
    indices = [char_to_ix[char] for char in text]
    vectors = np.zeros((len(indices),len(char_to_ix)))
    vectors[range(len(indices)), indices] = 1
    return vectors
def decode_one_hot(vectors, ix_to_char):
    # invert the encoding by taking the argmax of each row
    text = ''
    for vector in vectors:
        index = np.argmax(vector)
        text += ix_to_char[index]
    return text
def input_to_target(vectors):
    # the target for each character is simply the next character
    return vectors[1:,:]
print(decode_one_hot(input_to_target(encode_one_hot(data, char_to_ix)),ix_to_char))
ne morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin. He lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightly domed and divided by arches into stiff sections. The bedding was hardly able to cover it and seemed ready to slide off any moment. His many legs, pitifully thin compared with the size of the rest of him, waved about helplessly as he looked.

"What's happened to me?" he thought. It wasn't a dream. His room, a proper human room although a little too small, lay peacefully between its four familiar walls. A collection of textile samples lay spread out on the table - Samsa was a travelling salesman - and above it there hung a picture that he had recently cut out of an illustrated magazine and housed in a nice, gilded frame. It showed a lady fitted out with a fur hat and fur boa who sat upright, raising a heavy fur muff that covered the whole of her lower arm towards the viewer...
vectors = encode_one_hot(data, char_to_ix)
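
As a quick round-trip check (my addition, using a made-up two-character vocabulary), the encoder, decoder, and target shift behave as expected:

toy_text = 'abba'
toy_char_to_ix = {'a': 0, 'b': 1}
toy_ix_to_char = {0: 'a', 1: 'b'}
toy_vectors = encode_one_hot(toy_text, toy_char_to_ix) # shape (4, 2): one row per character
assert decode_one_hot(toy_vectors, toy_ix_to_char) == toy_text
assert decode_one_hot(input_to_target(toy_vectors), toy_ix_to_char) == toy_text[1:] # target is the input shifted by one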

Forward Prop

def initialize_parameters(vectors, hidden_size, seed=1):
    np.random.seed(seed)
    (data_length, vocab_size) = vectors.shape
    # weights: input->hidden, hidden->hidden, hidden->output, plus the two biases
    Whx = np.random.randn(hidden_size, vocab_size)*0.01
    Wha = np.random.randn(hidden_size, hidden_size)*0.01
    Wya = np.random.randn(vocab_size, hidden_size)*0.01
    bh = np.zeros((hidden_size,1))
    by = np.zeros((vocab_size,1))
    parameters = (Whx, Wha, Wya, bh, by)
    # gradient accumulators, one per parameter
    dWhx = np.zeros_like(Whx)
    dWha = np.zeros_like(Wha)
    dWya = np.zeros_like(Wya)
    dbh = np.zeros_like(bh)
    dby = np.zeros_like(by)
    dparameters = (dWhx, dWha, dWya, dbh, dby)
    # adagrad memories, one per parameter
    mWhx = np.zeros_like(Whx)
    mWha = np.zeros_like(Wha)
    mWya = np.zeros_like(Wya)
    mbh = np.zeros_like(bh)
    mby = np.zeros_like(by)
    memories = (mWhx, mWha, mWya, mbh, mby)
    return parameters, dparameters, memories
def softmax(z):
    # subtract the max for numerical stability before exponentiating
    exp = np.exp(z - np.max(z))
    return exp/np.sum(exp)
def loss(y_hat, y):
    # cross-entropy: negative log-probability assigned to the true character
    return -np.log(y_hat[np.where(y==1)])
def forward_step(a_t_minus_one, x, parameters):
    (Whx, Wha, Wya, bh, by) = parameters
    # new hidden state from the previous hidden state and the current input
    h = Wha.dot(a_t_minus_one) + Whx.dot(x[:,None]) + bh
    a = np.tanh(h)
    # output distribution over the vocabulary
    z = Wya.dot(a) + by
    y_hat = softmax(z)
    return y_hat, a
def forward_prop(X, Y, parameters, init_a):
    A={}
    A[-1] = init_a.copy() # hidden state carried in from the previous chunk
    Y_hat = []
    losses = []
    for t in range(len(Y)):
        x = X[t,:]
        y = Y[t,:]
        y_hat, a = forward_step(A[t-1], x, parameters)
        A[t] = a
        Y_hat.append(y_hat)
        losses.append(loss(y_hat, y))
    return A, Y_hat, losses
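
As a small sanity check (my addition, reusing the kafka.txt vectors from above), each hidden state should come out as a (hidden_size, 1) column and each prediction as a (vocab_size, 1) probability column:

check_parameters, _, _ = initialize_parameters(vectors, 100)
X_check = vectors[:5] # first five input characters
Y_check = input_to_target(vectors)[:5] # their next-character targets
A_check, Y_hat_check, losses_check = forward_prop(X_check, Y_check, check_parameters, np.zeros((100,1)))
print(A_check[0].shape, Y_hat_check[0].shape, len(losses_check)) # (100, 1) (len(chars), 1) 5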

Backward prop

def backward_step(x, y, y_hat, a, a_t_minus_one, da_next, parameters, dparameters):
    (dWhx, dWha, dWya, dbh, dby) = dparameters
    (Whx, Wha, Wya, bh, by) = parameters
    # gradient of the cross-entropy loss through the softmax
    dy = y_hat - y
    dWya += dy.dot(a.T)
    dby += dy
    # gradient into the hidden state: the output path plus the path from the next time step
    da = Wya.T.dot(dy) + da_next
    dh = (1.0-a**2)*da # through the tanh
    dWhx += dh.dot(x.T)
    dWha += dh.dot(a_t_minus_one.T)
    dbh += dh
    da_next = Wha.T.dot(dh) # passed back to the previous time step
    dparameters = (dWhx, dWha, dWya, dbh, dby)
    # clip to mitigate exploding gradients
    for dparameter in dparameters:
        np.clip(dparameter, -5, 5, out=dparameter)
    return da_next, dparameters
def backward_prop(X, Y, Y_hat, A, parameters, dparameters):
    da_next = np.zeros_like(dparameters[3])
    # reset the gradient accumulators to zero before summing over the sequence
    dparameters = tuple(np.zeros_like(parameter) for parameter in parameters)
    for t in reversed(range(len(Y))):
        x = X[t,:][:,None]
        y = Y[t,:][:,None]
        y_hat = Y_hat[t]
        a = A[t]
        a_t_minus_one = A[t-1]
        da_next, dparameters = backward_step(x, y, y_hat, a, a_t_minus_one, da_next, parameters, dparameters)
    return parameters, dparameters
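
A finite-difference gradient check (my addition, not part of the original walkthrough) is a cheap way to confirm that backward_prop matches forward_prop: perturb a few entries of each parameter by a small epsilon and compare the analytic gradient with (loss(theta+eps) - loss(theta-eps)) / (2*eps). Keep the sequence short and the weights small so the clipping in backward_step does not kick in; relative errors on the order of 1e-6 or smaller indicate the two passes agree.

def gradient_check(X, Y, hidden_size=10, eps=1e-5, num_checks=3, seed=1):
    parameters, dparameters, _ = initialize_parameters(X, hidden_size, seed=seed)
    init_a = np.zeros((hidden_size,1))
    A, Y_hat, _ = forward_prop(X, Y, parameters, init_a)
    _, dparameters = backward_prop(X, Y, Y_hat, A, parameters, dparameters)
    for parameter, dparameter, name in zip(parameters, dparameters, ['Whx','Wha','Wya','bh','by']):
        for _ in range(num_checks):
            ix = np.random.randint(parameter.size)
            old = parameter.flat[ix]
            parameter.flat[ix] = old + eps
            loss_plus = np.sum(forward_prop(X, Y, parameters, init_a)[2])
            parameter.flat[ix] = old - eps
            loss_minus = np.sum(forward_prop(X, Y, parameters, init_a)[2])
            parameter.flat[ix] = old # restore the original value
            numeric = (loss_plus - loss_minus)/(2*eps)
            analytic = dparameter.flat[ix]
            print(name, 'relative error:', abs(analytic - numeric)/max(abs(analytic) + abs(numeric), 1e-12))
gradient_check(vectors[:8], input_to_target(vectors)[:8])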

Training

def sequence_generator(a, parameters, char_to_ix, ix_to_char, seed_char=None, n=100):
    (Whx, Wha, Wya, bh, by) = parameters
    # pick a random seed character if none was given
    if seed_char is None:
        seed_char = ix_to_char[random.randrange(len(by))]
    # the seed can be a string or an already one-hot-encoded vector
    if isinstance(seed_char,str):
        X = encode_one_hot(seed_char, char_to_ix)
        x = X[-1,:][:,None]
    else:
        x = seed_char[:,None]
    Y_hat = [x]
    for i in range(n):
        h = Whx.dot(x) + Wha.dot(a) + bh
        a = np.tanh(h)
        z = Wya.dot(a) + by
        y_hat = softmax(z)
        Y_hat.append(y_hat)
        # greedily feed the most likely character back in as the next input
        x = (y_hat==y_hat.max()).astype(int)
    seq = decode_one_hot(Y_hat, ix_to_char)
    print(seq)
    return seq
testing = initialize_parameters(vectors, 100, seed=123)
sequence_generator(testing[0][3], testing[0], char_to_ix, ix_to_char, seed_char='A')
Aaxf1y@JuM:3;Rn0ui'y@JuM:3;Rn0ui'y@JuM:3;Rn0ui'y@JuM:3;Rn0ui'y@JuM:3;Rn0ui'y@JuM:3;Rn0ui'y@JuM:3;Rn0u
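
The generator above always feeds back the argmax character, which is why untrained samples collapse into a repeating cycle so quickly. A common alternative (a sketch of my own, not part of the original code) is to sample the next character from the softmax distribution instead, which gives more varied text at the cost of some noise:

def sample_sequence(a, parameters, char_to_ix, ix_to_char, seed_char='A', n=100):
    (Whx, Wha, Wya, bh, by) = parameters
    x = encode_one_hot(seed_char, char_to_ix)[-1,:][:,None]
    chars_out = [seed_char]
    for i in range(n):
        a = np.tanh(Whx.dot(x) + Wha.dot(a) + bh)
        y_hat = softmax(Wya.dot(a) + by)
        ix = np.random.choice(len(by), p=y_hat.ravel()) # sample instead of taking the argmax
        x = np.zeros_like(y_hat)
        x[ix] = 1
        chars_out.append(ix_to_char[ix])
    return ''.join(chars_out)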
def train_rnn(data, hyperparameters, seed=1, init_parameters=None, use_init_parameters=False):

    # unpack hyperparameters
    hidden_size = hyperparameters['hidden_size']
    seq_length = hyperparameters['seq_length']
    learning_rate = hyperparameters['learning_rate']
    epochs = hyperparameters['epochs']
    sample_freq = hyperparameters['sample_freq']

    # generate vectors and vocab dictionaries
    chars = list(set(data))
    char_to_ix = {ch:i for i,ch in enumerate(chars)}
    ix_to_char = {i:ch for i,ch in enumerate(chars)}
    vectors = encode_one_hot(data, char_to_ix)
    target = input_to_target(vectors)

    # initialize parameters and smooth_loss
    parameters, dparameters, memories = initialize_parameters(vectors, hidden_size, seed=seed)
    smooth_loss = np.log(len(char_to_ix))*seq_length # loss at iteration 0
    init_a = np.zeros_like(parameters[3])
    if use_init_parameters:
        parameters = init_parameters
    p = 0 # position to start reading at
    # forward and backward prop
    for i in range(epochs):
        if p+seq_length+1>len(vectors): # not enough characters left for a full input/target pair; start from the beginning of the book again
            p = 0
            init_a = np.zeros_like(parameters[3])
        X = vectors[p:p+seq_length]
        Y = target[p:p+seq_length]
        A, Y_hat, losses = forward_prop(X, Y, parameters, init_a)
        init_a = A[seq_length-1]
        if (i==0 or (i+1)%sample_freq == 0):
            print('Iter:', i+1, 'Loss:', smooth_loss)
            sequence_generator(init_a, parameters, char_to_ix, ix_to_char, seed_char=X[-1], n=100)
            print()
        parameters, dparameters = backward_prop(X, Y, Y_hat, A, parameters, dparameters)

        # update using adagrad
        for parameter, dparameter, memory in zip(parameters, dparameters, memories):
            memory += dparameter*dparameter
            parameter -= learning_rate*dparameter/np.sqrt(memory+1e-8)
        smooth_loss = smooth_loss* 0.999 + np.sum(losses)*0.001
        p+= seq_length

    return smooth_loss, parameters
hyperparameters = {
    'hidden_size' : 150,
    'seq_length' : 35,
    'learning_rate' : 0.05,
    'epochs':10000,
    'sample_freq':1000
}
smooth_loss, parameters = train_rnn(data, hyperparameters, seed=1)
Iter: 1 Loss: 153.37093221358583
eG)XEk.Q:?-J?2FY-J?2FY-J?2FY-J?2FY-J?2FY-J?2FY-J?2FY-J?2FY-J?2FY-J?2FY-J?2FY-J?2FY-J?2FY-J?2FY-J?2FY-

Iter: 1000 Loss: 114.98703910985724
n he pere he pere he pere he pere he pere he pere he pere he pere he pere he pere he pere he pere he

Iter: 2000 Loss: 89.98119716264166
ed and and and and and and and and and and and and and and and and and and and and and and and and an

Iter: 3000 Loss: 78.24674308913039
 he was she the the the the the the the the the the the the the the the the the the the the the the t

Iter: 4000 Loss: 81.04244786556373
; the the the the the the the the the the the the the the the the the the the the the the the the the

Iter: 5000 Loss: 73.78070609787777
e the the the the the the the the the the the the the the the the the the the the the the the the the

Iter: 6000 Loss: 68.36697717802687
e him her and her and her and her and her and her and her and her and her and her and her and her and

Iter: 7000 Loss: 66.07226962796535
mand his for his for his for his for his for his for his for his for his for his for his for his for

Iter: 8000 Loss: 71.86785818263544
's and he was to he was to he was to he was to he was to he was to he was to he was to he was to he w

Iter: 9000 Loss: 67.36134684532409
wat the did the did the did the did the did the did the did the did the did the did the did the did t

Iter: 10000 Loss: 63.723388456472314
ware the was had his father was had his father was had his father was had his father was had his fath
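
Because train_rnn returns the learned parameters, a follow-up run can continue training from where this one stopped via the use_init_parameters flag (my addition; the lower learning rate and epoch count are arbitrary, and note that the adagrad memories are re-initialized, so the effective step sizes reset):

more_hyperparameters = dict(hyperparameters, learning_rate=0.01, epochs=5000)
smooth_loss, parameters = train_rnn(data, more_hyperparameters, seed=1, init_parameters=parameters, use_init_parameters=True)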