# Training question-answer sequences using pandas, Keras and batching

import numpy as np 
import pandas as pd
import sys
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
import string
def splitFrames(df, headSize):
    """Split *df* into its first ``headSize`` rows and everything after them.

    Positional slicing is used so the two pieces never overlap and always
    add up to the whole frame. The previous ``head``/``tail`` arithmetic
    handed ``tail`` a negative count when ``headSize`` exceeded ``len(df)``,
    which made the "remainder" repeat rows already present in the head.

    Args:
        df: The DataFrame to split.
        headSize: Number of leading rows to put in the first piece.

    Returns:
        A ``(head, tail)`` tuple of DataFrames. ``tail`` is empty when
        ``headSize`` is at least ``len(df)``.
    """
    hd = df.iloc[:headSize]
    tl = df.iloc[headSize:]
    return hd, tl
def logData(logStr):
    """Append *logStr* as a single line to ``robo_train.log``.

    The file is opened in append mode on every call, so entries from
    earlier calls and earlier runs are kept.

    Args:
        logStr: The text to record; a newline is added after it.
    """
    # The ``with`` block closes the file, so no explicit close() is needed.
    with open('robo_train.log', 'a', encoding="utf8", errors="surrogateescape") as f:
        # Terminate each entry with a real newline; a bare "n" glued every
        # entry onto one unreadable line.
        f.write(logStr + "\n")



def max_length(lines):
    """Return the largest whitespace-separated word count found in *lines*.

    Args:
        lines: An iterable of strings.

    Returns:
        The word count of the longest line, or 0 when *lines* is empty
        (``max`` would otherwise raise ``ValueError`` on an empty input).
    """
    return max((len(line.split()) for line in lines), default=0)
# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Build and compile the sequence-to-sequence Keras model.

    The layer stack is: a zero-masking embedding over the source tokens, an
    LSTM, a RepeatVector that repeats the LSTM output ``tar_timesteps``
    times, a second LSTM that returns a full sequence, and a softmax Dense
    layer applied at every output timestep.

    Besides returning the model, this prints its summary and writes a
    diagram of it to ``model.png``.

    Args:
        src_vocab: Size of the source vocabulary (embedding input dimension).
        tar_vocab: Size of the target vocabulary (softmax width).
        src_timesteps: Length of each padded source sequence.
        tar_timesteps: Length of each padded target sequence.
        n_units: Embedding width and number of units in both LSTM layers.

    Returns:
        The compiled ``Sequential`` model.
    """
    stack = [
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units),
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    net = Sequential(stack)

    # Targets are integer token ids, hence the sparse cross-entropy loss.
    net.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['acc'],
    )

    # Report the architecture on stdout and as an image file.
    net.summary()
    plot_model(net, to_file='model.png', show_shapes=True)
    return net





def feedData(qTokenizer, ansTokenizer, qVocabularySize, ansVocabularySize, qLength, ansLength, batchSize):
    """Yield ``(x, y)`` training batches from the module-level ``df_train`` forever.

    Rows are taken by position, in order, and wrap around to the first row
    once the end of the frame is reached, so every batch contains exactly
    ``batchSize`` rows and every row is visited. (The previous version used
    the removed ``.ix`` indexer, looked rows up by label -- which silently
    returned nothing for labels dropped by ``drop_duplicates`` -- and never
    visited the first row.)

    Args:
        qTokenizer: Fitted tokenizer used to encode the "Questions" column.
        ansTokenizer: Fitted tokenizer used to encode the "Answers" column.
        qVocabularySize: Question vocabulary size; only written to the log.
        ansVocabularySize: Answer vocabulary size; only written to the log.
        qLength: Length every question sequence is padded to.
        ansLength: Length every answer sequence is padded to.
        batchSize: Number of rows per yielded batch.

    Yields:
        A tuple ``(x, y)`` where ``x`` has shape ``(batchSize, qLength)`` and
        ``y`` has shape ``(batchSize, ansLength, 1)``; both hold the padded
        token ids produced by ``pad_sequences``.

    Raises:
        ValueError: If ``df_train`` has no rows.
    """
    totalRows = len(df_train.index)
    if totalRows == 0:
        raise ValueError("df_train is empty; there is nothing to generate batches from")

    # These values never change between batches, so record them once
    # instead of once per row.
    logData("q vocabulary size within generator:" + str(qVocabularySize))
    logData("Ans vocabulary size within generator:" + str(ansVocabularySize))
    logData("q max seq length in generator:" + str(qLength))
    logData("ans max seq length in generator:" + str(ansLength))

    questions = df_train["Questions"]
    answers = df_train["Answers"]
    rowCounter = 0
    while True:
        # Positional indices with wrap-around: independent of the index
        # labels and never past the end of the frame.
        positions = [(rowCounter + offset) % totalRows for offset in range(batchSize)]
        rowCounter = (rowCounter + batchSize) % totalRows

        questionSequences = qTokenizer.texts_to_sequences(questions.iloc[positions].tolist())
        answerSequences = ansTokenizer.texts_to_sequences(answers.iloc[positions].tolist())

        # Encode and pad the whole batch in one call rather than growing
        # the arrays row by row with np.concatenate.
        x_list = pad_sequences(questionSequences, qLength, padding='post')
        answersPadded = pad_sequences(answerSequences, ansLength, padding='post')
        # Add the trailing axis so each timestep carries one integer label.
        y_list = np.expand_dims(answersPadded, axis=-1)

        logData("x list shape:" + str(x_list.shape))
        logData("y list shape:" + str(y_list.shape))
        yield (x_list, y_list)

# Load the question/answer pairs and remove rows that cannot be trained on.
dfSource=pd.read_csv("qa_prime.csv", engine="python")
# Missing cells come back as NaN floats, which have no .split() and would
# make max_length fail, so drop those rows first.
dfSource.dropna(subset=["Questions", "Answers"], inplace=True)
dfSource.drop_duplicates(subset='Questions', inplace=True)
dfSource.drop_duplicates(subset='Answers', inplace=True)
# Dropping rows leaves gaps in the index labels; renumber so labels and
# positions agree for anything that looks rows up by number.
dfSource.reset_index(drop=True, inplace=True)

# First 2000 rows train the model; whatever is left is used for validation.
df_train, df_validate=splitFrames(dfSource,2000)

# Vocabularies and sequence lengths are derived from the training rows only.
questionTokenizer=Tokenizer()
questionTokenizer.fit_on_texts(df_train["Questions"])
answerTokenizer=Tokenizer()
answerTokenizer.fit_on_texts(df_train["Answers"])
maxQ=max_length(df_train["Questions"].tolist())
maxAns=max_length(df_train["Answers"].tolist())
# +1 because token ids start at 1; id 0 is the padding value.
questionsVocabularySize=len(questionTokenizer.word_index) + 1
answersVocabularySize=len(answerTokenizer.word_index) + 1

btch_size=3
# Width of the embedding and LSTM layers. The batch size used to be passed
# in this slot, which produced a 3-unit model.
# NOTE(review): 256 is a starting point, not a tuned value -- adjust as needed.
n_units=256
# At least one step, even when there are fewer rows than one batch.
spe=max(1, len(df_train.index)//btch_size)
logData("steps per epoch:" + str(spe))
model = define_model(questionsVocabularySize, answersVocabularySize, maxQ, maxAns, n_units)

# Encode the held-out rows with the training tokenizers so 'val_loss' is
# actually computed; without validation data the checkpoint monitoring
# 'val_loss' never had a value to compare and never saved anything.
if df_validate.empty:
    validation_data=None
    monitored='loss'
else:
    x_validate=pad_sequences(questionTokenizer.texts_to_sequences(df_validate["Questions"].tolist()), maxQ, padding='post')
    y_validate=np.expand_dims(pad_sequences(answerTokenizer.texts_to_sequences(df_validate["Answers"].tolist()), maxAns, padding='post'), axis=-1)
    validation_data=(x_validate, y_validate)
    monitored='val_loss'

checkpoint = ModelCheckpoint('model.h5', monitor=monitored, verbose=1, save_best_only=True, mode='auto')

# A plain Python generator cannot be shared between worker processes: each
# process would iterate its own copy and feed the same batches again, so
# the generator is consumed by a single worker.
model.fit_generator(
    feedData(questionTokenizer, answerTokenizer, questionsVocabularySize, answersVocabularySize, maxQ, maxAns, btch_size),
    steps_per_epoch=spe,
    epochs=100,
    verbose=2,
    validation_data=validation_data,
    callbacks=[checkpoint],
    workers=1,
    use_multiprocessing=False,
)
# 'model.h5' holds the best checkpoint; keep the last-epoch weights in a
# separate file so they do not overwrite it.
model.save('model_final.h5')
Enter your email Address

techesoterica.com

A blog dealing with sensory substitution and other esoteric concepts and technologies like speech-recognition and chaos theory

training question answer sequences using pandas, keras and batching

Like this:

Related

About Pranav

Welcome to Techesoterica

Enter your email Address

Skip links

Like this:

Related

About Pranav

Welcome to Techesoterica