# First attempt at training question-and-answer sequences

import numpy 
import pickle
import sys
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
import os
# Sets KMP_DUPLICATE_LIB_OK for this process (and any children it spawns).
# NOTE(review): presumably the usual workaround for the OpenMP
# "duplicate runtime already initialized" abort -- confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK']='True'
def is_eof(f):
    """Report whether the file object *f* is positioned at end-of-file.

    The check reads a single unit (one byte or one character) and then puts
    the position back, so a call never consumes input.

    Works for binary and text streams alike: emptiness is tested by
    truthiness rather than by comparing against ``b''``, and the position is
    restored with an absolute ``seek`` to the value ``tell`` reported, because
    text streams reject non-zero relative seeks.

    Args:
        f: A seekable file object opened for reading.

    Returns:
        True if nothing is left to read, False otherwise.
    """
    position = f.tell()
    probe = f.read(1)
    if probe:
        # Something was read, so rewind to where the caller left the stream.
        f.seek(position)
    return not probe
def encode_output(sequences, vocab_size):
    """One-hot encode a rectangular batch of integer class ids.

    Accepts either a NumPy array or a nested list, so the padded output of a
    tokenizer can be passed in directly.  The encoding is done with a single
    vectorised assignment instead of one call per row.

    Args:
        sequences: Rectangular array-like of integer ids, typically shaped
            (samples, timesteps).
        vocab_size: Number of classes; every id must be smaller than this.

    Returns:
        A float32 array of shape ``sequences.shape + (vocab_size,)`` holding
        1.0 at each id's position and 0.0 elsewhere.

    Raises:
        ValueError: If *sequences* is ragged or not convertible to integers.
        IndexError: If an id is not smaller than *vocab_size*.
    """
    ids = numpy.asarray(sequences, dtype=int)
    encoded = numpy.zeros(ids.shape + (vocab_size,), dtype='float32')
    # `encoded` is freshly allocated and contiguous, so this reshape is a
    # view: writing through it fills `encoded` in place.
    rows = encoded.reshape(-1, vocab_size)
    rows[numpy.arange(ids.size), ids.ravel()] = 1.0
    return encoded
def _encodeBatch(qTkn, ansTkn, questions, answers, questionsL, answersL, answersVocabSize):
    """Convert parallel lists of question/answer strings into one (X, y) batch."""
    encodedQuestions = qTkn.texts_to_sequences(questions)
    encodedAnswers = ansTkn.texts_to_sequences(answers)
    lpQ = sequence.pad_sequences(encodedQuestions, maxlen=questionsL, padding='post')
    # Pad the answers *before* one-hot encoding so the ids form a rectangular
    # integer array; encoding first would leave ragged per-sample matrices.
    lpAns = sequence.pad_sequences(encodedAnswers, maxlen=answersL, padding='post')
    return lpQ, encode_output(lpAns, answersVocabSize)


def feedData(fileName, batchSize, answersVocabSize, questionsVocabSize, questionsL, answersL):
    """Endlessly yield (questions, one-hot answers) batches read from a text file.

    Each line of *fileName* holds one question and one answer separated by
    the literal ``"t#"``.  Lines without a separator (blank lines included)
    are skipped.  When the end of the file is reached the remaining partial
    batch is yielded and the file is read again from the start, so the
    generator never runs dry.

    NOTE(review): the separator is kept exactly as it was.  If the data files
    are really tab-separated (the backslash may have been lost), it should be
    ``"\\t"`` -- confirm against training.txt.

    Args:
        fileName: Path of the text file with the question/answer pairs.
        batchSize: Number of pairs per yielded batch (the last batch of a
            pass may be smaller).
        answersVocabSize: Width of the one-hot answer encoding.
        questionsVocabSize: Unused; kept so existing callers keep working.
        questionsL: Length every question sequence is padded/truncated to.
        answersL: Length every answer sequence is padded/truncated to.

    Yields:
        Tuples ``(lpQ, lpAns)``: the padded question ids, one row per pair,
        and the one-hot encoding of the padded answer ids.

    Raises:
        ValueError: If *batchSize* is smaller than 1, or if a full pass over
            the file finds no usable question/answer pair.
    """
    if batchSize < 1:
        raise ValueError("batchSize must be at least 1, got " + str(batchSize))

    # NOTE: pickle can execute arbitrary code while loading; these files must
    # only ever be the tokenizers written by this program.
    with open('questionTokenizer.pkl', 'rb') as fp:
        qTkn = pickle.load(fp)
    with open('answerTokenizer.pkl', 'rb') as fp1:
        ansTkn = pickle.load(fp1)

    while True:
        questions = []
        answers = []
        pairsSeen = 0
        # Reopen on every pass: this rewinds to the first line and guarantees
        # the handle is closed once the pass (or the generator) ends.
        with open(fileName, 'r') as rf:
            for tl in rf:
                textLine = tl.rstrip('\r\n').split("t#")
                if len(textLine) < 2:
                    continue
                questions.append(textLine[0])
                answers.append(textLine[1])
                pairsSeen += 1
                if len(questions) == batchSize:
                    yield _encodeBatch(qTkn, ansTkn, questions, answers,
                                       questionsL, answersL, answersVocabSize)
                    questions = []
                    answers = []
        if pairsSeen == 0:
            # Without this check an empty or malformed file would spin forever.
            raise ValueError("no question/answer pairs found in " + str(fileName))
        if questions:
            yield _encodeBatch(qTkn, ansTkn, questions, answers,
                               questionsL, answersL, answersVocabSize)
#end of routine

    
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Build, compile, summarise and plot the encoder-decoder network.

    The stack is: a masked embedding over the source ids, an LSTM whose final
    state is repeated once per target timestep, a second LSTM returning the
    full sequence, and a per-timestep softmax over the target vocabulary.

    Args:
        src_vocab: Input dimension of the embedding layer.
        tar_vocab: Number of softmax outputs per timestep.
        src_timesteps: Length of each input sequence.
        tar_timesteps: Number of timesteps the decoder produces.
        n_units: Embedding width and LSTM size.

    Returns:
        The compiled Sequential model.
    """
    stack = (
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units),
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    )
    network = Sequential()
    for layer in stack:
        network.add(layer)

    network.compile(optimizer='adam', loss='categorical_crossentropy')

    # Print the layer table and write the architecture diagram to disk.
    network.summary()
    plot_model(network, to_file='model.png', show_shapes=True)
    return network
def max_length(lines):
    """Return the largest whitespace-separated word count found in *lines*."""
    word_counts = [len(text.split()) for text in lines]
    return max(word_counts)
def tokenizePad(l1, l2):
    """Fit and save the tokenizers, then build and train the model.

    Fits one tokenizer on the questions and one on the answers, pickles both
    to ``questionTokenizer.pkl`` / ``answerTokenizer.pkl`` (feedData reloads
    them from there), builds the model and trains it from ``training.txt``
    with ``testing.txt`` as validation data.  The best model by validation
    loss is written to ``model.h5``.

    Args:
        l1: Question texts.
        l2: Answer texts.

    Raises:
        ValueError: From max_length if *l1* or *l2* is empty.
    """
    max_words = 60000
    questionTokenizer = Tokenizer(num_words=max_words)
    answerTokenizer = Tokenizer(num_words=max_words)
    questionTokenizer.fit_on_texts(l1)
    answerTokenizer.fit_on_texts(l2)

    # +1 because index 0 is never assigned to a word; capped at max_words
    # because a tokenizer built with num_words never emits a larger index.
    # Using the real size keeps the embedding and softmax layers no wider
    # than the data needs.
    questionsVocabularySize = min(len(questionTokenizer.word_index) + 1, max_words)
    answersVocabularySize = min(len(answerTokenizer.word_index) + 1, max_words)
    questionsLength = max_length(l1)
    answersLength = max_length(l2)
    print("questions vocabulary size=" + str(questionsVocabularySize))
    print("questions maximum length=" + str(questionsLength))
    print("answers vocabulary size=" + str(answersVocabularySize))
    print("answers length="+ str(answersLength))

    with open('questionTokenizer.pkl', 'wb') as fpp:
        pickle.dump(questionTokenizer, fpp)
    with open('answerTokenizer.pkl', 'wb') as fpp1:
        pickle.dump(answerTokenizer, fpp1)

    batch_size = 32
    num_steps = 200
    # Hidden size of the network.  It has its own name so it is no longer
    # conflated with batch_size; the value is unchanged.
    n_units = 32
    # Never ask Keras for zero steps, which happens for small corpora.
    # NOTE(review): dividing by num_steps as well as batch_size makes an epoch
    # cover only a fraction of the data -- confirm this is intended.  The same
    # count is reused for validation although testing.txt may differ in size.
    spe = max(1, len(l1) // (batch_size * num_steps))
    epochs = 3

    model = define_model(questionsVocabularySize, answersVocabularySize,
                         questionsLength, answersLength, n_units)
    checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', verbose=1,
                                 save_best_only=True, mode='min')
    training = feedData('training.txt', batch_size, answersVocabularySize,
                        questionsVocabularySize, questionsLength, answersLength)
    validation = feedData('testing.txt', batch_size, answersVocabularySize,
                          questionsVocabularySize, questionsLength, answersLength)
    model.fit_generator(generator=training, validation_data=validation,
                        steps_per_epoch=spe, validation_steps=spe,
                        epochs=epochs, callbacks=[checkpoint], verbose=2)
def main():
    """Load the pickled question and answer texts and run the training."""
    # NOTE: pickle can execute arbitrary code while loading; only feed this
    # script data files you produced yourself.
    with open('questionsPure.pkl', 'rb') as fp:
        qs = pickle.load(fp)
    with open('answersPure.pkl', 'rb') as fpp:
        ans = pickle.load(fpp)
    tokenizePad(qs, ans)
    print("program complete.")


# Guarded so importing this module does not start a training run.
if __name__ == "__main__":
    main()
Enter your email Address

techesoterica.com

A blog dealing with sensory substitution and other esoteric concepts and technologies like speech-recognition and chaos theory

first attempt at training question and answer sequences

Like this:

Related

About Pranav

Welcome to Techesoterica

Enter your email Address

Skip links

Like this:

Related

About Pranav

Welcome to Techesoterica