import numpy
import pickle
import sys
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
import os
# Set the KMP_DUPLICATE_LIB_OK environment variable for this process.
# NOTE(review): presumably a workaround for the OpenMP "duplicate runtime
# library" abort seen with some Keras/TensorFlow installs -- confirm it is
# still required. It is assigned after the keras imports above; TODO confirm
# the setting still takes effect when applied at this point.
os.environ['KMP_DUPLICATE_LIB_OK']='True'
def is_eof(f):
    """Return True when the seekable stream *f* is positioned at end-of-file.

    Works for both binary and text streams. One unit (a byte or a character)
    is read as a probe; if the probe found data the stream is rewound to the
    position it had on entry, so the call does not consume any input.

    Args:
        f: An open, seekable file object supporting ``tell``, ``read`` and
            ``seek``.

    Returns:
        bool: True if nothing is left to read, False otherwise.
    """
    # Remember the absolute position: text-mode files reject non-zero
    # relative seeks such as seek(-1, os.SEEK_CUR), but they do accept a
    # seek back to a value previously returned by tell().
    position = f.tell()
    probe = f.read(1)
    if probe:
        f.seek(position)
    # Text streams report EOF as '' and binary streams as b''. Both are
    # falsy, whereas a comparison against b'' alone never matches text.
    return not probe
def encode_output(sequences, vocab_size):
    """One-hot encode a batch of equal-length integer token sequences.

    Args:
        sequences: 2-D array-like of token ids laid out as
            (num_sequences, timesteps). Nested lists are accepted as long as
            every row has the same length (i.e. the batch is already padded).
        vocab_size: Number of classes; passed to ``to_categorical`` as
            ``num_classes``.

    Returns:
        numpy.ndarray: Array of shape (num_sequences, timesteps, vocab_size).
    """
    # Coerce first so that plain lists (e.g. the direct output of
    # Tokenizer.texts_to_sequences) work too; the reshape below needs .shape.
    token_ids = numpy.asarray(sequences)
    encoded_rows = [
        to_categorical(row, num_classes=vocab_size) for row in token_ids
    ]
    one_hot = numpy.array(encoded_rows)
    return one_hot.reshape(token_ids.shape[0], token_ids.shape[1], vocab_size)
def feedData(fileName, batchSize, answersVocabSize, questionsVocabSize, questionsL, answersL):
    """Yield (questions, answers) batches read from a question/answer text file.

    Each usable line of *fileName* holds a question and its answer separated
    by a tab. Lines are grouped into batches of *batchSize*; every batch is
    tokenised with the tokenizers pickled in ``questionTokenizer.pkl`` and
    ``answerTokenizer.pkl``, post-padded to the requested lengths, and the
    answers are one-hot encoded with ``encode_output``.

    The generator makes a single pass over the file and stops at end-of-file;
    a final, smaller batch is emitted if the line count is not a multiple of
    *batchSize*. Callers that need an endless stream (Keras generators are
    expected to loop indefinitely) must size their step counts accordingly or
    re-create the generator.

    Args:
        fileName: Path of the tab-separated question/answer file.
        batchSize: Number of question/answer pairs per yielded batch (>= 1).
        answersVocabSize: Number of classes for the one-hot answer encoding.
        questionsVocabSize: Unused; kept so existing callers keep working.
        questionsL: Length every question sequence is padded/truncated to.
        answersL: Length every answer sequence is padded/truncated to.

    Yields:
        tuple: ``(lpQ, lpAns)`` where ``lpQ`` is the padded question id array
        of shape (batch, questionsL) and ``lpAns`` is the one-hot answer array
        of shape (batch, answersL, answersVocabSize).

    Raises:
        ValueError: If *batchSize* is smaller than 1.
    """
    if batchSize < 1:
        raise ValueError("batchSize must be at least 1")

    # NOTE(security): pickle.load can execute arbitrary code. These files are
    # expected to be the ones written by tokenizePad; never point this at
    # untrusted data.
    with open('questionTokenizer.pkl', 'rb') as fp:
        qTkn = pickle.load(fp)
    with open('answerTokenizer.pkl', 'rb') as fp1:
        ansTkn = pickle.load(fp1)

    def encode_batch(questions, answers):
        """Turn parallel lists of raw strings into one padded training batch."""
        # texts_to_sequences must receive a list of texts; handing it a bare
        # string makes it treat every character as a separate text.
        lpQ = sequence.pad_sequences(
            qTkn.texts_to_sequences(questions), maxlen=questionsL, padding='post')
        paddedAns = sequence.pad_sequences(
            ansTkn.texts_to_sequences(answers), maxlen=answersL, padding='post')
        # Pad before one-hot encoding so every row has the same length.
        lpAns = encode_output(paddedAns, answersVocabSize)
        return (lpQ, lpAns)

    batchCounter = 0
    listQ = []
    listAns = []
    # The context manager closes the file on exhaustion, on error, and when
    # an abandoned generator is finalised.
    with open(fileName, 'r') as rf:
        for tl in rf:
            # NOTE(review): the delimiter is assumed to be a tab character;
            # splitting on the plain letter 't' would cut through ordinary
            # words. Confirm against the actual data files.
            textLine = tl.rstrip('\n').split('\t')
            if len(textLine) < 2:
                # Blank or malformed line: nothing to train on.
                continue
            listQ.append(textLine[0])
            listAns.append(textLine[1])
            if len(listQ) == batchSize:
                print("batch counter=" + str(batchCounter))
                yield encode_batch(listQ, listAns)
                batchCounter = batchCounter + 1
                listQ = []
                listAns = []
        if listQ:
            # Leftover pairs that did not fill a whole batch.
            print("batch counter=" + str(batchCounter))
            yield encode_batch(listQ, listAns)
# end of routine
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Build, compile and summarise the encoder-decoder LSTM network.

    Args:
        src_vocab: Input vocabulary size for the embedding layer.
        tar_vocab: Output vocabulary size (width of the softmax layer).
        src_timesteps: Length of each input sequence.
        tar_timesteps: Number of output timesteps the encoding is repeated to.
        n_units: Embedding dimension and width of both LSTM layers.

    Returns:
        The compiled ``Sequential`` model. As side effects the model summary
        is printed and a diagram is written to ``model.png``.
    """
    stack = [
        # Encoder: embed the token ids (0 is masked) and compress the
        # sequence into a single vector.
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units),
        # Decoder: repeat that vector once per output step and emit a softmax
        # over the target vocabulary at every step.
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    network = Sequential()
    for layer in stack:
        network.add(layer)
    network.compile(optimizer='adam', loss='categorical_crossentropy')
    network.summary()
    plot_model(network, to_file='model.png', show_shapes=True)
    return network
def max_length(lines):
    """Return the largest whitespace-delimited word count among *lines*.

    Args:
        lines: Iterable of strings.

    Returns:
        int: Word count of the longest line, or 0 when *lines* is empty.
    """
    # default=0 keeps an empty corpus from raising ValueError inside max().
    return max((len(line.split()) for line in lines), default=0)
def tokenizePad(l1, l2):
    """Fit the tokenizers, persist them, then build and train the model.

    The question and answer tokenizers are fitted on *l1* and *l2*, pickled
    to ``questionTokenizer.pkl`` / ``answerTokenizer.pkl`` (the files
    ``feedData`` reads back), and the model is trained from ``training.txt``
    with ``testing.txt`` as validation data. The best checkpoint, judged by
    ``val_loss``, is written to ``model.h5``.

    Args:
        l1: List of question strings.
        l2: List of answer strings.
    """
    max_vocab = 60000
    questionTokenizer = Tokenizer(num_words=max_vocab)
    answerTokenizer = Tokenizer(num_words=max_vocab)
    questionTokenizer.fit_on_texts(l1)
    answerTokenizer.fit_on_texts(l2)

    # +1 leaves room for index 0, which the model masks as padding
    # (mask_zero=True). The size is capped at max_vocab because the
    # tokenizers are limited to that many words; for smaller corpora the real
    # size is used so the embedding and softmax layers are not oversized.
    questionsVocabularySize = min(len(questionTokenizer.word_index) + 1, max_vocab)
    answersVocabularySize = min(len(answerTokenizer.word_index) + 1, max_vocab)
    questionsLength = max_length(l1)
    answersLength = max_length(l2)
    print("questions vocabulary size=" + str(questionsVocabularySize))
    print("questions maximum length=" + str(questionsLength))
    print("answers vocabulary size=" + str(answersVocabularySize))
    print("answers length="+ str(answersLength))

    with open('questionTokenizer.pkl', 'wb') as fpp:
        pickle.dump(questionTokenizer, fpp)
    with open('answerTokenizer.pkl', 'wb') as fpp1:
        pickle.dump(answerTokenizer, fpp1)

    batch_size = 32
    num_steps = 200
    # Hidden width of the network. Kept at 32, the value that used to reach
    # define_model by passing batch_size in this position; the two settings
    # are independent and can now be tuned separately.
    n_units = 32
    # At least one step per epoch, so a small corpus still trains instead of
    # silently running zero steps.
    spe = max(1, len(l1) // (batch_size * num_steps))
    epochs = 3

    model = define_model(
        questionsVocabularySize, answersVocabularySize,
        questionsLength, answersLength, n_units)
    checkpoint = ModelCheckpoint(
        'model.h5', monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    # NOTE(review): validation_steps reuses the training-derived step count;
    # confirm testing.txt holds enough lines to supply that many batches.
    model.fit_generator(
        generator=feedData(
            'training.txt', batch_size, answersVocabularySize,
            questionsVocabularySize, questionsLength, answersLength),
        validation_data=feedData(
            'testing.txt', batch_size, answersVocabularySize,
            questionsVocabularySize, questionsLength, answersLength),
        steps_per_epoch=spe,
        validation_steps=spe,
        epochs=epochs,
        callbacks=[checkpoint],
        verbose=2)
# Script entry: load the pickled question and answer corpora, then run the
# tokenise/train pipeline on them.
# NOTE(security): pickle.load can execute arbitrary code; only load files
# produced by a trusted preprocessing step.
with open('questionsPure.pkl', 'rb') as fp:
    qs = pickle.load(fp)
with open('answersPure.pkl', 'rb') as fpp:
    ans = pickle.load(fpp)

tokenizePad(qs, ans)
print("program complete.")