"""Train a question -> answer sequence-to-sequence model on qa_prime.csv.

The script tokenises the "Questions" and "Answers" columns, builds an
Embedding/LSTM encoder-decoder and trains it from a batch generator.
Progress messages are appended to robo_train.log.
"""
import string
import sys

import numpy as np
import pandas as pd
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model


def splitFrames(df, headSize):
    """Split df into its first headSize rows and the remaining rows.

    Positional slicing is used so the two parts never overlap, even when
    headSize is larger than the frame (the second part is then empty).

    Returns:
        A (head, tail) tuple of DataFrames.
    """
    hd = df.iloc[:headSize]
    tl = df.iloc[headSize:]
    return hd, tl


def logData(logStr):
    """Append logStr as a single line to robo_train.log."""
    with open('robo_train.log', 'a', encoding="utf8", errors="surrogateescape") as f:
        f.write(logStr + "\n")


def max_length(lines):
    """Return the largest whitespace-separated word count among lines.

    Returns 0 when lines is empty.
    """
    return max((len(line.split()) for line in lines), default=0)


# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Build and compile the encoder-decoder model.

    Args:
        src_vocab: size of the input vocabulary (Embedding input dimension).
        tar_vocab: size of the output vocabulary (softmax width).
        src_timesteps: padded length of the input sequences.
        tar_timesteps: padded length of the output sequences.
        n_units: embedding size and width of both LSTM layers.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    # compile model
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])
    # summarize defined model
    model.summary()
    try:
        plot_model(model, to_file='model.png', show_shapes=True)
    except (ImportError, OSError) as err:
        # The diagram is only diagnostic; a missing pydot/graphviz install
        # should not prevent training.
        logData("could not write model.png: " + str(err))
    return model


def feedData(qTokenizer, ansTokenizer, qVocabularySize, ansVocabularySize, qLength, ansLength,
             batchSize, frame=None):
    """Yield (x, y) training batches forever, cycling through the rows of a frame.

    Args:
        qTokenizer: fitted tokenizer used for the "Questions" column.
        ansTokenizer: fitted tokenizer used for the "Answers" column.
        qVocabularySize: question vocabulary size (only written to the log).
        ansVocabularySize: answer vocabulary size (only written to the log).
        qLength: length every question sequence is padded/truncated to.
        ansLength: length every answer sequence is padded/truncated to.
        batchSize: number of rows per yielded batch.
        frame: DataFrame with "Questions" and "Answers" columns; defaults to
            the module-level df_train.

    Yields:
        A tuple (x, y) where x has shape (batchSize, qLength) and y has
        shape (batchSize, ansLength, 1).

    Raises:
        ValueError: if the frame has no rows.
    """
    if frame is None:
        frame = df_train
    # Rows are addressed by position: after drop_duplicates the index labels
    # have gaps, so label-based lookups would silently return no rows.
    questions = frame["Questions"].tolist()
    answers = frame["Answers"].tolist()
    totalRows = len(questions)
    if totalRows == 0:
        raise ValueError("feedData needs at least one row to generate batches")

    logData("q vocabulary size within generator:" + str(qVocabularySize))
    logData("Ans vocabulary size within generator:" + str(ansVocabularySize))
    logData("q max seq length in generator:" + str(qLength))
    logData("ans max seq length in generator:" + str(ansLength))

    position = 0
    while True:
        # Wrap around inside the batch so every batch is full and no row
        # position ever runs past the end of the frame.
        picks = [(position + offset) % totalRows for offset in range(batchSize)]
        position = (position + batchSize) % totalRows

        questionSequences = qTokenizer.texts_to_sequences([questions[i] for i in picks])
        answerSequences = ansTokenizer.texts_to_sequences([answers[i] for i in picks])
        x_list = np.array(pad_sequences(questionSequences, qLength, padding='post'))
        answersPadded = np.array(pad_sequences(answerSequences, ansLength, padding='post'))
        # sparse_categorical_crossentropy expects a trailing axis of size 1.
        y_list = np.expand_dims(answersPadded, axis=-1)

        logData("x list shape:" + str(x_list.shape))
        logData("y list shape:" + str(y_list.shape))
        yield (x_list, y_list)


# load the data
dfSource = pd.read_csv("qa_prime.csv", engine="python")
dfSource.drop_duplicates(subset='Questions', inplace=True)
dfSource.drop_duplicates(subset='Answers', inplace=True)
df_train, df_validate = splitFrames(dfSource, 2000)

questionTokenizer = Tokenizer()
questionTokenizer.fit_on_texts(df_train["Questions"])
answerTokenizer = Tokenizer()
answerTokenizer.fit_on_texts(df_train["Answers"])

maxQ = max_length(df_train["Questions"].tolist())
maxAns = max_length(df_train["Answers"].tolist())
questionsVocabularySize = len(questionTokenizer.word_index) + 1
answersVocabularySize = len(answerTokenizer.word_index) + 1

btch_size = 3
# Width of the embedding and LSTM layers. This used to be btch_size, which
# tied the model capacity to the batch size (3 units).
# NOTE(review): 256 is a conventional starting value - tune as needed.
hidden_units = 256
# At least one step per epoch, even for very small training sets.
spe = max(1, len(df_train.index) // btch_size)
logData("steps per epoch:" + str(spe))

model = define_model(questionsVocabularySize, answersVocabularySize, maxQ, maxAns, hidden_units)

# val_loss only exists when validation data is supplied; fall back to the
# training loss when the validation split is too small to form one batch.
validation_steps = len(df_validate.index) // btch_size
fit_options = {}
if validation_steps > 0:
    monitored = 'val_loss'
    # Words unseen during tokenizer fitting are dropped by the tokenizers,
    # and over-long rows are truncated by pad_sequences.
    fit_options['validation_data'] = feedData(
        questionTokenizer, answerTokenizer, questionsVocabularySize, answersVocabularySize,
        maxQ, maxAns, btch_size, frame=df_validate)
    fit_options['validation_steps'] = validation_steps
else:
    monitored = 'loss'
checkpoint = ModelCheckpoint('model.h5', monitor=monitored, verbose=1, save_best_only=True, mode='auto')

# A plain Python generator is not safe to share between worker processes
# (each worker would replay the same batches), so training reads from it in
# the default single-worker mode.
model.fit_generator(
    feedData(questionTokenizer, answerTokenizer, questionsVocabularySize, answersVocabularySize,
             maxQ, maxAns, btch_size),
    epochs=100, verbose=2, steps_per_epoch=spe, callbacks=[checkpoint], **fit_options)

# model.h5 holds the best checkpoint; keep the last-epoch weights separately
# so they do not overwrite it.
model.save('model_final.h5')