diff --git a/training/train.py b/training/train.py
index f2dd0cc..ea7aa28 100644
--- a/training/train.py
+++ b/training/train.py
@@ -3,27 +3,67 @@
 """
 import theano
 import theano.tensor as tensor
-
-import cPickle as pkl
+import _pickle as pkl
 import numpy
 import copy
-
 import os
+import glob
+import collections
+from collections import Counter
+import tensorflow as tf
 import warnings
+warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
 import sys
+from skip_thoughts.data import special_words
 import time
-
 import homogeneous_data
-
 from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
 from utils import *
 from layers import get_layer, param_init_fflayer, fflayer, param_init_gru, gru_layer
 from optim import adam
 from model import init_params, build_model
 from vocab import load_dictionary
+
+def del_all_flags(FLAGS):
+    # Clear any flags left over from a previous run (e.g. in a notebook session).
+    for key in list(FLAGS._flags()):
+        FLAGS.__delattr__(key)
+
+del_all_flags(tf.flags.FLAGS)
+FLAGS = tf.flags.FLAGS
+
+input_directory = 'C:/Users/Hp 15/Desktop/pos/'
+# glob may yield path objects; keep every file as a string, not just the last one.
+input_files = [str(path) for path in glob.glob(input_directory + "*.txt")]
+
+tf.flags.DEFINE_string("vocab_file", "C:/Users/Hp 15/Desktop/vocab/vocab.txt",
+                       "(Optional) existing vocab file. Otherwise, a new vocab "
+                       "file is created and written to the output directory. "
+                       "The file format is a list of newline-separated words, "
+                       "where the word id is the corresponding 0-based index "
+                       "in the file.")
+
+def _build_vocabulary(input_files):
+    if FLAGS.vocab_file:
+        tf.logging.info("Loading existing word_idict file.")
+        word_idict = collections.OrderedDict()
+        with tf.gfile.GFile(FLAGS.vocab_file, mode="r") as f:
+            for i, line in enumerate(f):
+                word = line.strip()
+                assert word not in word_idict, "Attempting to add word twice: %s" % word
+                word_idict[word] = i
+        tf.logging.info("Read word_idict of size %d from %s",
+                        len(word_idict), FLAGS.vocab_file)
+        return word_idict
+
-# main trainer
 def trainer(X,
             dim_word=620, # word vector dimensionality
             dim=2400, # the number of GRU units
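Note on the hunk above: _build_vocabulary only exercises the FLAGS.vocab_file branch, i.e. loading an existing newline-separated vocab file where the 0-based line index is the word id. A minimal standalone sketch of that same file convention with the TensorFlow flag machinery stripped out (load_vocab and the commented path are illustrative, not part of the patch):

    import collections

    def load_vocab(vocab_path):
        # One word per line; the 0-based line index is the word id.
        vocab = collections.OrderedDict()
        with open(vocab_path, mode="r") as f:
            for i, line in enumerate(f):
                word = line.strip()
                assert word not in vocab, "Attempting to add word twice: %s" % word
                vocab[word] = i
        return vocab

    # vocab = load_vocab('C:/Users/Hp 15/Desktop/vocab/vocab.txt')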
+ print ("reloading...") + saveto + + + file = open('C:/Users/Hp 15/Desktop/vocab/vocab.txt', "r") + models_options=file.read() +# with open('C:/Users/Hp 15/Desktop/vocab/vocab.txt', 'r') as f: +# models_options =pkl.load(f) + file = open('C:/Users/Hp 15/Desktop/vocab/word_counts.txt', "r") + wordcount=file.read() + words = list(wordcount) + freqs = Counter(file.read().split()) +# freqs = list(wordcount.values()) + sorted_indices = np.argsort(freqs)[::-1] +#have to look again # load dictionary + print ("Loading dictionary...") worddict = load_dictionary(dictionary) # Inverse dictionary word_idict = dict() - for kk, vv in worddict.iteritems(): - word_idict[vv] = kk - word_idict[0] = '' - word_idict[1] = 'UNK' - - print 'Building model' + word_idict = collections.OrderedDict() + word_idict[special_words.EOS] = special_words.EOS_ID + word_idict[special_words.UNK] = special_words.UNK_ID + for kk , vv in enumerate(sorted_indices[0:-2]): + word_idict[words[vv]] = kk + 2 # 0: EOS, 1: UNK. + + + + +# +# for kk, vv in worddict.iteritems(): +# word_idict[vv] = kk +# word_idict[0] = '' +# word_idict[1] = 'UNK' + + print ("Building model") params = init_params(model_options) # reload parameters if reload_ and os.path.exists(saveto): @@ -95,9 +155,9 @@ def trainer(X, inps = [x, x_mask, y, y_mask, z, z_mask] # before any regularizer - print 'Building f_log_probs...', + print ("Building f_log_probs..."), f_log_probs = theano.function(inps, cost, profile=False) - print 'Done' + print ("Done") # weight decay, if applicable if decay_c > 0.: @@ -109,12 +169,12 @@ def trainer(X, cost += weight_decay # after any regularizer - print 'Building f_cost...', + print ("Building f_cost..."), f_cost = theano.function(inps, cost, profile=False) - print 'Done' + print ("Done") - print 'Done' - print 'Building f_grad...', + print ("Done") + print ("Building f_grad..."), grads = tensor.grad(cost, wrt=itemlist(tparams)) f_grad_norm = theano.function(inps, [(g**2).sum() for g in grads], profile=False) f_weight_norm = theano.function([], [(t**2).sum() for k,t in tparams.iteritems()], profile=False) @@ -131,11 +191,11 @@ def trainer(X, grads = new_grads lr = tensor.scalar(name='lr') - print 'Building optimizers...', + print ("Building optimizers..."), # (compute gradients), (updates parameters) f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) - print 'Optimization' + print ("Optimization") # Each sentence in the minibatch have same length (for encoder) trainX = homogeneous_data.grouper(X) @@ -146,7 +206,7 @@ def trainer(X, for eidx in xrange(max_epochs): n_samples = 0 - print 'Epoch ', eidx + print ("Epoch "), eidx for x, y, z in train_iter: n_samples += len(x) @@ -155,7 +215,7 @@ def trainer(X, x, x_mask, y, y_mask, z, z_mask = homogeneous_data.prepare_data(x, y, z, worddict, maxlen=maxlen_w, n_words=n_words) if x == None: - print 'Minibatch with zero sample under length ', maxlen_w + print ("Minibatch with zero sample under length "), maxlen_w uidx -= 1 continue @@ -165,23 +225,46 @@ def trainer(X, ud = time.time() - ud_start if numpy.isnan(cost) or numpy.isinf(cost): - print 'NaN detected' + print ("NaN detected") return 1., 1., 1. 
@@ -165,23 +225,46 @@ def trainer(X,
             ud = time.time() - ud_start
 
             if numpy.isnan(cost) or numpy.isinf(cost):
-                print 'NaN detected'
+                print("NaN detected")
                 return 1., 1., 1.
 
             if numpy.mod(uidx, dispFreq) == 0:
-                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud
+                print('Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud)
 
             if numpy.mod(uidx, saveFreq) == 0:
-                print 'Saving...',
+                print("Saving...", end=' ')
                 params = unzip(tparams)
                 numpy.savez(saveto, history_errs=[], **params)
-                pkl.dump(model_options, open('%s.pkl'%saveto, 'wb'))
-                print 'Done'
+                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
+                print("Done")
 
-        print 'Seen %d samples'%n_samples
+        print('Seen %d samples' % n_samples)
 
 if __name__ == '__main__':
+    # trainer expects a list of sentences; read one sentence per line.
+    with open('C:/Users/Hp 15/Desktop/11525.txt', 'r') as f:
+        X = f.read().splitlines()
+    trainer(X,
+            dim_word=620, # word vector dimensionality
+            dim=2400, # the number of GRU units
+            encoder='gru',
+            decoder='gru',
+            max_epochs=5,
+            dispFreq=1,
+            decay_c=0.,
+            grad_clip=5.,
+            n_words=20000,
+            maxlen_w=30,
+            optimizer='adam',
+            batch_size=64,
+            saveto='C:/Users/Hp 15/Desktop/vocab/toy.npz',
+            dictionary='C:/Users/Hp 15/Desktop/vocab/vocab.txt',
+            saveFreq=1000,
+            reload_=False)
     pass
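Checkpoint note: the Saving... branch writes the parameter arrays to saveto via numpy.savez and the options dict alongside it as '%s.pkl' % saveto, which is exactly what the reload_ branch reads back. A minimal round-trip sketch under that assumption (stand-in values and a local file name, not the patch's real training state):

    import numpy
    import _pickle as pkl

    saveto = 'toy.npz'                     # the patch points this under C:/Users/Hp 15/Desktop/vocab/
    params = {'W': numpy.zeros((2, 3))}    # stand-in for unzip(tparams)
    model_options = {'dim_word': 620}      # stand-in for the real options dict

    numpy.savez(saveto, history_errs=[], **params)
    with open('%s.pkl' % saveto, 'wb') as f:
        pkl.dump(model_options, f)

    reloaded_params = {k: v for k, v in numpy.load(saveto).items()}
    with open('%s.pkl' % saveto, 'rb') as f:
        reloaded_options = pkl.load(f)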