161 changes: 122 additions & 39 deletions training/train.py
@@ -3,27 +3,67 @@
"""
import theano
import theano.tensor as tensor

import _pickle as pkl  # Python 3 replacement for Python 2's cPickle
import numpy
import copy
import os
import glob
import sys
import time
import collections
from collections import Counter

import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

import tensorflow as tf

import homogeneous_data
from skip_thoughts.data import special_words
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

from utils import *
from layers import get_layer, param_init_fflayer, fflayer, param_init_gru, gru_layer
from optim import adam
from model import init_params, build_model
from vocab import load_dictionary
input_directory = 'C:/Users/Hp 15/Desktop/pos/'
input_files1 = glob.glob(input_directory + "*.txt")
# glob.glob already returns path strings, so no per-item conversion is needed.
input_files = [str(path) for path in input_files1]


def del_all_flags(FLAGS):
    # Remove every previously defined flag so the script can be re-run
    # (e.g. in an interactive session) without "duplicate flag" errors.
    flags_dict = FLAGS._flags()
    for key in list(flags_dict):
        FLAGS.__delattr__(key)


del_all_flags(tf.flags.FLAGS)
FLAGS = tf.flags.FLAGS

word_idict = "C:/Users/Hp 15/Desktop/vocab/vocab.txt"
tf.flags.DEFINE_string("vocab_file", word_idict,
                       "(Optional) existing vocab file. Otherwise, a new vocab "
                       "file is created and written to the output directory. "
                       "The file format is a list of newline-separated words, "
                       "where the word id is the corresponding 0-based index "
                       "in the file.")

def _build_vocabulary(input_files):
    if FLAGS.vocab_file:
        tf.logging.info("Loading existing word_idict file.")
        word_idict = collections.OrderedDict()
        with tf.gfile.GFile(FLAGS.vocab_file, mode="r") as f:
            for i, line in enumerate(f):
                word = line.strip()
                assert word not in word_idict, "Attempting to add word twice: %s" % word
                word_idict[word] = i
        tf.logging.info("Read word_idict of size %d from %s",
                        len(word_idict), FLAGS.vocab_file)
        return word_idict
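# Illustrative example (assumed file contents): a vocab.txt holding the lines
#   <eos>
#   UNK
#   the
# would make _build_vocabulary return
#   OrderedDict([('<eos>', 0), ('UNK', 1), ('the', 2)]).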



# main trainer
def trainer(X,
dim_word=620, # word vector dimensionality
dim=2400, # the number of GRU units
@@ -37,11 +77,12 @@ def trainer(X,
maxlen_w=30,
optimizer='adam',
batch_size = 64,
saveto='C:/Users/Hp 15/Desktop/vocab/toy.npz',  # checkpoint file (a bare directory breaks numpy.savez and the pickle dump below)
dictionary='C:/Users/Hp 15/Desktop/vocab/vocab.txt',
saveFreq=1000,
reload_=False):

print ("Done heloooooooshabnam")
# Model options
model_options = {}
model_options['dim_word'] = dim_word
@@ -61,26 +102,45 @@ def trainer(X,
model_options['saveFreq'] = saveFreq
model_options['reload_'] = reload_

print(model_options)

# reload options
if reload_ and os.path.exists(saveto):
    print('reloading...' + saveto)
    with open('%s.pkl' % saveto, 'rb') as f:
        model_options = pkl.load(f)

# Build a frequency-sorted word list from the word-count file: the most
# frequent words receive the smallest ids after the reserved ones.
with open('C:/Users/Hp 15/Desktop/vocab/word_counts.txt', 'r') as f:
    wordcount = Counter(f.read().split())
words = list(wordcount.keys())
freqs = list(wordcount.values())
sorted_indices = numpy.argsort(freqs)[::-1]

# load dictionary
print("Loading dictionary...")
worddict = load_dictionary(dictionary)

# Word -> id mapping: ids 0 and 1 are reserved for EOS and UNK, and the
# remaining words get ids in order of descending frequency.
word_idict = collections.OrderedDict()
word_idict[special_words.EOS] = special_words.EOS_ID
word_idict[special_words.UNK] = special_words.UNK_ID
for kk, vv in enumerate(sorted_indices[0:-2]):
    word_idict[words[vv]] = kk + 2  # 0: EOS, 1: UNK.
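# Worked example (made-up counts): if word_counts.txt held "the the cat sat sat sat",
# then wordcount == Counter({'sat': 3, 'the': 2, 'cat': 1}) and sorted_indices == [2, 0, 1];
# assuming EOS_ID == 0 and UNK_ID == 1 as the comment above indicates, word_idict maps
# EOS -> 0, UNK -> 1, 'sat' -> 2, and the [0:-2] slice drops the two rarest words to
# offset the two reserved ids.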

print ("Building model")
params = init_params(model_options)
# reload parameters
if reload_ and os.path.exists(saveto):
@@ -95,9 +155,9 @@ def trainer(X,
inps = [x, x_mask, y, y_mask, z, z_mask]

# before any regularizer
print("Building f_log_probs...", end=' ')
f_log_probs = theano.function(inps, cost, profile=False)
print("Done")

# weight decay, if applicable
if decay_c > 0.:
@@ -109,12 +169,12 @@ def trainer(X,
cost += weight_decay
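# weight_decay here is the usual L2 penalty: decay_c multiplied by the sum of
# squared parameter values accumulated over tparams.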

# after any regularizer
print("Building f_cost...", end=' ')
f_cost = theano.function(inps, cost, profile=False)
print("Done")

print("Building f_grad...", end=' ')
grads = tensor.grad(cost, wrt=itemlist(tparams))
f_grad_norm = theano.function(inps, [(g**2).sum() for g in grads], profile=False)
f_weight_norm = theano.function([], [(t**2).sum() for k, t in tparams.items()], profile=False)
@@ -131,11 +191,11 @@ def trainer(X,
grads = new_grads

lr = tensor.scalar(name='lr')
print("Building optimizers...", end=' ')
# (compute gradients), (updates parameters)
f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)
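# Split-update pattern (as in dl4mt-style optimizers): f_grad_shared computes
# the cost and stores gradients in shared variables; f_update then applies one
# Adam step with learning rate lr using those stored gradients.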

print("Optimization")

# Each sentence in the minibatch has the same length (for encoder)
trainX = homogeneous_data.grouper(X)
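# grouper arranges consecutive sentences into (current, forward, backward)
# triples; the skip-thought objective trains the decoders to reconstruct each
# sentence's two neighbours from its encoding.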
@@ -146,7 +206,7 @@ def trainer(X,
for eidx in range(max_epochs):
n_samples = 0

print('Epoch ', eidx)

for x, y, z in train_iter:
n_samples += len(x)
@@ -155,7 +215,7 @@ def trainer(X,
x, x_mask, y, y_mask, z, z_mask = homogeneous_data.prepare_data(x, y, z, worddict, maxlen=maxlen_w, n_words=n_words)
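# x, y, z are (n_timesteps, n_samples) matrices of word ids for the current,
# next and previous sentences; the *_mask arrays mark real tokens versus padding.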

if x is None:
    print('Minibatch with zero sample under length ', maxlen_w)
    uidx -= 1
    continue

@@ -165,23 +225,46 @@ def trainer(X,
ud = time.time() - ud_start

if numpy.isnan(cost) or numpy.isinf(cost):
    print("NaN detected")
    return 1., 1., 1.

if numpy.mod(uidx, dispFreq) == 0:
    print('Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud)

if numpy.mod(uidx, saveFreq) == 0:
    print("Saving...", end=' ')

    params = unzip(tparams)
    numpy.savez(saveto, history_errs=[], **params)
    pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
    print("Done")

print('Seen %d samples' % n_samples)

if __name__ == '__main__':
print ("Done shabnam")
file = open('C:/Users/Hp 15/Desktop/11525.txt', "r")
X=file.read()
trainer(X,
dim_word=620, # word vector dimensionality
dim=2400, # the number of GRU units
encoder='gru',
decoder='gru',
max_epochs=5,
dispFreq=1,
decay_c=0.,
grad_clip=5.,
n_words=20000,
maxlen_w=30,
optimizer='adam',
batch_size = 64,
saveto='C:/Users/Hp 15/Desktop/vocab/',

dictionary='C:/Users/Hp 15/Desktop/vocab/vocab.txt',
saveFreq=1000,
reload_=False)
print (".npz shabnam")
pass