anujanegi
diff --git a/‎.gitattributes
+3 b/‎.gitattributes
+3
diff --git a/‎DeepRNN/base_model.py
+161 b/‎DeepRNN/base_model.py
+161
diff --git a/‎DeepRNN/config.py
+45 b/‎DeepRNN/config.py
+45
diff --git a/‎DeepRNN/dataset.py
+83 b/‎DeepRNN/dataset.py
+83
diff --git a/‎DeepRNN/main.py
+25 b/‎DeepRNN/main.py
+25
@@ -0,0 +1,3 @@
+*npy filter=lfs diff=lfs merge=lfs -text
+*csv filter=lfs diff=lfs merge=lfs -text
+*gz filter=lfs diff=lfs merge=lfs -text
@@ -0,0 +1,161 @@
+import os
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+import matplotlib.pyplot as plt
+import cPickle as pickle
+import copy
+import json
+from tqdm import tqdm
+
+from utils.nn import NN
+from utils.coco.coco import COCO
+from utils.coco.pycocoevalcap.eval import COCOEvalCap
+from utils.misc import ImageLoader, CaptionData, TopN
+
+class BaseModel(object):
+    """Shared scaffolding for caption models: inference and weight loading.
+
+    This class never defines the graph tensors it uses (self.images,
+    self.conv_feats, self.initial_memory, self.initial_output,
+    self.contexts, self.last_word, self.last_memory, self.last_output,
+    self.memory, self.output, self.probs); a subclass has to create them
+    in build().
+    """
+
+    def __init__(self, config):
+        """Store the configuration, create the helpers and build the graph.
+
+        Args:
+            config: settings object. `phase` is always read; `train_cnn` is
+                only read when the phase is 'train'.
+        """
+        self.config = config
+        self.is_train = config.phase == 'train'
+        # `and` short-circuits, so config.train_cnn is not touched outside
+        # of training (a config without that attribute still works there).
+        self.train_cnn = self.is_train and config.train_cnn
+        self.image_loader = ImageLoader('./DeepRNN/utils/ilsvrc_2012_mean.npy')
+        self.image_shape = [224, 224, 3]
+        self.nn = NN(config)
+        self.global_step = tf.Variable(0,
+                                       name = 'global_step',
+                                       trainable = False)
+        self.build()
+
+    def build(self):
+        """Build the model graph. Must be overridden by subclasses."""
+        raise NotImplementedError()
+
+    def test(self, sess, test_data, vocabulary):
+        """Caption every image in `test_data` and print the best caption.
+
+        Args:
+            sess: session used to run the graph.
+            test_data: dataset providing `num_batches`, `batch_size`,
+                `fake_count` and `next_batch()`.
+            vocabulary: object providing `get_sentence(word_idxs)`.
+        """
+        # Generate the captions for the images
+        for k in tqdm(range(test_data.num_batches), desc='path'):
+            batch = test_data.next_batch()
+            caption_data = self.beam_search(sess, batch, vocabulary)
+
+            # Only the last batch is shortened by fake_count, so the
+            # trailing `fake_count` slots of that batch are not printed.
+            is_last_batch = k >= test_data.num_batches - 1
+            fake_cnt = test_data.fake_count if is_last_batch else 0
+            for slot in range(test_data.batch_size - fake_cnt):
+                # Entry 0 of the sorted results is used as the caption.
+                word_idxs = caption_data[slot][0].sentence
+                caption = vocabulary.get_sentence(word_idxs)
+                print('**'+caption+'**')
+
+    def beam_search(self, sess, image_files, vocabulary):
+        """Use beam search to generate the captions for a batch of images.
+
+        Args:
+            sess: session used to run the graph.
+            image_files: batch of image paths handed to the image loader.
+            vocabulary: object whose `words[i]` is the word for index `i`;
+                the word '.' marks a caption as complete.
+
+        Returns:
+            A list with one entry per batch slot, each being the sorted
+            content of that slot's TopN container (complete captions, or
+            the partial ones when no caption was completed).
+        """
+        # Feed in the images to get the contexts and the initial LSTM states
+        config = self.config
+        images = self.image_loader.load_images(image_files)
+        contexts, initial_memory, initial_output = sess.run(
+            [self.conv_feats, self.initial_memory, self.initial_output],
+            feed_dict = {self.images: images})
+
+        partial_caption_data = []
+        complete_caption_data = []
+        for k in range(config.batch_size):
+            initial_beam = CaptionData(sentence = [],
+                                       memory = initial_memory[k],
+                                       output = initial_output[k],
+                                       score = 1.0)
+            partial_caption_data.append(TopN(config.beam_size))
+            partial_caption_data[-1].push(initial_beam)
+            complete_caption_data.append(TopN(config.beam_size))
+
+        # Run beam search
+        for idx in range(config.max_caption_length):
+            # Take a snapshot of the current beams and empty the containers
+            # so that this step can refill them with the extended captions.
+            partial_caption_data_lists = []
+            for k in range(config.batch_size):
+                data = partial_caption_data[k].extract()
+                partial_caption_data_lists.append(data)
+                partial_caption_data[k].reset()
+
+            # Only the single initial beam exists on the first step.
+            num_steps = 1 if idx == 0 else config.beam_size
+            for b in range(num_steps):
+                if idx == 0:
+                    # No previous word yet: index 0 is fed for every image.
+                    last_word = np.zeros((config.batch_size), np.int32)
+                else:
+                    last_word = np.array([pcl[b].sentence[-1]
+                                        for pcl in partial_caption_data_lists],
+                                        np.int32)
+
+                last_memory = np.array([pcl[b].memory
+                                        for pcl in partial_caption_data_lists],
+                                        np.float32)
+                last_output = np.array([pcl[b].output
+                                        for pcl in partial_caption_data_lists],
+                                        np.float32)
+
+                memory, output, scores = sess.run(
+                    [self.memory, self.output, self.probs],
+                    feed_dict = {self.contexts: contexts,
+                                 self.last_word: last_word,
+                                 self.last_memory: last_memory,
+                                 self.last_output: last_output})
+
+                # Find the beam_size most probable next words
+                for k in range(config.batch_size):
+                    caption_data = partial_caption_data_lists[k][b]
+                    words_and_scores = list(enumerate(scores[k]))
+                    words_and_scores.sort(key=lambda x: -x[1])
+                    # NOTE(review): beam_size + 1 candidates are kept,
+                    # presumably so that beam_size partial captions remain
+                    # even when one candidate is the terminating '.'.
+                    words_and_scores = words_and_scores[0:config.beam_size+1]
+
+                    # Append each of these words to the current partial caption
+                    for w, s in words_and_scores:
+                        sentence = caption_data.sentence + [w]
+                        score = caption_data.score * s
+                        beam = CaptionData(sentence,
+                                           memory[k],
+                                           output[k],
+                                           score)
+                        if vocabulary.words[w] == '.':
+                            complete_caption_data[k].push(beam)
+                        else:
+                            partial_caption_data[k].push(beam)
+
+        results = []
+        for k in range(config.batch_size):
+            # Fall back to the unfinished captions when none was completed.
+            if complete_caption_data[k].size() == 0:
+                complete_caption_data[k] = partial_caption_data[k]
+            results.append(complete_caption_data[k].extract(sort=True))
+
+        return results
+
+    def load(self, sess, model_file=None):
+        """Load saved variable values into the session.
+
+        Args:
+            sess: session in which the assignments are run.
+            model_file: path of the .npy file to load. When None, the path
+                is derived from the pickled config found at
+                `<config.save_dir>/config.pickle` (its `save_dir` and
+                `global_step`).
+        """
+        if model_file is not None:
+            save_path = model_file
+        else:
+            info_path = os.path.join(self.config.save_dir, "config.pickle")
+            # The context manager closes the file even if unpickling fails.
+            with open(info_path, "rb") as info_file:
+                saved_config = pickle.load(info_file)
+            save_path = os.path.join(saved_config.save_dir,
+                                     str(saved_config.global_step)+".npy")
+
+        # WARNING: the file stores a pickled dict, so loading it can execute
+        # arbitrary code; only load model files from a trusted source.
+        # allow_pickle must be explicit because recent NumPy releases
+        # refuse to load object arrays by default.
+        data_dict = np.load(save_path, allow_pickle=True).item()
+        for v in tqdm(tf.global_variables()):
+            # Variables without a saved value keep their current value.
+            if v.name in data_dict:
+                sess.run(v.assign(data_dict[v.name]))
+
+    def load_cnn(self, session, data_path, ignore_missing=True):
+        """Load a pretrained CNN model.
+
+        Args:
+            session: session in which the assignments are run.
+            data_path: path of a .npy file holding a pickled dict that maps
+                a scope name to a dict of {parameter name: value}.
+            ignore_missing: when True (default), a ValueError raised while
+                fetching or assigning a parameter is skipped; when False it
+                is propagated to the caller.
+
+        Raises:
+            ValueError: only when `ignore_missing` is False and a parameter
+                cannot be fetched or assigned.
+        """
+        # WARNING: pickled content, see load(); trusted files only.
+        data_dict = np.load(data_path, allow_pickle=True).item()
+        for op_name in tqdm(data_dict):
+            with tf.variable_scope(op_name, reuse = True):
+                # items() works on both Python 2 and Python 3.
+                for param_name, data in data_dict[op_name].items():
+                    try:
+                        var = tf.get_variable(param_name)
+                        session.run(var.assign(data))
+                    except ValueError:
+                        if not ignore_missing:
+                            raise
@@ -0,0 +1,45 @@
+
+class Config(object):
+    """ Container for the model, optimization and vocabulary settings. """
+    def __init__(self):
+        # The helpers run in this order so that the attributes are created
+        # in the same sequence as before.
+        self._set_architecture()
+        self._set_regularization()
+        self._set_optimization()
+        self._set_vocabulary()
+
+    def _set_architecture(self):
+        """ Settings about the model architecture. """
+        self.cnn = 'vgg16'               # 'vgg16' or 'resnet50'
+        self.max_caption_length = 20
+        self.dim_embedding = 512
+        self.num_lstm_units = 512
+        self.num_initalize_layers = 2    # 1 or 2
+        self.dim_initalize_layer = 512
+        self.num_attend_layers = 2       # 1 or 2
+        self.dim_attend_layer = 512
+        self.num_decode_layers = 2       # 1 or 2
+        self.dim_decode_layer = 1024
+
+    def _set_regularization(self):
+        """ Settings about the weight initialization and regularization. """
+        self.fc_kernel_initializer_scale = 0.08
+        self.fc_kernel_regularizer_scale = 1e-4
+        self.fc_activity_regularizer_scale = 0.0
+        self.conv_kernel_regularizer_scale = 1e-4
+        self.conv_activity_regularizer_scale = 0.0
+        self.fc_drop_rate = 0.5
+        self.lstm_drop_rate = 0.3
+        self.attention_loss_factor = 0.01
+
+    def _set_optimization(self):
+        """ Settings about the optimization. """
+        self.num_epochs = 100
+        self.batch_size = 32
+        self.optimizer = 'Adam'    # 'Adam', 'RMSProp', 'Momentum' or 'SGD'
+        self.initial_learning_rate = 0.0001
+        self.learning_rate_decay_factor = 1.0
+        self.num_steps_per_decay = 100000
+        self.momentum = 0.0
+        self.clip_gradients = 5.0
+        self.use_nesterov = True
+        self.decay = 0.9
+        self.centered = True
+        self.beta2 = 0.999
+        self.beta1 = 0.9
+        self.epsilon = 1e-6
+
+    def _set_vocabulary(self):
+        """ Settings about the vocabulary. """
+        self.vocabulary_file = './data/vocabulary.csv'
+        self.vocabulary_size = 5000
@@ -0,0 +1,83 @@
+import os
+import math
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+
+from utils.coco.coco import COCO
+from utils.vocabulary import Vocabulary
+
+class DataSet(object):
+    """ Batch iterator over image files and, for training, their captions. """
+    def __init__(self,
+                 image_ids,
+                 image_files,
+                 batch_size,
+                 word_idxs=None,
+                 masks=None,
+                 is_train=False,
+                 shuffle=False):
+        # Everything is stored as a numpy array so that a whole batch can be
+        # selected with a list of indices.
+        self.image_ids = np.array(image_ids)
+        self.image_files = np.array(image_files)
+        self.word_idxs = np.array(word_idxs)
+        self.masks = np.array(masks)
+        self.batch_size = batch_size
+        self.is_train = is_train
+        self.shuffle = shuffle
+        self.setup()
+
+    def setup(self):
+        """ Compute the batch bookkeeping and rewind the dataset. """
+        total = len(self.image_ids)
+        self.count = total
+        self.num_batches = int(np.ceil(float(total) / self.batch_size))
+        # Number of padding entries required to fill up the last batch.
+        self.fake_count = self.num_batches * self.batch_size - total
+        self.idxs = list(range(total))
+        self.reset()
+
+    def reset(self):
+        """ Rewind to the first batch, reshuffling when enabled. """
+        self.current_idx = 0
+        if not self.shuffle:
+            return
+        np.random.shuffle(self.idxs)
+
+    def next_batch(self):
+        """ Return the next batch and advance the cursor.
+
+        Yields `image_files` alone, or the tuple
+        `(image_files, word_idxs, masks)` when `is_train` is set. An
+        incomplete last batch is filled up with `fake_count` randomly
+        chosen indices.
+        """
+        assert self.has_next_batch()
+
+        begin = self.current_idx
+        if self.has_full_next_batch():
+            picked = self.idxs[begin:begin + self.batch_size]
+        else:
+            padding = np.random.choice(self.count, self.fake_count)
+            picked = self.idxs[begin:self.count] + list(padding)
+
+        files = self.image_files[picked]
+        captions = None
+        caption_masks = None
+        if self.is_train:
+            captions = self.word_idxs[picked]
+            caption_masks = self.masks[picked]
+
+        # The cursor only moves once every lookup above has succeeded.
+        self.current_idx += self.batch_size
+        if self.is_train:
+            return files, captions, caption_masks
+        return files
+
+    def has_next_batch(self):
+        """ Tell whether at least one more (possibly partial) batch exists. """
+        return self.current_idx < self.count
+
+    def has_full_next_batch(self):
+        """ Tell whether the next batch can be filled without padding. """
+        return self.current_idx + self.batch_size <= self.count
+
+def prepare_test_data(config):
+    """ Prepare the data for testing the model.
+
+    Args:
+        config: settings object; `test_file_name`, `vocabulary_file`,
+            `vocabulary_size` and `batch_size` are read.
+
+    Returns:
+        A `(dataset, vocabulary)` tuple for the single test image.
+
+    Raises:
+        IOError: if `config.vocabulary_file` does not exist. This module
+            has no way to build a vocabulary, so the file is required.
+    """
+    image_files = [config.test_file_name]
+    image_ids = list(range(len(image_files)))
+    if not os.path.exists(config.vocabulary_file):
+        # The former fallback called build_vocabulary(), which is neither
+        # defined nor imported in this module and therefore failed with a
+        # NameError; fail with an explicit message instead.
+        raise IOError("Vocabulary file not found: %s"
+                      % config.vocabulary_file)
+    vocabulary = Vocabulary(config.vocabulary_size,
+                            config.vocabulary_file)
+    dataset = DataSet(image_ids, image_files, config.batch_size)
+    return dataset, vocabulary
@@ -0,0 +1,25 @@
+#!/usr/bin/python
+import tensorflow as tf
+from config import Config
+from model import CaptionGenerator
+from dataset import prepare_test_data
+
+flags = tf.app.flags.FLAGS
+
+tf.flags.DEFINE_string('test_image', 'image.jpg', 'Test image name')
+# The defaults below are the values that used to be hard-coded in main().
+tf.flags.DEFINE_string('model_file', './data/289999.npy',
+                       'Path of the saved model (.npy) to load')
+tf.flags.DEFINE_integer('beam_size', 3, 'Beam size used by the beam search')
+
+def main(argv):
+    """ Caption the image given by --test_image and print the result.
+
+    Args:
+        argv: command-line arguments passed in by tf.app.run(); unused.
+    """
+    config = Config()
+    # These attributes are not set by Config() and are added for testing.
+    config.test_file_name = flags.test_image
+    config.phase = 'test'
+    config.beam_size = flags.beam_size
+
+    with tf.Session() as sess:
+        data, vocabulary = prepare_test_data(config)
+        model = CaptionGenerator(config)
+        model.load(sess, flags.model_file)
+        # Freeze the graph once loading is done so that the captioning
+        # step cannot add new ops to it.
+        tf.get_default_graph().finalize()
+        model.test(sess, data, vocabulary)
+
+if __name__ == '__main__':
+    tf.app.run()
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+*npy filter=lfs diff=lfs merge=lfs -text`
	`2`	`+*csv filter=lfs diff=lfs merge=lfs -text`
	`3`	`+*gz filter=lfs diff=lfs merge=lfs -text`