DeepVoice3 for Automatic Image Captioning


In [0]:
import os
from os.path import exists, join, expanduser

# Clone
name = "deepvoice3_pytorch"

if not exists(name):
  ! git clone$name
In [0]:
# Enter the cloned repository and check out a fixed commit
# (--quiet suppresses git's informational output).
% cd /content/deepvoice3_pytorch
!git checkout 7a10ac6763eda92595e257543494b6a95f64229b --quiet
In [0]:
%pylab inline
! pip install -q librosa nltk

import torch
import numpy as np
import librosa
import librosa.display
import IPython
from IPython.display import Audio

# need this for English text processing frontend
import nltk
! python -m nltk.downloader cmudict

Download a pre-trained model

Download the pre-trained checkpoint and its preset (hyperparameter configuration) from Dropbox.

In [0]:
preset = "20180505_deepvoice3_ljspeech.json"
checkpoint_path = "20180505_deepvoice3_checkpoint_step000640000.pth"
In [0]:
# Download the preset and the checkpoint only when they are not already on disk.
# NOTE(review): both curl commands are given an empty URL (""), so nothing can be
# fetched as written — the download links for the two files need to be filled in.
if not exists(preset):
  !curl -O -L ""
if not exists(checkpoint_path):
  !curl -O -L ""
In [0]:
! pip install -e '.[train]'
! pip install nnmnkwii
! pip install tensorboardX


Set up hyperparameters

In [0]:
import hparams
import json

# Load parameters from preset
with open(preset) as f:
  # Override the default hyperparameters with the values stored in the preset JSON.
  hparams.hparams.parse_json(f.read())

# Inject frontend text processor
import synthesis
import train
from deepvoice3_pytorch import frontend
# Use the English ("en") frontend for both the synthesis and the train module.
synthesis._frontend = getattr(frontend, "en")
train._frontend =  getattr(frontend, "en")

# aliases
fs = hparams.hparams.sample_rate
hop_length = hparams.hparams.hop_size

Define utility functions

In [0]:
def tts(model, text, p=0, speaker_id=None, fast=True, figures=True):
  """Synthesize `text` with `model` and show an inline audio player.

  The work is delegated to `synthesis.tts`, which is called with
  `(model, text, p, speaker_id, fast)`. When `figures` is true, the returned
  alignment and spectrogram are passed to `visualize` before the audio widget
  (sample rate `fs`) is displayed. Nothing is returned.
  """
  from synthesis import tts as synthesize
  waveform, alignment, spectrogram, _mel = synthesize(model, text, p, speaker_id, fast)
  if figures:
    visualize(alignment, spectrogram)
  player = Audio(waveform, rate=fs)
  IPython.display.display(player)
def visualize(alignment, spectrogram):
  """Plot the alignment and the spectrogram as two stacked panels.

  Both arrays are transposed before plotting. The plotting names used here
  (`figure`, `subplot`, `imshow`, `colorbar`, ...) come from the `%pylab inline`
  namespace. Each plot gets its own subplot; drawing both on the same implicit
  axes would let the spectrogram overwrite the alignment image.
  """
  label_fontsize = 16
  figure(figsize=(16, 16))

  # Top panel: alignment.
  subplot(2, 1, 1)
  imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
  xlabel("Decoder timestamp", fontsize=label_fontsize)
  ylabel("Encoder timestamp", fontsize=label_fontsize)
  colorbar()

  # Bottom panel: spectrogram on a linear frequency axis.
  subplot(2, 1, 2)
  librosa.display.specshow(spectrogram.T, sr=fs,
                           hop_length=hop_length, x_axis="time", y_axis="linear")
  xlabel("Time", fontsize=label_fontsize)
  ylabel("Hz", fontsize=label_fontsize)
  colorbar()
  tight_layout()

Load the model checkpoint

In [0]:
from train import build_model
from train import restore_parts, load_checkpoint

# Build the speech model and restore the downloaded checkpoint into it in one step.
# NOTE(review): the trailing `None, True` arguments are presumably the optimizer and a
# reset-optimizer flag of train.load_checkpoint — confirm against the train module.
model_speak = load_checkpoint(checkpoint_path, build_model(), None, True)

Automatic Image Captioning

In [0]:
# NOTE(review): `git clone` below has no repository URL, so it fails as written.
# Later cells read from /content/ml-art-project3, so that is where the checkout must land.
% cd /content/
! git clone
In [0]:
# cp -r /content/ml-art-project3/data /content/deepvoice3_pytorch/
% cd /content/ml-art-project3/data
In [0]:
import requests
import zipfile
import io

# Fetch a zip archive into memory only when the Flickr8k_Dataset folder is absent.
# NOTE(review): `url` is an empty string, and the ZipFile `z` is opened but never
# extracted, so this block cannot populate the folder as written.
if not os.path.exists("/content/ml-art-project3/data/Flickr8k_Dataset"):
  url = ''
  r = requests.get(url)
  z = zipfile.ZipFile(io.BytesIO(r.content))

# NOTE(review): this second block repeats the same path check and the same statements
# as the first one — presumably it was meant for a different archive; confirm.
if not os.path.exists("/content/ml-art-project3/data/Flickr8k_Dataset"):
  url = ''
  r = requests.get(url)
  z = zipfile.ZipFile(io.BytesIO(r.content))
In [0]:
import numpy as np
from numpy import array
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import string
import os
from PIL import Image
import glob
import pickle
# from pickle #import dump, load
from time import time
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import LSTM, Embedding, TimeDistributed, Dense, RepeatVector,\
                         Activation, Flatten, Reshape, concatenate, Dropout, BatchNormalization
from keras.optimizers import Adam, RMSprop
from keras.layers.wrappers import Bidirectional
from keras.layers.merge import add
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.models import Model
from keras import Input, layers
from keras import optimizers
from keras.applications.inception_v3 import preprocess_input
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
In [0]:
def load_doc(filename):
    """Return the whole content of the text file `filename` as a single string.

    Raises whatever `open` raises (e.g. FileNotFoundError) when the file
    cannot be opened.
    """
    # The context manager closes the handle even when reading fails.
    with open(filename, 'r') as file:
        text = file.read()
    return text

def load_set(filename):
    """Return the set of identifiers listed in `filename`, one per line.

    The identifier of a line is the text before its first '.', e.g.
    'abc.jpg' -> 'abc'. Empty lines are skipped.
    """
    doc = load_doc(filename)
    return {line.split('.')[0] for line in doc.split('\n') if len(line) >= 1}

filename = '/content/ml-art-project3/data/Flickr_8k.trainImages.txt'
# NOTE: this rebinds the name `train`, which an earlier cell bound to the imported
# `train` module (`import train`); from here on `train` is the value returned by load_set.
train = load_set(filename)

def to_lines(descriptions):
    """Flatten a mapping of key -> list of descriptions into one list.

    Descriptions appear in the mapping's iteration order, keeping the
    per-key order of each list.
    """
    flattened = []
    for captions in descriptions.values():
        flattened.extend(captions)
    return flattened

def load_clean_descriptions(filename, dataset):
    """Load the descriptions of the images in `dataset` from `filename`.

    Each non-blank line is split on whitespace; the first token is the image
    id and the remaining tokens form the description. Descriptions of ids
    contained in `dataset` are wrapped as 'startseq ... endseq' and collected.

    Returns a dict mapping image id -> list of wrapped descriptions.
    """
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        # A blank line (e.g. a trailing newline) has no image id to read.
        if not tokens:
            continue
        image_id, image_desc = tokens[0], tokens[1:]
        if image_id in dataset:
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            descriptions.setdefault(image_id, list()).append(desc)
    return descriptions

train_descriptions = load_clean_descriptions('/content/ml-art-project3/data/descriptions.txt', train)

def max_length(descriptions):
    """Return the largest whitespace-separated word count among all descriptions."""
    lengths = [len(text.split()) for text in to_lines(descriptions)]
    return max(lengths)

# NOTE: rebinds `max_length` from the function defined above to its integer result,
# so running this line a second time raises TypeError ('int' object is not callable).
max_length = max_length(train_descriptions)

Load Data

In [0]:
def preprocess(image_path):
    """Load the image at `image_path` and turn it into a one-image input batch.

    The image is loaded at 299x299, converted to an array, given a leading
    batch axis and passed through the InceptionV3 `preprocess_input`.
    """
    loaded = image.load_img(image_path, target_size=(299, 299))
    batch = np.expand_dims(image.img_to_array(loaded), axis=0)
    return preprocess_input(batch)

# InceptionV3 with ImageNet weights; `model_new` shares its input but exposes the
# output of the second-to-last layer instead of the final one.
model = InceptionV3(weights='imagenet')
model_new = Model(inputs=model.input, outputs=model.layers[-2].output)

def encode(image):
    """Return the `model_new` feature vector for the image file at path `image`."""
    batch = preprocess(image)
    features = model_new.predict(batch)
    # Drop the leading batch axis, e.g. (1, 2048) -> (2048,).
    return np.reshape(features, features.shape[1])

# Gather every training caption into one flat list.
all_train_captions = []
for key, val in train_descriptions.items():
    for cap in val:
        all_train_captions.append(cap)

# Count how often each space-separated word occurs in the training captions and keep
# only the words seen at least `word_count_threshold` times.
word_count_threshold = 10
word_counts = {}
nsents = 0
for sent in all_train_captions:
    nsents += 1
    for w in sent.split(' '):
        if w in word_counts:
            word_counts[w] += 1
        else:
            word_counts[w] = 1
vocab = [w for w, count in word_counts.items() if count >= word_count_threshold]
print('preprocessed words %d -> %d' % (len(word_counts), len(vocab)))

# Two-way lookup between words and integer ids; ids start at 1.
ixtoword = {}
wordtoix = {}

ix = 1
for w in vocab:
    wordtoix[w], ixtoword[ix] = ix, w
    ix += 1
# One more than the number of words, because id 0 is not assigned to any word.
vocab_size = len(ixtoword) + 1

def data_generator(descriptions, photos, wordtoix, max_length, num_photos_per_batch):
    """Endlessly yield training batches built from `num_photos_per_batch` photos.

    For every description of a photo, each prefix of its encoded word sequence
    becomes one sample: the photo feature, the prefix padded to `max_length`,
    and the next word one-hot encoded over `vocab_size` classes (`vocab_size`
    is read from the enclosing namespace). Words missing from `wordtoix` are
    dropped before encoding.

    Yields `[[array(X1), array(X2)], array(y)]` where X1 holds photo features,
    X2 the padded input sequences and y the one-hot targets.
    Raises KeyError when `photos` has no entry for `key + '.jpg'`.
    """
    X1, X2, y = list(), list(), list()
    n = 0  # photos accumulated in the current batch
    # loop for ever over images
    while 1:
        for key, desc_list in descriptions.items():
            n += 1
            # retrieve the photo feature
            photo = photos[key+'.jpg']
            for desc in desc_list:
                # encode the sequence
                seq = [wordtoix[word] for word in desc.split(' ') if word in wordtoix]
                # split one sequence into multiple X, y pairs
                for i in range(1, len(seq)):
                    # split into input and output pair
                    in_seq, out_seq = seq[:i], seq[i]
                    # pad input sequence
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    # encode output sequence
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                    # store
                    X1.append(photo)
                    X2.append(in_seq)
                    y.append(out_seq)
            # yield the batch data, then start collecting the next batch
            if n == num_photos_per_batch:
                yield [[array(X1), array(X2)], array(y)]
                X1, X2, y = list(), list(), list()
                n = 0
glove_dir = '/content/ml-art-project3/data/'
embeddings_index = {} # empty dictionary

# Each line is parsed as a word followed by its vector components.
# The context manager closes the file once all lines are read.
with open(os.path.join(glove_dir, 'glove.6B.200d.txt'), encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

embedding_dim = 200

# One row per vocabulary id (row 0 included); every row starts out as zeros.
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in wordtoix.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # Words without a pre-trained vector keep their all-zero row.
        continue
    embedding_matrix[i] = embedding_vector

Build LSTM models

In [0]:
# Image branch: 2048-dim input -> dropout -> 256-unit dense layer.
inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
# Text branch: sequence of `max_length` word ids -> embedding (id 0 masked) -> dropout -> LSTM.
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)
# Decoder: the two 256-unit branches are merged with `add`, followed by a dense layer
# and a softmax over `vocab_size` classes.
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)
# NOTE: this rebinds `model`, which previously referred to the InceptionV3 network.
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
In [0]:
# Load the pre-trained vectors collected in `embedding_matrix` into the Embedding layer
# and freeze it. The layer is looked up by type rather than by position so the code
# does not depend on how Keras orders `model.layers`.
embedding_layer = next(layer for layer in model.layers if isinstance(layer, Embedding))
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False
model.compile(loss='categorical_crossentropy', optimizer='adam')
# NOTE(security): pickle.load can execute arbitrary code; only load this file when it
# comes from a trusted source.
with open("/content/ml-art-project3/data/encoded_test_images.pkl", "rb") as encoded_pickle:
    encoding_test = pickle.load(encoded_pickle)


In [0]:
def Search(photo):
    """Greedily decode a caption for the encoded image `photo`.

    Starting from 'startseq', the caption model is asked for the most likely
    next word given the photo and the words produced so far. Decoding stops
    when 'endseq' is produced, when the predicted id has no word in
    `ixtoword`, or after `max_length` steps.

    Returns the caption as a string without the 'startseq'/'endseq' markers.
    """
    in_text = 'startseq'
    for _ in range(max_length):
        encoded = [wordtoix[w] for w in in_text.split() if w in wordtoix]
        padded = pad_sequences([encoded], maxlen=max_length)
        yhat = model.predict([photo, padded], verbose=0)
        # Word ids start at 1, so a predicted id of 0 has no entry in `ixtoword`.
        word = ixtoword.get(np.argmax(yhat))
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    final = in_text.split()[1:]  # drop 'startseq'
    # Strip the end marker only when it was actually generated; otherwise the
    # last real word would be lost when decoding stops at `max_length`.
    if final and final[-1] == 'endseq':
        final = final[:-1]
    return ' '.join(final)

def comment(z, print_text=False):
  """Caption the `z`-th test image and read the caption aloud.

  The encoding of the `z`-th key of `encoding_test` is reshaped to (1, 2048)
  and captioned with `Search`. The caption gets a trailing '.' and an
  upper-cased first character, is printed when `print_text` is true, and is
  then spoken via `tts` with `model_speak`. Returns nothing.
  """
  pic = list(encoding_test.keys())[z]
  image = encoding_test[pic].reshape((1,2048))
  # NOTE(review): `images` is not used below — presumably it was meant for displaying
  # the source picture; confirm.
  images = '/content/ml-art-project3/data/Flicker8k_Dataset/'
  caption = Search(image)+'.'
  caption = caption[0].upper() + caption[1:]
  if print_text:
    print(caption)
  tts(model_speak, caption, figures=False)
import warnings


In [0]:
In [0]:
In [0]:
In [0]:
tts(model_speak, "Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick?", figures=False)
In [0]:
tts(model_speak, "A skunk sat on a stump and thunk the stump stunk, but the stump thunk the skunk stunk", figures=False)
In [0]:
from google.colab import drive
# Mount Google Drive at /gdrive; the copy command in the next cell reads from
# "/gdrive/My Drive/...", which only exists once the drive is mounted.
drive.mount('/gdrive')
In [0]:
cp /gdrive/My\ Drive/ECE\ 188\ project\ 3/glove.6B.200d.txt /content/ml-art-project3/data/