# --- Notebook setup cell: clone DeepVoice3 and install its audio/text deps. ---
# NOTE(review): `!` and `%` lines are IPython/Colab escapes; this file only
# runs inside a notebook, not as a plain Python script.
import os
from os.path import exists, join, expanduser
# Clone
name = "deepvoice3_pytorch"
if not exists(name):
    ! git clone https://github.com/r9y9/$name
# Pin a known-good commit so upstream changes cannot break this notebook.
% cd /content/deepvoice3_pytorch
!git checkout 7a10ac6763eda92595e257543494b6a95f64229b --quiet
# %pylab dumps numpy and matplotlib.pyplot names (figure, imshow, xlabel, ...)
# into the global namespace; visualize() further down relies on them.
%pylab inline
! pip install -q librosa nltk
import torch
import numpy as np
import librosa
import librosa.display
import IPython
from IPython.display import Audio
# need this for English text processing frontend
import nltk
# CMU pronouncing dictionary, used by the English TTS frontend.
! python -m nltk.downloader cmudict
# Grab the checkpoint and configuration from dropbox.
# Pretrained LJSpeech DeepVoice3 preset + 640k-step checkpoint (Dropbox).
preset = "20180505_deepvoice3_ljspeech.json"
checkpoint_path = "20180505_deepvoice3_checkpoint_step000640000.pth"
if not exists(preset):
    !curl -O -L "https://www.dropbox.com/s/0ck82unm0bo0rxd/20180505_deepvoice3_ljspeech.json"
if not exists(checkpoint_path):
    !curl -O -L "https://www.dropbox.com/s/5ucl9remrwy5oeg/20180505_deepvoice3_checkpoint_step000640000.pth"
# Install deepvoice3_pytorch itself (editable, with training extras) plus deps.
! pip install -e '.[train]'
! pip install nnmnkwii
! pip install tensorboardX
import hparams
import json
# Load parameters from preset
with open(preset) as f:
    hparams.hparams.parse_json(f.read())
# Inject frontend text processor (English) into both project modules.
import synthesis
import train
from deepvoice3_pytorch import frontend
synthesis._frontend = getattr(frontend, "en")
train._frontend = getattr(frontend, "en")
# aliases used by tts()/visualize() below
fs = hparams.hparams.sample_rate
hop_length = hparams.hparams.hop_size
def tts(model, text, p=0, speaker_id=None, fast=True, figures=True):
    """Synthesize `text` with the DeepVoice3 `model` and play it inline.

    When `figures` is true, also plot the attention alignment and the
    predicted spectrogram via visualize().
    """
    from synthesis import tts as _tts
    wav, align, spec, _mel = _tts(model, text, p, speaker_id, fast)
    if figures:
        visualize(align, spec)
    # `fs` is the global sample rate pulled from the hparams preset.
    IPython.display.display(Audio(wav, rate=fs))
def visualize(alignment, spectrogram):
    """Plot the attention alignment (top) and linear spectrogram (bottom).

    NOTE(review): figure/subplot/imshow/xlabel/... come from the `%pylab
    inline` namespace, not explicit imports — this only works in the notebook.
    """
    label_fontsize = 16
    figure(figsize=(16,16))
    subplot(2,1,1)
    # Transpose so decoder steps run along x and encoder steps along y.
    imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
    xlabel("Decoder timestamp", fontsize=label_fontsize)
    ylabel("Encoder timestamp", fontsize=label_fontsize)
    colorbar()
    subplot(2,1,2)
    # fs/hop_length are the global aliases taken from the hparams preset.
    librosa.display.specshow(spectrogram.T, sr=fs,
                             hop_length=hop_length, x_axis="time", y_axis="linear")
    xlabel("Time", fontsize=label_fontsize)
    ylabel("Hz", fontsize=label_fontsize)
    tight_layout()
    colorbar()
from train import build_model
from train import restore_parts, load_checkpoint
# Build the TTS network and load the pretrained checkpoint weights.
model_speak = build_model()
model_speak = load_checkpoint(checkpoint_path, model_speak, None, True)
# Fetch the course project repo (captioning data + pretrained caption model).
% cd /content/
! git clone https://github.com/glh3025/ml-art-project3.git
# cp -r /content/ml-art-project3/data /content/deepvoice3_pytorch/
% cd /content/ml-art-project3/data
import requests
import zipfile
import io
# Download and unpack the Flickr8k image set into the repo data directory.
# NOTE(review): the archive apparently extracts to "Flicker8k_Dataset" (sic —
# that spelling is used in comment() below), so this "Flickr8k_Dataset"
# existence check may never match and the download would repeat on re-runs;
# verify the extracted folder name.
if not os.path.exists("/content/ml-art-project3/data/Flickr8k_Dataset"):
    url = 'https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip'
    r = requests.get(url)
    z = zipfile.ZipFile(io.BytesIO(r.content))
    z.extractall()
    print("Done")
# Download the GloVe 200-d embeddings used to seed the Embedding layer.
# Fixes from the original cell:
#  * the existence check tested the Flickr8k_Dataset path (copy-paste bug),
#    so this cell could be skipped even when the GloVe file was missing;
#  * the URL points at a plain .txt resource, so wrapping the response in
#    zipfile.ZipFile() would always raise BadZipFile — write the bytes out
#    directly instead.
# NOTE(review): Stanford distributes these vectors as glove.6B.zip; if this
# direct .txt URL 404s, fall back to the Drive copy at the end of the file.
glove_path = "/content/ml-art-project3/data/glove.6B.200d.txt"
if not os.path.exists(glove_path):
    url = 'http://nlp.stanford.edu/data/glove.6B.200d.txt'
    r = requests.get(url)
    with open(glove_path, 'wb') as out:
        out.write(r.content)
    print("Done")
import numpy as np
from numpy import array
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import string
import os
from PIL import Image
import glob
import pickle
# from pickle #import dump, load
from time import time
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import LSTM, Embedding, TimeDistributed, Dense, RepeatVector,\
Activation, Flatten, Reshape, concatenate, Dropout, BatchNormalization
from keras.optimizers import Adam, RMSprop
from keras.layers.wrappers import Bidirectional
from keras.layers.merge import add
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.models import Model
from keras import Input, layers
from keras import optimizers
from keras.applications.inception_v3 import preprocess_input
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
def load_doc(filename):
    """Read the text file at `filename` and return its entire contents.

    Uses a context manager so the handle is closed even if read() raises;
    the original open()/close() pair leaked the handle on error.
    """
    with open(filename, 'r') as file:
        return file.read()
def load_set(filename):
    """Return the set of image identifiers listed in `filename`.

    Each non-blank line is expected to start with an image filename; the
    identifier is everything before the first '.'.
    """
    doc = load_doc(filename)
    identifiers = set()
    for line in doc.split('\n'):
        if line:  # skip blank lines
            identifiers.add(line.split('.')[0])
    return identifiers
# Image identifiers that make up the training split.
filename = '/content/ml-art-project3/data/Flickr_8k.trainImages.txt'
# NOTE(review): this rebinds `train` (the deepvoice3 module imported above)
# to a set of IDs; the module is no longer reachable by name after this line.
train = load_set(filename)
def to_lines(descriptions):
    """Flatten a {key: [caption, ...]} mapping into one list of captions.

    Replaces the original side-effecting list comprehension (built and
    discarded a throwaway list of None) with a plain accumulation loop.
    """
    all_desc = list()
    for captions in descriptions.values():
        all_desc.extend(captions)
    return all_desc
def load_clean_descriptions(filename, dataset):
    """Load captions for the image ids in `dataset`.

    Each line of the file is "<image_id> <caption words...>". Captions are
    wrapped with 'startseq'/'endseq' markers for the decoder. Returns
    {image_id: [caption, ...]}.
    """
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        if not tokens:
            # Skip blank lines — a trailing newline in the file would
            # otherwise raise IndexError on tokens[0].
            continue
        image_id, image_desc = tokens[0], tokens[1:]
        if image_id in dataset:
            if image_id not in descriptions:
                descriptions[image_id] = list()
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            descriptions[image_id].append(desc)
    return descriptions
# Captions for the training images, wrapped with startseq/endseq markers.
train_descriptions = load_clean_descriptions('/content/ml-art-project3/data/descriptions.txt', train)
def max_length(descriptions):
    """Return the word count of the longest caption in `descriptions`."""
    return max(len(caption.split()) for caption in to_lines(descriptions))
# NOTE(review): rebinds `max_length` from the function above to its int
# result; the function is no longer callable after this point.
max_length = max_length(train_descriptions)
def preprocess(image_path):
    """Load the image at `image_path` as an InceptionV3-ready batch of one.

    Resizes to 299x299, converts to an array, adds the batch axis, and
    applies InceptionV3's preprocess_input normalization.
    """
    img = image.load_img(image_path, target_size=(299, 299))
    batch = np.expand_dims(image.img_to_array(img), axis=0)
    return preprocess_input(batch)
# ImageNet-pretrained InceptionV3; dropping the final (softmax) layer makes
# model_new emit the 2048-d pooled feature vector used by the caption model.
model = InceptionV3(weights='imagenet')
model_new = Model(model.input, model.layers[-2].output)
def encode(image):
    """Return the InceptionV3 feature vector for the image at path `image`.

    Note: the parameter name shadows the keras `image` module at call time,
    which is fine here since the module is only used inside preprocess().
    """
    batch = preprocess(image)            # preprocess the image
    features = model_new.predict(batch)  # encoding vector for the image
    # Drop the batch axis: (1, 2048) -> (2048,)
    return np.reshape(features, features.shape[1])
# Flatten every caption list into one list of training caption strings.
all_train_captions = []
for key, val in train_descriptions.items():
    for cap in val:
        all_train_captions.append(cap)
len(all_train_captions)  # notebook echo of the caption count

# Count word frequencies across all captions, then keep only words that
# occur at least `word_count_threshold` times.
word_count_threshold = 10
word_counts = {}
nsents = 0
for sent in all_train_captions:
    nsents += 1
    for w in sent.split(' '):
        word_counts[w] = word_counts.get(w, 0) + 1
vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
print('preprocessed words %d -> %d' % (len(word_counts), len(vocab)))

# Word <-> index lookup tables; indices start at 1 so that 0 is free for
# padding (the Embedding layer below uses mask_zero=True).
ixtoword = {}
wordtoix = {}
ix = 1
for w in vocab:
    wordtoix[w] = ix
    ixtoword[ix] = w
    ix += 1
# +1 leaves room for the padding index 0.
vocab_size = len(ixtoword) + 1
def data_generator(descriptions, photos, wordtoix, max_length, num_photos_per_batch):
    """Endlessly yield training batches for the caption model.

    descriptions: {image_id: [caption, ...]} with startseq/endseq markers.
    photos: {filename: feature vector} of precomputed image encodings.
    Yields [[image_features, padded_input_seqs], one_hot_next_words] once
    every `num_photos_per_batch` images have been consumed.
    """
    batch_photos, batch_seqs, batch_targets = [], [], []
    photos_in_batch = 0
    # Loop forever over the images — the training loop decides when to stop.
    while True:
        for key, desc_list in descriptions.items():
            photos_in_batch += 1
            # retrieve the photo feature
            feature = photos[key + '.jpg']
            for desc in desc_list:
                # Encode the caption; out-of-vocabulary words are dropped.
                encoded = [wordtoix[word] for word in desc.split(' ') if word in wordtoix]
                # Each prefix predicts its next word: seq[:i] -> seq[i].
                for i in range(1, len(encoded)):
                    padded = pad_sequences([encoded[:i]], maxlen=max_length)[0]
                    target = to_categorical([encoded[i]], num_classes=vocab_size)[0]
                    batch_photos.append(feature)
                    batch_seqs.append(padded)
                    batch_targets.append(target)
            if photos_in_batch == num_photos_per_batch:
                yield [[array(batch_photos), array(batch_seqs)], array(batch_targets)]
                batch_photos, batch_seqs, batch_targets = [], [], []
                photos_in_batch = 0
glove_dir = '/content/ml-art-project3/data/'
# Parse the GloVe text file into {word: 200-d float32 vector}.
# Fixed: read inside a `with` block so the handle is closed even if a line
# fails to parse — the original open()/close() pair leaked it on error.
embeddings_index = {}  # empty dictionary
with open(os.path.join(glove_dir, 'glove.6B.200d.txt'), encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))
embedding_dim = 200
# Get a 200-dim dense vector for each word in our vocabulary.
# Rows default to zeros, so words missing from GloVe get an all-zero
# embedding (row 0 also stays zero for the padding index).
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in wordtoix.items():
    #if i < max_words:
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in the embedding index will be all zeros
        embedding_matrix[i] = embedding_vector
# Caption decoder: an image-feature branch and a text branch, combined with
# add() and passed through a dense layer to predict the next word.
inputs1 = Input(shape=(2048,))        # InceptionV3 feature vector
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
inputs2 = Input(shape=(max_length,))  # padded word-index sequence
se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.summary()
# Freeze the embedding layer at the pretrained GloVe weights.
# NOTE(review): layers[2] is assumed to be the Embedding layer — confirm
# against model.summary() if the architecture above ever changes.
model.layers[2].set_weights([embedding_matrix])
model.layers[2].trainable = False
model.compile(loss='categorical_crossentropy', optimizer='adam')
# Pretrained caption-model weights shipped in the project repo.
model.load_weights('/content/ml-art-project3/data/model_30.h5')
# Precomputed InceptionV3 encodings for the test images.
with open("/content/ml-art-project3/data/encoded_test_images.pkl", "rb") as encoded_pickle:
    encoding_test = pickle.load(encoded_pickle)
def Search(photo):
    """Greedily decode a caption for an encoded photo (shape (1, 2048)).

    Starts from 'startseq', repeatedly appends the most likely next word,
    and stops at 'endseq' or after max_length words. Returns the caption
    with the first and last tokens stripped.
    """
    generated = 'startseq'
    for _ in range(max_length):
        idxs = [wordtoix[w] for w in generated.split() if w in wordtoix]
        padded = pad_sequences([idxs], maxlen=max_length)
        preds = model.predict([photo, padded], verbose=0)
        next_word = ixtoword[np.argmax(preds)]
        generated += ' ' + next_word
        if next_word == 'endseq':
            break
    return ' '.join(generated.split()[1:-1])
def comment(z, print_text=False):
    """Display the z-th test image, caption it, and speak the caption.

    Looks up the z-th precomputed encoding, shows the matching image file,
    decodes a caption with Search(), capitalizes it, optionally prints it,
    and finally reads it aloud with the DeepVoice3 model.
    """
    pic = list(encoding_test.keys())[z]
    feat = encoding_test[pic].reshape((1,2048))
    img_dir = '/content/ml-art-project3/data/Flicker8k_Dataset/'
    plt.imshow(plt.imread(img_dir + pic))
    plt.show()
    caption = Search(feat) + '.'
    caption = caption[0].upper() + caption[1:]
    if print_text:
        print(caption)
    tts(model_speak, caption, figures=False)
import warnings
warnings.filterwarnings("ignore")
# Demo: caption and speak three test images, then two tongue twisters.
comment(106)
comment(8)
comment(101)
tts(model_speak, "Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick?", figures=False)
tts(model_speak, "A skunk sat on a stump and thunk the stump stunk, but the stump thunk the skunk stunk", figures=False)
from google.colab import drive
drive.mount('/gdrive')
cp /gdrive/My\ Drive/ECE\ 188\ project\ 3/glove.6B.200d.txt /content/ml-art-project3/data/