import os
import sys
import json
import time
from os.path import exists, join, basename, splitext

import numpy as np
import torch
import librosa

# The Tacotron2 and SqueezeWave repositories are expected as local clones
# next to this script; their directory names are derived from the clone URLs.
git_repo_url = 'https://github.com/NVIDIA/tacotron2.git'
project_name = splitext(basename(git_repo_url))[0]    # 'tacotron2'
git_repo_url2 = 'https://github.com/alokprasad/fastspeech_squeezewave.git'
project_name2 = splitext(basename(git_repo_url2))[0]  # 'fastspeech_squeezewave'
sys.path.append(join(project_name2, "SqueezeWave/"))
sys.path.append(project_name)

# These modules resolve inside the cloned repositories.
from hparams import create_hparams
from model import Tacotron2
from text import text_to_sequence
from denoiser import Denoiser
from glow import SqueezeWave
# Build an uppercase word -> ARPAbet pronunciation lookup from the dictionary
# file. The file is read in reverse so that entries nearer the top win.
thisdict = {}
for line in reversed(open('merged.dict_1.1.txt', "r").read().splitlines()):
    word, pron = line.split(" ", 1)
    thisdict[word] = pron.strip()
def ARPA(text):
    """Replace each word with its {ARPAbet} pronunciation where the dictionary has one."""
    out = ''
    for word_ in text.split(" "):
        word = word_
        end_chars = ''
        # Peel trailing punctuation off the word so it can be looked up,
        # then re-append it after the substitution.
        while len(word) > 1 and word[-1] in "!?,.;":
            end_chars = word[-1] + end_chars
            word = word[:-1]
        try:
            word_arpa = thisdict[word.upper()]
        except KeyError:
            word_arpa = ''
        if len(word_arpa) != 0:
            word = "{" + str(word_arpa) + "}"
        out = (out + " " + word + end_chars).strip()
    if out[-1] != ";":
        out = out + ";"  # ensure the line ends with a ';' terminator
    return out
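# Example (pronunciations here are illustrative; actual output depends on
# merged.dict_1.1.txt):
#   ARPA("Hello world.")  ->  "{HH AH0 L OW1} {W ER1 L D}.;"
# Words missing from the dictionary pass through unchanged.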
#torch.set_grad_enabled(False)

# Inference parameters
hparams = create_hparams()
hparams.sampling_rate = 22050
hparams.max_decoder_steps = 3000  # decoder steps before generation is cut off; too many and you may run out of memory
hparams.gate_threshold = 0.30     # model must be 30% sure the clip is over before ending generation

# Load the pretrained Tacotron2 checkpoint (on the CPU, as written)
tacotron2_pretrained_model = 'tacotron.pt'
model = Tacotron2(hparams)
checkpoint = torch.load(tacotron2_pretrained_model, map_location=torch.device('cpu'))
model.load_state_dict(checkpoint['state_dict'])
_ = model.eval()
print("This Tacotron model has been trained for", checkpoint['iteration'], "iterations.")
# Load the SqueezeWave vocoder (kept in a variable named waveglow because
# SqueezeWave exposes the same interface as WaveGlow)
waveglow_pretrained_model = 'squeezewave_dict.pt'
with open(join(project_name2, 'SqueezeWave/configs/config_a128_c256.json')) as f:
    config = json.load(f)
waveglow = SqueezeWave(**config['squeezewave_config'])
waveglow.load_state_dict(torch.load(waveglow_pretrained_model, map_location=torch.device('cpu')), strict=False)
waveglow = waveglow.remove_weightnorm(waveglow)  # fold weight normalization into the weights for inference
waveglow.eval()
for k in waveglow.convinv:
    k.float()
denoiser = Denoiser(waveglow)
print("SqueezeWave model loaded")
# All right, I've been thinking. , When life gives you lemons? , Don't make lemonade. , Make life take the lemons back! , Get mad! , 'I don't want your damn lemons! What am I supposed to do with these?' , Demand to see life's manager! , Make life rue the day it thought it could give Cave Johnson lemons! , Do you know who I am? , I'm the man who's going to burn your house down! , With the lemons! , I'm going to get my engineers to invent a combustible lemon that burns your house down!
text = """
Peter Piper picked a peck of pickled peppers, A peck of pickled peppers Peter Piper picked; If Peter Piper picked a peck of pickled peppers, where’s the peck of pickled peppers Peter Piper picked?
She sells sea shells by the seashore, The shells she sells are sea shells, I’m sure. So if she sells sea shells on the seashore, Then I’m sure she sells seashore shells.
"""
sigma = 0.75             # vocoder sampling temperature (standard deviation of the latent noise)
denoise_strength = 0.01
raw_input = False        # disables automatic ARPAbet conversion; useful for inputting your own ARPAbet pronunciations or just for testing
counter = 0
for i in text.split("\n"):
    start_time = time.time()
    if len(i) < 1: continue
    print(i)
    if raw_input:
        if i[-1] != ";": i = i + ";"
    else:
        i = ARPA(i)
    print(i)
    with torch.no_grad():  # save memory by not tracking gradients
        sequence = np.array(text_to_sequence(i, ['english_cleaners']))[None, :]
        sequence = torch.from_numpy(sequence).long()  # torch.autograd.Variable is deprecated; a plain tensor suffices
        mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
        audio = waveglow.infer(mel_outputs_postnet, sigma=sigma)
        audio_denoised = denoiser(audio, strength=denoise_strength)[:, 0]
        print("Denoised")
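        # Note: librosa.output.write_wav was removed in librosa 0.8. On newer
        # installs, an equivalent write (assuming the soundfile package) would be:
        #   import soundfile as sf
        #   sf.write('Inf_%d_denoised.wav' % counter, audio_denoised.cpu().numpy().flatten(), hparams.sampling_rate)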
        # librosa.output.write_wav('Inf_' + str(counter) + '.wav', np.swapaxes(audio.cpu().numpy(), 0, 1), hparams.sampling_rate)
        librosa.output.write_wav('Inf_' + str(counter) + '_denoised.wav', np.swapaxes(audio_denoised.cpu().numpy(), 0, 1), hparams.sampling_rate)
    counter += 1
    print("--- %s seconds ---" % (time.time() - start_time))