import sys
from os.path import join, basename, splitext

git_repo_url = 'https://github.com/NVIDIA/tacotron2.git'
project_name = splitext(basename(git_repo_url))[0]
git_repo_url2 = 'https://github.com/alokprasad/fastspeech_squeezewave.git'
project_name2 = splitext(basename(git_repo_url2))[0]
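
# NOTE: this assumes both repositories above have already been cloned into the
# working directory (e.g. `git clone https://github.com/NVIDIA/tacotron2.git`);
# nothing in this script clones them.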

# Make the cloned repos importable before loading their modules.
sys.path.append(join(project_name2, "SqueezeWave/"))
sys.path.append(project_name)

import numpy as np
import torch

# Modules from the cloned repos (tacotron2: hparams/model/text; SqueezeWave: denoiser/glow).
from hparams import create_hparams
from model import Tacotron2
from text import text_to_sequence
from denoiser import Denoiser
from glow import SqueezeWave
import soundfile as sf  # librosa.output.write_wav was removed in librosa 0.8; soundfile replaces it
import json

# Build a word -> ARPAbet lookup from a CMUdict-style file ("WORD PRONUNCIATION"
# per line); iterating in reverse keeps the first-listed pronunciation of duplicates.
thisdict = {}
for line in reversed(open('merged.dict_1.1.txt', "r").read().splitlines()):
    word, pron = line.split(" ", 1)
    thisdict[word] = pron.strip()

def ARPA(text):
    """Replace every word that has a dictionary entry with its {ARPAbet} form."""
    out = ''
    for word in text.split(" "):
        # Peel trailing punctuation off so the dictionary lookup can match.
        end_chars = ''
        while len(word) > 1 and word[-1] in "!?,.;":
            end_chars = word[-1] + end_chars
            word = word[:-1]
        word_arpa = thisdict.get(word.upper(), '')
        if word_arpa:
            word = "{" + word_arpa + "}"
        out = (out + " " + word + end_chars).strip()
    if not out.endswith(";"):  # ensure a trailing ";" (matches the raw_input branch below)
        out += ";"
    return out
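
# Example (assuming merged.dict_1.1.txt carries the standard CMUdict entries):
#   ARPA("Hello world.")  ->  "{HH AH0 L OW1} {W ER1 L D}.;"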

# torch.set_grad_enabled(False)

# Initialize Tacotron2 with the pretrained model
tacotron2_pretrained_model = 'tacotron.pt'

# Set up parameters
hparams = create_hparams()
hparams.sampling_rate = 22050
hparams.max_decoder_steps = 3000  # steps before generation is cut off; too many may cause CUDA memory errors
hparams.gate_threshold = 0.30     # model must be 30% sure the clip is over before ending generation

# Load the Tacotron2 checkpoint (map_location makes this work on CPU-only machines)
model = Tacotron2(hparams)
checkpoint = torch.load(tacotron2_pretrained_model, map_location=torch.device('cpu'))
model.load_state_dict(checkpoint['state_dict'])
_ = model.eval()
print("This Tacotron model has been trained for", checkpoint['iteration'], "iterations.")

# Load the SqueezeWave vocoder (a lightweight WaveGlow variant)
waveglow_pretrained_model = 'squeezewave_dict.pt'
with open(join(project_name2, 'SqueezeWave/configs/config_a128_c256.json')) as f:
    config = json.load(f)
waveglow = SqueezeWave(**config['squeezewave_config'])
waveglow.load_state_dict(torch.load(waveglow_pretrained_model, map_location=torch.device('cpu')), strict=False)
waveglow = waveglow.remove_weightnorm(waveglow)
waveglow.eval()
for k in waveglow.convinv:
    k.float()  # keep the invertible 1x1 convolutions in full precision
denoiser = Denoiser(waveglow)
print("SqueezeWave model loaded")

import time

# Alternate sample text: All right, I've been thinking. , When life gives you lemons? , Don't make lemonade. , Make life take the lemons back! , Get mad! , 'I don't want your damn lemons! What am I supposed to do with these?' , Demand to see life's manager! , Make life rue the day it thought it could give Cave Johnson lemons! , Do you know who I am? , I'm the man who's going to burn your house down! , With the lemons! , I'm going to get my engineers to invent a combustible lemon that burns your house down!
text = """
Peter Piper picked a peck of pickled peppers, A peck of pickled peppers Peter Piper picked; If Peter Piper picked a peck of pickled peppers, where’s the peck of pickled peppers Peter Piper picked?
She sells sea shells by the seashore, The shells she sells are sea shells, I’m sure. So if she sells sea shells on the seashore, Then I’m sure she sells seashore shells.
"""
sigma = 0.75             # std-dev of the vocoder's Gaussian prior; lower is cleaner but can sound flatter
denoise_strength = 0.01  # fraction of the estimated bias noise to subtract
raw_input = False        # disables automatic ARPAbet conversion, useful for inputting your own ARPAbet pronunciations or just for testing

counter = 0
for i in text.split("\n"):
    start_time = time.time()
    if len(i) < 1:
        continue
    print(i)
    if raw_input:
        if i[-1] != ";":
            i = i + ";"
    else:
        i = ARPA(i)
    print(i)
    with torch.no_grad():  # save VRAM by not tracking gradients
        sequence = np.array(text_to_sequence(i, ['english_cleaners']))[None, :]
        sequence = torch.from_numpy(sequence).long()  # torch.autograd.Variable is deprecated
        mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
        audio = waveglow.infer(mel_outputs_postnet, sigma=sigma)
        audio_denoised = denoiser(audio, strength=denoise_strength)[:, 0]
        print("Denoised")
        # sf.write('Inf_' + str(counter) + '.wav', np.swapaxes(audio.cpu().numpy(), 0, 1), hparams.sampling_rate)
        sf.write('Inf_' + str(counter) + '_denoised.wav', np.swapaxes(audio_denoised.cpu().numpy(), 0, 1), hparams.sampling_rate)
    counter += 1
    print("--- %s seconds ---" % (time.time() - start_time))