import os
from os.path import exists, join, basename, splitext

git_repo_url = 'https://github.com/NVIDIA/tacotron2.git'
project_name = splitext(basename(git_repo_url))[0]
git_repo_url2 = 'https://github.com/alokprasad/fastspeech_squeezewave.git'
project_name2 = splitext(basename(git_repo_url2))[0]

import sys

# The project-local imports further down (hparams, model, text, denoiser,
# glow) are expected to resolve from these two directories, so the path
# entries must be added before those imports run.
sys.path.append(join(project_name2, "SqueezeWave/"))
sys.path.append(project_name)

import numpy as np
import torch

from hparams import create_hparams
from model import Tacotron2
from text import text_to_sequence
from denoiser import Denoiser
from glow import SqueezeWave
import librosa
import json

# Punctuation that ARPA() peels off the end of a word before the lookup.
_TRAILING_PUNCTUATION = "!?,.;"


def _load_pronunciation_dict(path):
    """Read a ``WORD<space>PRONUNCIATION`` file into a dict.

    Args:
        path: Path of the text file; each line is split on its first space.

    Returns:
        dict mapping the text before the first space to the stripped text
        after it. Lines that contain no space are skipped.
    """
    with open(path, "r") as handle:
        lines = handle.read().splitlines()

    entries = {}
    # Walk the file backwards so that, when a word appears on several lines,
    # the entry closest to the top of the file is written last and wins.
    for line in reversed(lines):
        word, separator, pronunciation = line.partition(" ")
        if not separator:
            continue
        entries[word] = pronunciation.strip()
    return entries


thisdict = _load_pronunciation_dict('merged.dict_1.1.txt')


def ARPA(text):
    """Replace every word of *text* that is in ``thisdict`` by ``{pronunciation}``.

    The text is split on single spaces. For each word, any run of trailing
    ``!?,.;`` characters is set aside, the remaining word is upper-cased and
    looked up in ``thisdict``, and the punctuation is re-attached afterwards.
    Words without a dictionary entry are passed through unchanged.

    Args:
        text: One line of input text.

    Returns:
        The converted line, words joined by single spaces, always ending in
        ``";"`` (for empty or whitespace-only input the result is ``";"``).
    """
    converted = []
    for token in text.split(" "):
        if not token:
            # Produced by leading/trailing/double spaces; contributes nothing.
            continue

        word = token.rstrip(_TRAILING_PUNCTUATION)
        if not word:
            # Token made only of punctuation: keep its first character as the
            # "word" so the lookup below never sees an empty string.
            word = token[:1]
        end_chars = token[len(word):]

        word_arpa = thisdict.get(word.upper(), '')
        if word_arpa:
            word = "{" + word_arpa + "}"
        converted.append(word + end_chars)

    out = " ".join(converted).strip()
    # The caller appends the same terminator in its raw-input branch;
    # presumably the model expects it at the end of every utterance -
    # TODO confirm.
    if not out.endswith(";"):
        out = out + ";"
    return out


#torch.set_grad_enabled(False)

# initialize Tacotron2 with the pretrained model
tacotron2_pretrained_model = 'tacotron.pt'

# Setup Parameters
hparams = create_hparams()
hparams.sampling_rate = 22050
hparams.max_decoder_steps = 3000 # how many steps before cutting off generation, too many and you may get CUDA errors.
hparams.gate_threshold = 0.30 # Model must be 30% sure the clip is over before ending generation

# Load the Tacotron2 model; the checkpoint tensors are mapped onto the CPU.
model = Tacotron2(hparams)
# Read the checkpoint once and reuse it for both the weights and the
# iteration count instead of deserialising the file twice.
tacotron2_checkpoint = torch.load(tacotron2_pretrained_model, map_location=torch.device('cpu'))
model.load_state_dict(tacotron2_checkpoint['state_dict'])
_ = model.eval()
print("This Tacotron model has been trained for ",tacotron2_checkpoint['iteration']," Iterations.")
del tacotron2_checkpoint  # the weights now live in `model`; free the raw checkpoint

# Load the SqueezeWave vocoder (kept under the historical name `waveglow`).
waveglow_pretrained_model = 'squeezewave_dict.pt'
with open(join(project_name2, 'SqueezeWave/configs/config_a128_c256.json')) as f:
    config = json.load(f)
waveglow = SqueezeWave(**config['squeezewave_config'])
# map_location matches the Tacotron2 load above so a checkpoint saved on a
# GPU can still be read on a CPU-only machine.
# NOTE(review): strict=False silently ignores missing/unexpected keys -
# confirm that is intended for this checkpoint.
waveglow.load_state_dict(
    torch.load(waveglow_pretrained_model, map_location=torch.device('cpu')),
    strict=False,
)
waveglow = waveglow.remove_weightnorm(waveglow)
waveglow.eval()
for k in waveglow.convinv:
    k.float()
denoiser = Denoiser(waveglow)
print("SqueezeWave model loaded")

import time

# All right, I've been thinking. , When life gives you lemons? , Don't make lemonade. , Make life take the lemons back! , Get mad! , 'I don't want your damn lemons! What am I supposed to do with these?' , Demand to see life's manager! , Make life rue the day it thought it could give Cave Johnson lemons! , Do you know who I am? , I'm the man who's going to burn your house down! , With the lemons! , I'm going to get my engineers to invent a combustible lemon that burns your house down!

# Input text. Every newline-separated line below is synthesised as its own
# utterance and written to its own WAV file.
text = """ Peter Piper picked a peck of pickled peppers, A peck of pickled peppers Peter Piper picked; If Peter Piper picked a peck of pickled peppers, where’s the peck of pickled peppers Peter Piper picked? She sells sea shells by the seashore, The shells she sells are sea shells, I’m sure. So if she sells sea shells on the seashore, Then I’m sure she sells seashore shells. 
"""
sigma = 0.75
denoise_strength = 0.01
raw_input = False # disables automatic ARPAbet conversion, useful for inputting your own ARPAbet pronounciations or just for testing

counter = 0  # numbers the output files; advances only for lines that are synthesised
for line in text.split("\n"):
    start_time = time.time()

    # Skip blank and whitespace-only lines: they carry nothing to synthesise.
    line = line.strip()
    if not line:
        continue
    print(line)

    if raw_input:
        # The line is used as typed; only the trailing ";" is enforced.
        if not line.endswith(";"):
            line = line + ";"
    else:
        line = ARPA(line)
    print(line)

    with torch.no_grad():  # inference only: skip gradient bookkeeping to save memory
        # [None, :] adds a leading axis so the sequence is passed as a batch of one.
        sequence = np.array(text_to_sequence(line, ['english_cleaners']))[None, :]
        sequence = torch.from_numpy(sequence).long()
        mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)

        audio = waveglow.infer(mel_outputs_postnet, sigma=sigma)
        print("")
        audio_denoised = denoiser(audio, strength=denoise_strength)[:, 0]
        print("Denoised")

        # NOTE(review): `librosa.output` was removed in librosa 0.8.0, so this
        # call needs an older librosa - confirm the pinned version.
        # librosa.output.write_wav('Inf_' + str(counter) + '.wav', np.swapaxes(audio.cpu().numpy(),0,1), hparams.sampling_rate)
        librosa.output.write_wav('Inf_' + str(counter) + '_denoised.wav', np.swapaxes(audio_denoised.cpu().numpy(),0,1), hparams.sampling_rate)
        counter += 1

    print("--- %s seconds ---" % (time.time() - start_time))