import sys
from os.path import join, basename, splitext

git_repo_url = 'https://github.com/NVIDIA/tacotron2.git'
project_name = splitext(basename(git_repo_url))[0]
git_repo_url2 = 'https://github.com/alokprasad/fastspeech_squeezewave.git'
project_name2 = splitext(basename(git_repo_url2))[0]
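
# NOTE: this assumes both repositories above have already been cloned into the
# working directory (e.g. `git clone https://github.com/NVIDIA/tacotron2.git`);
# nothing in this script clones them.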

# Make the cloned repos importable before loading their modules.
sys.path.append(join(project_name2, "SqueezeWave/"))
sys.path.append(project_name)

import numpy as np
import torch

# Modules from the cloned repos (tacotron2: hparams/model/text; SqueezeWave: denoiser/glow).
from hparams import create_hparams
from model import Tacotron2
from text import text_to_sequence
from denoiser import Denoiser
from glow import SqueezeWave
import soundfile as sf  # librosa.output.write_wav was removed in librosa 0.8; soundfile replaces it
import json

# Build a word -> ARPAbet lookup from a CMUdict-style file ("WORD PRONUNCIATION"
# per line); iterating in reverse keeps the first-listed pronunciation of duplicates.
thisdict = {}
for line in reversed(open('merged.dict_1.1.txt', "r").read().splitlines()):
    word, pron = line.split(" ", 1)
    thisdict[word] = pron.strip()

def ARPA(text):
    """Replace every word that has a dictionary entry with its {ARPAbet} form."""
    out = ''
    for word in text.split(" "):
        # Peel trailing punctuation off so the dictionary lookup can match.
        end_chars = ''
        while len(word) > 1 and word[-1] in "!?,.;":
            end_chars = word[-1] + end_chars
            word = word[:-1]
        word_arpa = thisdict.get(word.upper(), '')
        if word_arpa:
            word = "{" + word_arpa + "}"
        out = (out + " " + word + end_chars).strip()
    if not out.endswith(";"):  # ensure a trailing ";" (matches the raw_input branch below)
        out += ";"
    return out
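
# Example (assuming merged.dict_1.1.txt carries the standard CMUdict entries):
#   ARPA("Hello world.")  ->  "{HH AH0 L OW1} {W ER1 L D}.;"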

# torch.set_grad_enabled(False)

# Initialize Tacotron2 with the pretrained model
tacotron2_pretrained_model = 'tacotron.pt'

# Set up parameters
hparams = create_hparams()
hparams.sampling_rate = 22050
hparams.max_decoder_steps = 3000  # steps before generation is cut off; too many may cause CUDA memory errors
hparams.gate_threshold = 0.30     # model must be 30% sure the clip is over before ending generation

# Load the Tacotron2 checkpoint (map_location makes this work on CPU-only machines)
model = Tacotron2(hparams)
checkpoint = torch.load(tacotron2_pretrained_model, map_location=torch.device('cpu'))
model.load_state_dict(checkpoint['state_dict'])
_ = model.eval()
print("This Tacotron model has been trained for", checkpoint['iteration'], "iterations.")

# Load the SqueezeWave vocoder (a lightweight WaveGlow variant)
waveglow_pretrained_model = 'squeezewave_dict.pt'
with open(join(project_name2, 'SqueezeWave/configs/config_a128_c256.json')) as f:
    config = json.load(f)
waveglow = SqueezeWave(**config['squeezewave_config'])
waveglow.load_state_dict(torch.load(waveglow_pretrained_model, map_location=torch.device('cpu')), strict=False)
waveglow = waveglow.remove_weightnorm(waveglow)
waveglow.eval()
for k in waveglow.convinv:
    k.float()  # keep the invertible 1x1 convolutions in full precision
denoiser = Denoiser(waveglow)
print("SqueezeWave model loaded")

import time

# Alternate sample text: All right, I've been thinking. , When life gives you lemons? , Don't make lemonade. , Make life take the lemons back! , Get mad! , 'I don't want your damn lemons! What am I supposed to do with these?' , Demand to see life's manager! , Make life rue the day it thought it could give Cave Johnson lemons! , Do you know who I am? , I'm the man who's going to burn your house down! , With the lemons! , I'm going to get my engineers to invent a combustible lemon that burns your house down!
text = """
Peter Piper picked a peck of pickled peppers, A peck of pickled peppers Peter Piper picked; If Peter Piper picked a peck of pickled peppers, where’s the peck of pickled peppers Peter Piper picked?
She sells sea shells by the seashore, The shells she sells are sea shells, I’m sure. So if she sells sea shells on the seashore, Then I’m sure she sells seashore shells.
"""
sigma = 0.75             # std-dev of the vocoder's Gaussian prior; lower is cleaner but can sound flatter
denoise_strength = 0.01  # fraction of the estimated bias noise to subtract
raw_input = False        # disables automatic ARPAbet conversion, useful for inputting your own ARPAbet pronunciations or just for testing

counter = 0
for i in text.split("\n"):
    start_time = time.time()
    if len(i) < 1:
        continue
    print(i)
    if raw_input:
        if i[-1] != ";":
            i = i + ";"
    else:
        i = ARPA(i)
    print(i)
    with torch.no_grad():  # save VRAM by not tracking gradients
        sequence = np.array(text_to_sequence(i, ['english_cleaners']))[None, :]
        sequence = torch.from_numpy(sequence).long()  # torch.autograd.Variable is deprecated
        mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
        audio = waveglow.infer(mel_outputs_postnet, sigma=sigma)
        audio_denoised = denoiser(audio, strength=denoise_strength)[:, 0]
        print("Denoised")
        # sf.write('Inf_' + str(counter) + '.wav', np.swapaxes(audio.cpu().numpy(), 0, 1), hparams.sampling_rate)
        sf.write('Inf_' + str(counter) + '_denoised.wav', np.swapaxes(audio_denoised.cpu().numpy(), 0, 1), hparams.sampling_rate)
    counter += 1
    print("--- %s seconds ---" % (time.time() - start_time))