Initial commit

4 years ago · 070dcc229d
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,3 @@
 *.pt
 env/
 *.wav
--- a/.gitmodules
+++ b/.gitmodules
@ -0,0 +1,6 @@
 [submodule "fastspeech_squeezewave"]
 	path = fastspeech_squeezewave
 	url = https://git.technicalincompetence.club/dan/fastspeech_squeezewave
 [submodule "tacotron2"]
 	path = tacotron2
 	url = https://git.technicalincompetence.club/dan/tacotron2
--- a/+ 29
+++ b/+ 29
@ -0,0 +1,29 @@
 # Image for running synthesize.py: a Python 3.7 base with the Tacotron2 sources
 # and the SqueezeWave vocoder sources laid out under /project.
 FROM python:3.7

 WORKDIR /project
 # git and build tooling, plus distro-packaged builds of the scientific Python
 # stack (scipy, numpy, sklearn, llvmlite, numba, ...).
 RUN apt-get update && apt-get install -y git libatlas-base-dev gfortran python3-grpcio python3-scipy python3-numpy python3-sklearn python3-llvmlite python3-cffi python3-audioread python3-numba

 # Adds the distro dist-packages directory to PYTHONPATH -- presumably so the
 # apt-installed python3-* packages above are importable from the image's Python.
 ENV PYTHONPATH "${PYTHONPATH}":"/usr/lib/python3/dist-packages/"

 # Clone upstream Tacotron2 (with submodules) and fastspeech_squeezewave, then
 # replace tacotron2/waveglow with the SqueezeWave directory and delete the rest
 # of the fastspeech_squeezewave clone.
 # NOTE(review): synthesize.py adds "fastspeech_squeezewave/SqueezeWave/" to
 # sys.path and reads its config from that directory, which no longer exists
 # after the mv/rm below -- confirm which layout is intended.
 RUN git clone -q --recursive https://github.com/NVIDIA/tacotron2.git
 RUN git clone https://github.com/alokprasad/fastspeech_squeezewave
 RUN rm -rf tacotron2/waveglow
 RUN mv fastspeech_squeezewave/SqueezeWave tacotron2/waveglow
 RUN rm -rf fastspeech_squeezewave
 # Model files are not copied into the image (both COPY lines below are disabled).
 #COPY MLPTTS .
 #COPY squeezewave_dict.pt .
 # The inference script and the pronunciation dictionary it opens at start-up.
 COPY synthesize.py .
 COPY merged.dict_1.1.txt .

 # Declare the platform build args so the RUN steps below can read them
 # (presumably populated automatically by BuildKit/buildx -- confirm), and
 # record both values in /log.
 ARG TARGETPLATFORM
 ARG BUILDPLATFORM
 RUN echo "I am running on $BUILDPLATFORM, building for $TARGETPLATFORM" > /log
 # Prebuilt aarch64 wheels; both are copied into the image on every platform.
 # The tensorflow wheel is never installed or removed because its RUN line is disabled.
 COPY arm_wheels/tensorflow-1.14.0-cp37-none-linux_aarch64.whl .
 COPY arm_wheels/torch-1.4.0a0+7f73f1d-cp37-cp37m-linux_aarch64.whl .
 #RUN if [ "x$TARGETPLATFORM" = "xlinux/arm64" ] ; then pip install tensorflow-1.14.0-cp37-none-linux_aarch64.whl; rm tensorflow-1.14.0-cp37-none-linux_aarch64.whl; else pip install tensorflow==1.14.0; fi
 # linux/arm64: install the local torch wheel and delete it; any other target:
 # install an unpinned torch with pip.
 RUN if [ "x$TARGETPLATFORM" = "xlinux/arm64" ] ; then pip install torch-1.4.0a0+7f73f1d-cp37-cp37m-linux_aarch64.whl; rm torch-1.4.0a0+7f73f1d-cp37-cp37m-linux_aarch64.whl; else pip install torch; fi
 #RUN pip install tensorflow==1.14.0

 #COPY tensorflow-1.14.0-cp37-none-linux_armv7l.whl .

 # Remaining Python dependencies, installed into the user site-packages.
 RUN python3 -m pip install --user librosa==0.6.0 unidecode
--- a/README.md
+++ b/README.md
@ -0,0 +1,16 @@
 # Tia TTS

 Experimental text-to-speech (TTS) for CPU inference using Tacotron2 and SqueezeWave.

 ## Install
 Initialize the submodules:
 `git submodule update --init --recursive`

 Install the python dependencies:
 `pip install -r requirements.txt`

 Copy your models into the project directory. This project was built around 22 kHz Tacotron2 and SqueezeWave models.
 SqueezeWave is loaded from a state_dict. This lets us reuse the pretrained models provided by the paper's author while staying compatible with the tweaked architecture, which enables denoising without having to retrain the vocoder.

 Run the project:
 `python synthesize.py`
--- a/+ 1
+++ b/+ 1
@ -0,0 +1 @@
 Subproject commit 839ea13617a2715e9e6f4f059fbf0dd341357de4
--- a/merged.dict_1.1.txt
+++ b/merged.dict_1.1.txt
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,10 @@
 librosa==0.6.0
 inflect
 numpy
 scikit-learn
 scipy
 six
 tensorflow==1.15.0
 torch==1.5.1
 Unidecode==1.1.1
 numba==0.48.0
--- a/synthesize.py
+++ b/synthesize.py
@ -0,0 +1,106 @@
 import os
 from os.path import exists, join, basename, splitext

 # Upstream repository URLs. Nothing is downloaded here: only the basename of
 # each URL (without ".git") is used, as the name of a local directory --
 # "tacotron2" and "fastspeech_squeezewave" respectively.
 git_repo_url = 'https://github.com/NVIDIA/tacotron2.git'
 project_name = splitext(basename(git_repo_url))[0]
 git_repo_url2 = 'https://github.com/alokprasad/fastspeech_squeezewave.git'
 project_name2 = splitext(basename(git_repo_url2))[0]

 import sys
 # Put the SqueezeWave and Tacotron2 source directories on the module search
 # path (relative to the current working directory) -- presumably so the
 # hparams/model/text/denoiser/glow imports that follow can be resolved.
 sys.path.append(join(project_name2, "SqueezeWave/"))
 sys.path.append(project_name)
 import numpy as np
 import torch

 from hparams import create_hparams
 from model import Tacotron2
 from text import text_to_sequence
 from denoiser import Denoiser
 from glow import SqueezeWave
 import librosa
 import json

 def _load_pronunciations(path):
     """Read a pronunciation dictionary file into a dict.

     Each line is expected to look like ``WORD PHONEME PHONEME ...``: the text
     before the first space is the key and the rest, stripped of surrounding
     whitespace, is the value. When a word appears more than once, the entry
     closest to the top of the file wins. Lines that contain no space carry no
     pronunciation and are skipped.

     Args:
         path: Path of the dictionary file.

     Returns:
         A dict mapping each word to its pronunciation string.
     """
     # The context manager closes the file handle once the contents are read.
     with open(path, "r") as dict_file:
         lines = dict_file.read().splitlines()

     pronunciations = {}
     # Walk the file bottom-up so earlier lines overwrite later duplicates.
     for line in reversed(lines):
         word, separator, pronunciation = line.partition(" ")
         if not separator:
             # Blank or malformed line: nothing to map, and indexing the
             # missing second field would raise IndexError.
             continue
         pronunciations[word] = pronunciation.strip()
     return pronunciations


 # Word -> ARPAbet lookup table, loaded once when the script starts.
 thisdict = _load_pronunciations('merged.dict_1.1.txt')
 def ARPA(text, pronunciations=None):
     """Replace every known word of *text* with its ARPAbet pronunciation.

     The text is split on single spaces. For each token, trailing punctuation
     (any run of ``! ? , . ;`` in any order) is set aside, the remaining word
     is looked up in upper case, and a hit is replaced by ``{PRONUNCIATION}``.
     Unknown words are kept unchanged. The punctuation is re-attached, tokens
     are re-joined with single spaces, and the result always ends with ``;``.

     Args:
         text: Plain input text for one utterance.
         pronunciations: Optional mapping from upper-case word to ARPAbet
             string. Defaults to the module-level ``thisdict``.

     Returns:
         The converted text, terminated by ``;``. Empty or whitespace-only
         input yields ``";"``.
     """
     if pronunciations is None:
         pronunciations = thisdict
     punctuation = "!?,.;"

     out = ''
     for token in text.split(" "):
         word = token
         end_chars = ''
         # Peel punctuation off the end one character at a time. At least one
         # character is always left in `word`, so a token made only of
         # punctuation can never be reduced to an empty string.
         while len(word) > 1 and word[-1] in punctuation:
             end_chars = word[-1] + end_chars
             word = word[:-1]
         word_arpa = pronunciations.get(word.upper(), '')
         if word_arpa:
             word = "{" + str(word_arpa) + "}"
         # strip() collapses the gaps left by empty tokens (repeated spaces).
         out = (out + " " + word + end_chars).strip()
     # endswith() is safe on an empty string, unlike indexing out[-1].
     if not out.endswith(";"):
         out = out + ";"
     return out

 #torch.set_grad_enabled(False)

 # ---- Tacotron2 -------------------------------------------------------------
 tacotron2_pretrained_model = 'tacotron.pt'
 # Setup Parameters
 hparams = create_hparams()
 hparams.sampling_rate = 22050
 hparams.max_decoder_steps = 3000 # upper bound on decoder steps before generation is cut off
 hparams.gate_threshold = 0.30 # Model must be 30% sure the clip is over before ending generation
 # Read the checkpoint from disk once, mapped onto the CPU, and use the same
 # object for both the weights and the iteration counter.
 tacotron2_checkpoint = torch.load(tacotron2_pretrained_model, map_location=torch.device('cpu'))
 model = Tacotron2(hparams)
 model.load_state_dict(tacotron2_checkpoint['state_dict'])
 model.eval()
 print("This Tacotron model has been trained for ",tacotron2_checkpoint['iteration']," Iterations.")
 # The weights now live in `model`; drop the raw checkpoint before loading the vocoder.
 del tacotron2_checkpoint

 # ---- SqueezeWave vocoder ----------------------------------------------------
 # Full-model checkpoint name. It is not read below: the vocoder is built from
 # its JSON config and populated from a plain state_dict file instead of
 # unpickling the whole model object.
 waveglow_pretrained_model = 'squeezewave.pt'
 with open(join(project_name2, 'SqueezeWave/configs/config_a128_c256.json')) as f:
     config = json.load(f)
 waveglow = SqueezeWave(**config['squeezewave_config'])
 # map_location keeps the load working on a CPU-only machine even if the
 # state_dict was saved from a GPU. strict=False tolerates keys that differ
 # between the saved state_dict and this build of the architecture.
 waveglow.load_state_dict(torch.load('squeezewave_dict.pt', map_location=torch.device('cpu')), strict=False)
 waveglow = waveglow.remove_weightnorm(waveglow)
 waveglow.eval()
 # Cast every module in waveglow.convinv to float32.
 for k in waveglow.convinv:
     k.float()
 denoiser = Denoiser(waveglow)
 print("SqueezeWave model loaded")

 import time

 # All right, I've been thinking. , When life gives you lemons? , Don't make lemonade. , Make life take the lemons back! , Get mad! , 'I don't want your damn lemons! What am I supposed to do with these?' , Demand to see life's manager! , Make life rue the day it thought it could give Cave Johnson lemons! , Do you know who I am? , I'm the man who's going to burn your house down! , With the lemons! , I'm going to get my engineers to invent a combustible lemon that burns your house down!
 # Text to synthesize: each non-blank line becomes one utterance and one .wav file.
 text = """
 Peter Piper picked a peck of pickled peppers, A peck of pickled peppers Peter Piper picked; If Peter Piper picked a peck of pickled peppers, where’s the peck of pickled peppers Peter Piper picked?
 She sells sea shells by the seashore, The shells she sells are sea shells, I’m sure. So if she sells sea shells on the seashore, Then I’m sure she sells seashore shells.
 """
 sigma = 0.75
 denoise_strength = 0.01
 raw_input = False # disables automatic ARPAbet conversion, useful for inputting your own ARPAbet pronunciations or just for testing

 # Index used in the output file names; advances only for lines that are synthesized.
 counter = 0
 for line in text.split("\n"):
     # Skip blank lines, including whitespace-only ones, which would otherwise
     # reach ARPA() and produce an empty utterance.
     if not line.strip():
         continue
     start_time = time.time()
     print(line)
     if raw_input:
         # Raw mode: the text is used as typed; only the ";" terminator is added.
         if line[-1] != ";":
             line = line + ";"
     else:
         line = ARPA(line)
     print(line)
     with torch.no_grad(): # inference only, no gradient tracking needed
         sequence = np.array(text_to_sequence(line, ['english_cleaners']))[None, :]
         # torch.from_numpy already returns a tensor; the deprecated
         # torch.autograd.Variable wrapper is unnecessary.
         sequence = torch.from_numpy(sequence).long()
         mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
         audio = waveglow.infer(mel_outputs_postnet, sigma=sigma)
         print("")
         audio_denoised = denoiser(audio, strength=denoise_strength)[:, 0]
         print("Denoised")
         # librosa.output.write_wav('Inf_' + str(counter) + '.wav', np.swapaxes(audio.cpu().numpy(),0,1), hparams.sampling_rate)
         librosa.output.write_wav('Inf_' + str(counter) + '_denoised.wav', np.swapaxes(audio_denoised.cpu().numpy(),0,1), hparams.sampling_rate)
         counter += 1
     print("--- %s seconds ---" % (time.time() - start_time))
--- a/+ 1
+++ b/+ 1
@ -0,0 +1 @@
 Subproject commit 185cd24e046cc1304b4f8e564734d2498c6e2e6f