Browse Source

Initial commit

mistress
Daniel Muckerman 3 years ago
commit
070dcc229d
9 changed files with 279650 additions and 0 deletions
  1. +3
    -0
      .gitignore
  2. +6
    -0
      .gitmodules
  3. +29
    -0
      Dockerfile
  4. +16
    -0
      README.md
  5. +1
    -0
      fastspeech_squeezewave
  6. +279478
    -0
      merged.dict_1.1.txt
  7. +10
    -0
      requirements.txt
  8. +106
    -0
      synthesize.py
  9. +1
    -0
      tacotron2

+ 3
- 0
.gitignore View File

@ -0,0 +1,3 @@
*.pt
env/
*.wav

+ 6
- 0
.gitmodules View File

@ -0,0 +1,6 @@
[submodule "fastspeech_squeezewave"]
path = fastspeech_squeezewave
url = https://git.technicalincompetence.club/dan/fastspeech_squeezewave
[submodule "tacotron2"]
path = tacotron2
url = https://git.technicalincompetence.club/dan/tacotron2

+ 29
- 0
Dockerfile View File

@ -0,0 +1,29 @@
# Image for running synthesize.py (Tacotron2 + SqueezeWave text-to-speech).
FROM python:3.7
WORKDIR /project
# System build tools plus distro-packaged builds of the scientific Python stack.
RUN apt-get update && apt-get install -y git libatlas-base-dev gfortran python3-grpcio python3-scipy python3-numpy python3-sklearn python3-llvmlite python3-cffi python3-audioread python3-numba
# Put the apt dist-packages directory on PYTHONPATH — presumably so the python3-*
# packages installed above are importable from the image's own Python; confirm.
ENV PYTHONPATH "${PYTHONPATH}":"/usr/lib/python3/dist-packages/"
# Fetch Tacotron2 and replace its bundled waveglow directory with SqueezeWave.
RUN git clone -q --recursive https://github.com/NVIDIA/tacotron2.git
RUN git clone https://github.com/alokprasad/fastspeech_squeezewave
RUN rm -rf tacotron2/waveglow
RUN mv fastspeech_squeezewave/SqueezeWave tacotron2/waveglow
# NOTE(review): synthesize.py adds "fastspeech_squeezewave/SqueezeWave/" to sys.path and
# reads its config from there, but that directory is removed on the next line — confirm
# the script can still locate SqueezeWave inside this image.
RUN rm -rf fastspeech_squeezewave
# NOTE(review): no model files are copied into the image (both COPY lines below are disabled).
#COPY MLPTTS .
#COPY squeezewave_dict.pt .
COPY synthesize.py .
COPY merged.dict_1.1.txt .
# Platform build arguments (presumably supplied by BuildKit/buildx — confirm); recorded in /log.
ARG TARGETPLATFORM
ARG BUILDPLATFORM
RUN echo "I am running on $BUILDPLATFORM, building for $TARGETPLATFORM" > /log
# Both aarch64 wheels are copied regardless of the target platform.
# NOTE(review): the tensorflow wheel is never installed or removed (its RUN line is disabled).
COPY arm_wheels/tensorflow-1.14.0-cp37-none-linux_aarch64.whl .
COPY arm_wheels/torch-1.4.0a0+7f73f1d-cp37-cp37m-linux_aarch64.whl .
#RUN if [ "x$TARGETPLATFORM" = "xlinux/arm64" ] ; then pip install tensorflow-1.14.0-cp37-none-linux_aarch64.whl; rm tensorflow-1.14.0-cp37-none-linux_aarch64.whl; else pip install tensorflow==1.14.0; fi
# On linux/arm64 install the local torch wheel and delete it; otherwise install torch from the index.
RUN if [ "x$TARGETPLATFORM" = "xlinux/arm64" ] ; then pip install torch-1.4.0a0+7f73f1d-cp37-cp37m-linux_aarch64.whl; rm torch-1.4.0a0+7f73f1d-cp37-cp37m-linux_aarch64.whl; else pip install torch; fi
#RUN pip install tensorflow==1.14.0
#COPY tensorflow-1.14.0-cp37-none-linux_armv7l.whl .
# Remaining Python dependencies, installed into the user site-packages.
RUN python3 -m pip install --user librosa==0.6.0 unidecode

+ 16
- 0
README.md View File

@ -0,0 +1,16 @@
# Tia TTS
Experimental TTS for CPU inference using Tacotron2 and Squeezewave.
## Install
Initialize the submodules:
`git submodule update --init --recursive`
Install the python dependencies:
`pip install -r requirements.txt`
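Copy your models into the project directory: `synthesize.py` loads the Tacotron2 checkpoint from `tacotron.pt` and the SqueezeWave weights from `squeezewave_dict.pt`. The project was built around 22 kHz Tacotron2 and SqueezeWave models (`synthesize.py` sets `sampling_rate = 22050`).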
SqueezeWave is loaded from a `state_dict` rather than from a pickled model object. This lets us reuse the pretrained models provided by the paper's author while staying compatible with the tweaked architecture, which enables denoising without retraining the vocoder.
Run the project:
`python synthesize.py`

+ 1
- 0
fastspeech_squeezewave

@ -0,0 +1 @@
Subproject commit 839ea13617a2715e9e6f4f059fbf0dd341357de4

+ 279478
- 0
merged.dict_1.1.txt
File diff suppressed because it is too large
View File


+ 10
- 0
requirements.txt View File

@ -0,0 +1,10 @@
librosa==0.6.0
inflect
numpy
scikit-learn
scipy
six
tensorflow==1.15.0
torch==1.5.1
Unidecode==1.1.1
numba==0.48.0

+ 106
- 0
synthesize.py View File

@ -0,0 +1,106 @@
import os
from os.path import exists, join, basename, splitext
import sys

# Upstream repositories. The local checkout directory of each one is assumed to
# be named after the last URL component with the ".git" suffix removed.
git_repo_url = 'https://github.com/NVIDIA/tacotron2.git'
git_repo_url2 = 'https://github.com/alokprasad/fastspeech_squeezewave.git'
project_name, project_name2 = (
    splitext(basename(repo_url))[0]
    for repo_url in (git_repo_url, git_repo_url2)
)

# Make the SqueezeWave and Tacotron2 sources importable as top-level modules.
# The order matters for module resolution: SqueezeWave first, then Tacotron2.
sys.path.extend([join(project_name2, "SqueezeWave/"), project_name])
import numpy as np
import torch
from hparams import create_hparams
from model import Tacotron2
from text import text_to_sequence
from denoiser import Denoiser
from glow import SqueezeWave
import librosa
import json
# Pronunciation dictionary used by ARPA(): maps the first space-delimited field
# of each line of merged.dict_1.1.txt (the word) to the rest of the line (its
# pronunciation string), stripped of surrounding whitespace.
thisdict = {}
with open('merged.dict_1.1.txt', "r") as dict_file:
    # Walk the file bottom-up so that, when a word appears more than once,
    # the entry closest to the top of the file is the one that is kept.
    for line in reversed(dict_file.read().splitlines()):
        word, separator, pronunciation = line.partition(" ")
        if not separator:
            # Blank or single-field line: there is no pronunciation to store.
            continue
        thisdict[word] = pronunciation.strip()
def ARPA(text, pronunciations=None):
    """Replace every known word in *text* with its dictionary pronunciation.

    The text is split on single spaces. For each token, trailing punctuation
    (any run of ``! ? , . ;``) is set aside, the remaining word is looked up
    in upper case, and a hit is replaced by ``{pronunciation}``. Unknown words
    are left untouched. The trailing punctuation is re-attached in its
    original order and the result is guaranteed to end with ``;``.

    Args:
        text: The sentence to convert.
        pronunciations: Optional mapping from upper-case word to pronunciation
            string. Defaults to the module-level ``thisdict``.

    Returns:
        The converted sentence, always terminated by ``;`` (a blank input
        yields just ``";"``).
    """
    if pronunciations is None:
        pronunciations = thisdict
    punctuation = "!?,.;"
    out = ''
    for token in text.split(" "):
        # Strip the whole trailing punctuation run at once, whatever its
        # order ("what!?" as well as "what?!"); a token that is nothing but
        # punctuation simply ends up with an empty word.
        word = token.rstrip(punctuation)
        end_chars = token[len(word):]
        word_arpa = pronunciations.get(word.upper(), '') if word else ''
        if word_arpa:
            word = "{" + str(word_arpa) + "}"
        # strip() keeps the separator from piling up for empty tokens
        # (consecutive spaces) and for the very first token.
        out = (out + " " + word + end_chars).strip()
    if not out.endswith(";"):
        out = out + ";"
    return out
# --- Tacotron2 (text -> mel spectrogram) ---
tacotron2_pretrained_model = 'tacotron.pt'
# Inference hyper-parameters.
hparams = create_hparams()
hparams.sampling_rate = 22050
hparams.max_decoder_steps = 3000 # how many steps before cutting off generation, too many and you may run out of memory.
hparams.gate_threshold = 0.30 # Model must be 30% sure the clip is over before ending generation
# Deserialize the checkpoint once, onto the CPU, and reuse it for both the
# weights and the training-iteration count.
tacotron2_checkpoint = torch.load(tacotron2_pretrained_model, map_location=torch.device('cpu'))
model = Tacotron2(hparams)
model.load_state_dict(tacotron2_checkpoint['state_dict'])
_ = model.eval()
print("This Tacotron model has been trained for ",tacotron2_checkpoint['iteration']," Iterations.")
del tacotron2_checkpoint  # the weights now live in `model`; release the loaded copy

# --- SqueezeWave vocoder (mel spectrogram -> audio) ---
# Path of the full pickled SqueezeWave model. It is not read below: the
# weights are loaded from the plain state_dict file 'squeezewave_dict.pt'.
waveglow_pretrained_model = 'squeezewave.pt'
with open(join(project_name2, 'SqueezeWave/configs/config_a128_c256.json')) as f:
    config = json.load(f)
waveglow = SqueezeWave(**config['squeezewave_config'])
# strict=False: keys that do not match between the file and this SqueezeWave
# build are ignored instead of raising. Loaded onto the CPU like Tacotron2.
waveglow.load_state_dict(torch.load('squeezewave_dict.pt', map_location=torch.device('cpu')), strict=False)
waveglow = waveglow.remove_weightnorm(waveglow)
waveglow.eval()
for k in waveglow.convinv:
    k.float()
denoiser = Denoiser(waveglow)
print("SqueezeWave model loaded")
import time
# Alternative sample text (not used):
# All right, I've been thinking. , When life gives you lemons? , Don't make lemonade. , Make life take the lemons back! , Get mad! , 'I don't want your damn lemons! What am I supposed to do with these?' , Demand to see life's manager! , Make life rue the day it thought it could give Cave Johnson lemons! , Do you know who I am? , I'm the man who's going to burn your house down! , With the lemons! , I'm going to get my engineers to invent a combustible lemon that burns your house down!
# Text to synthesize. It is split on newlines below; every non-empty line is
# synthesized separately and written to its own .wav file.
text = """
Peter Piper picked a peck of pickled peppers, A peck of pickled peppers Peter Piper picked; If Peter Piper picked a peck of pickled peppers, wheres the peck of pickled peppers Peter Piper picked?
She sells sea shells by the seashore, The shells she sells are sea shells, Im sure. So if she sells sea shells on the seashore, Then Im sure she sells seashore shells.
"""
sigma = 0.75 # passed to waveglow.infer() as `sigma`
denoise_strength = 0.01 # passed to the denoiser as `strength`
raw_input = False # set to True to skip the automatic ARPA() conversion (only a trailing ";" is ensured); useful for inputting your own ARPAbet pronunciations or just for testing
counter = 0 # index of the next output file ('Inf_<counter>_denoised.wav')
# Synthesize every non-blank line of `text` into 'Inf_<counter>_denoised.wav'.
for line in text.split("\n"):
    # Skip empty and whitespace-only lines: there is nothing to synthesize,
    # and such a line would otherwise reach ARPA() with no words in it.
    if not line.strip():
        continue
    start_time = time.time()
    print(line)
    if raw_input:
        # Input is used as typed; only make sure it ends with ";".
        if line[-1] != ";":
            line = line + ";"
    else:
        line = ARPA(line)
    print(line)
    with torch.no_grad():  # inference only: do not track gradients
        # [None, :] adds a leading axis in front of the encoded sequence.
        sequence = np.array(text_to_sequence(line, ['english_cleaners']))[None, :]
        # Plain tensors replace the deprecated torch.autograd.Variable wrapper.
        sequence = torch.from_numpy(sequence).long()
        mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
        audio = waveglow.infer(mel_outputs_postnet, sigma=sigma)
        print("")
        audio_denoised = denoiser(audio, strength=denoise_strength)[:, 0]
        print("Denoised")
        # NOTE(review): librosa.output.write_wav exists in the pinned
        # librosa==0.6.0 but was removed in later librosa releases — keep the
        # pin or switch writers before upgrading.
        librosa.output.write_wav('Inf_' + str(counter) + '_denoised.wav', np.swapaxes(audio_denoised.cpu().numpy(),0,1), hparams.sampling_rate)
    counter += 1
    print("--- %s seconds ---" % (time.time() - start_time))

+ 1
- 0
tacotron2

@ -0,0 +1 @@
Subproject commit 185cd24e046cc1304b4f8e564734d2498c6e2e6f

Loading…
Cancel
Save