diff --git a/FastSpeech/.gitignore b/FastSpeech/.gitignore
new file mode 100644
index 0000000..84f4c08
--- /dev/null
+++ b/FastSpeech/.gitignore
@@ -0,0 +1,114 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+__pycache__
+.vscode
+.DS_Store
+
+
+data/train.txt
+model_new/
+mels/
+alignments/
diff --git a/FastSpeech/LICENSE b/FastSpeech/LICENSE
new file mode 100644
index 0000000..db81f11
--- /dev/null
+++ b/FastSpeech/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2019 Zhengxi Liu
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/FastSpeech/README.md b/FastSpeech/README.md
new file mode 100644
index 0000000..f35f742
--- /dev/null
+++ b/FastSpeech/README.md
@@ -0,0 +1,68 @@
+# FastSpeech-Pytorch
+An implementation of FastSpeech based on PyTorch.
+
+## Update
+### 2019/10/23
+1. Fix bugs in alignment;
+2. Fix bugs in transformer;
+3. Fix bugs in LengthRegulator;
+4. Change the way to process audio;
+5. Use WaveGlow for synthesis.
+
+## Model
+
+![FastSpeech model architecture](img/model.png)
+
+## My Blog
+- [FastSpeech Reading Notes](https://zhuanlan.zhihu.com/p/67325775)
+- [Details and Rethinking of this Implementation](https://zhuanlan.zhihu.com/p/67939482)
+
+## Start
+### Dependencies
+- python 3.6
+- CUDA 10.0
+- pytorch==1.1.0
+- numpy==1.16.2
+- scipy==1.2.1
+- librosa==0.6.3
+- inflect==2.1.0
+- matplotlib==2.2.2
+
+### Prepare Dataset
+1. Download and extract the [LJSpeech dataset](https://keithito.com/LJ-Speech-Dataset/).
+2. Put the LJSpeech dataset in `data`.
+3. Unzip `alignments.zip`. \*
+4. Put the [Nvidia pretrained WaveGlow model](https://drive.google.com/file/d/1WsibBTsuRg_SF2Z6L6NFRTT-NjEy1oTx/view?usp=sharing) in `waveglow/pretrained_model`.
+5. Run `python preprocess.py`.
+
+*\* If you want to compute the alignments yourself, do not unzip `alignments.zip`; instead, put the [Nvidia pretrained Tacotron2 model](https://drive.google.com/file/d/1c5ZTuT7J08wLUoVZ2KkUs_VdZuJ86ZqA/view?usp=sharing) in `Tacotron2/pretrained_model`.*
+
+## Training
+Run `python train.py`.
+
+## Test
+Run `python synthesis.py`.
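+
+`synthesis.py` loads a FastSpeech checkpoint, generates a mel-spectrogram, and vocodes it with WaveGlow. If you want to call the model from your own script, the snippet below is a minimal sketch of the same flow (it assumes a checkpoint such as step 112000 exists in `model_new/` and that the pretrained WaveGlow model is in `waveglow/pretrained_model`; the sentence and output file name are only examples):
+
+```python
+import os
+
+import utils
+import waveglow
+from synthesis import get_FastSpeech, synthesis
+
+# Load FastSpeech from model_new/checkpoint_112000.pth.tar
+model = get_FastSpeech(112000)
+
+# alpha controls speaking speed (alpha < 1.0 is faster, alpha > 1.0 is slower)
+_, _, _, mel_postnet_torch = synthesis(
+    model, "The quick brown fox jumps over the lazy dog.", alpha=1.0)
+
+# Vocode the predicted mel-spectrogram with the pretrained WaveGlow model
+wave_glow = utils.get_WaveGlow()
+os.makedirs("results", exist_ok=True)
+waveglow.inference.inference(
+    mel_postnet_torch, wave_glow, os.path.join("results", "demo_waveglow.wav"))
+```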
+
+## Pretrained Model
+- Baidu: [Step:112000](https://pan.baidu.com/s/1by3-8t3A6uihK8K9IFZ7rg), extraction code: xpk7
+- OneDrive: [Step:112000](https://1drv.ms/u/s!AuC2oR4FhoZ29kriYhuodY4-gPsT?e=zUIC8G)
+
+## Notes
+- In the FastSpeech paper, the authors use a pre-trained Transformer-TTS model to provide the alignment targets. Since I did not have a well-trained Transformer-TTS model, I use Tacotron2 instead.
+- The examples of audio are in `results`.
+- The output and alignment of Tacotron2 are shown below (the synthesized sentence is "I want to go to CMU to do research on deep learning."):
+
+![Tacotron2 outputs and alignment](img/tacotron2_outputs.jpg)
+
+- The outputs of FastSpeech and Tacotron2 (the right one is Tacotron2) are shown below (the synthesized sentence is "Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition."):
+
+![FastSpeech and Tacotron2 mel-spectrogram comparison](img/model_test.jpg)
+
+## Reference
+- [The Implementation of Tacotron Based on Tensorflow](https://github.com/keithito/tacotron)
+- [The Implementation of Transformer Based on Pytorch](https://github.com/jadore801120/attention-is-all-you-need-pytorch)
+- [The Implementation of Transformer-TTS Based on Pytorch](https://github.com/xcmyz/Transformer-TTS)
+- [The Implementation of Tacotron2 Based on Pytorch](https://github.com/NVIDIA/tacotron2)
diff --git a/FastSpeech/alignments.zip b/FastSpeech/alignments.zip
new file mode 100644
index 0000000..e7c9723
Binary files /dev/null and b/FastSpeech/alignments.zip differ
diff --git a/FastSpeech/audio/__init__.py b/FastSpeech/audio/__init__.py
new file mode 100644
index 0000000..c2e5a9f
--- /dev/null
+++ b/FastSpeech/audio/__init__.py
@@ -0,0 +1,4 @@
+import audio.hparams
+import audio.tools
+import audio.stft
+import audio.audio_processing
diff --git a/FastSpeech/audio/audio_processing.py b/FastSpeech/audio/audio_processing.py
new file mode 100644
index 0000000..d0a9499
--- /dev/null
+++ b/FastSpeech/audio/audio_processing.py
@@ -0,0 +1,94 @@
+import torch
+import numpy as np
+from scipy.signal import get_window
+import librosa.util as librosa_util
+
+
+def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
+ n_fft=800, dtype=np.float32, norm=None):
+ """
+ # from librosa 0.6
+ Compute the sum-square envelope of a window function at a given hop length.
+
+ This is used to estimate modulation effects induced by windowing
+ observations in short-time fourier transforms.
+
+ Parameters
+ ----------
+ window : string, tuple, number, callable, or list-like
+ Window specification, as in `get_window`
+
+ n_frames : int > 0
+ The number of analysis frames
+
+ hop_length : int > 0
+ The number of samples to advance between frames
+
+ win_length : [optional]
+ The length of the window function. By default, this matches `n_fft`.
+
+ n_fft : int > 0
+ The length of each analysis frame.
+
+ dtype : np.dtype
+ The data type of the output
+
+ Returns
+ -------
+ wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
+ The sum-squared envelope of the window function
+ """
+ if win_length is None:
+ win_length = n_fft
+
+ n = n_fft + hop_length * (n_frames - 1)
+ x = np.zeros(n, dtype=dtype)
+
+ # Compute the squared window at the desired length
+ win_sq = get_window(window, win_length, fftbins=True)
+ win_sq = librosa_util.normalize(win_sq, norm=norm)**2
+ win_sq = librosa_util.pad_center(win_sq, n_fft)
+
+ # Fill the envelope
+ for i in range(n_frames):
+ sample = i * hop_length
+ x[sample:min(n, sample + n_fft)
+ ] += win_sq[:max(0, min(n_fft, n - sample))]
+ return x
+
+
+def griffin_lim(magnitudes, stft_fn, n_iters=30):
+ """
+ PARAMS
+ ------
+ magnitudes: spectrogram magnitudes
+ stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
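+    n_iters: number of Griffin-Lim iterations used to estimate the phase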
+ """
+
+ angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
+ angles = angles.astype(np.float32)
+ angles = torch.autograd.Variable(torch.from_numpy(angles))
+ signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
+
+ for i in range(n_iters):
+ _, angles = stft_fn.transform(signal)
+ signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
+ return signal
+
+
+def dynamic_range_compression(x, C=1, clip_val=1e-5):
+ """
+ PARAMS
+ ------
+ C: compression factor
+ """
+ return torch.log(torch.clamp(x, min=clip_val) * C)
+
+
+def dynamic_range_decompression(x, C=1):
+ """
+ PARAMS
+ ------
+ C: compression factor used to compress
+ """
+ return torch.exp(x) / C
diff --git a/FastSpeech/audio/hparams.py b/FastSpeech/audio/hparams.py
new file mode 100644
index 0000000..c27408f
--- /dev/null
+++ b/FastSpeech/audio/hparams.py
@@ -0,0 +1,8 @@
+max_wav_value = 32768.0
+sampling_rate = 22050
+filter_length = 1024
+hop_length = 256
+win_length = 1024
+n_mel_channels = 80
+mel_fmin = 0.0
+mel_fmax = 8000.0
diff --git a/FastSpeech/audio/stft.py b/FastSpeech/audio/stft.py
new file mode 100644
index 0000000..520510c
--- /dev/null
+++ b/FastSpeech/audio/stft.py
@@ -0,0 +1,158 @@
+import torch
+import torch.nn.functional as F
+from torch.autograd import Variable
+import numpy as np
+
+from scipy.signal import get_window
+from librosa.util import pad_center, tiny
+from librosa.filters import mel as librosa_mel_fn
+
+from audio.audio_processing import dynamic_range_compression
+from audio.audio_processing import dynamic_range_decompression
+from audio.audio_processing import window_sumsquare
+
+
+class STFT(torch.nn.Module):
+ """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
+
+ def __init__(self, filter_length=800, hop_length=200, win_length=800,
+ window='hann'):
+ super(STFT, self).__init__()
+ self.filter_length = filter_length
+ self.hop_length = hop_length
+ self.win_length = win_length
+ self.window = window
+ self.forward_transform = None
+ scale = self.filter_length / self.hop_length
+ fourier_basis = np.fft.fft(np.eye(self.filter_length))
+
+ cutoff = int((self.filter_length / 2 + 1))
+ fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]),
+ np.imag(fourier_basis[:cutoff, :])])
+
+ forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
+ inverse_basis = torch.FloatTensor(
+ np.linalg.pinv(scale * fourier_basis).T[:, None, :])
+
+ if window is not None:
+ assert(filter_length >= win_length)
+ # get window and zero center pad it to filter_length
+ fft_window = get_window(window, win_length, fftbins=True)
+ fft_window = pad_center(fft_window, filter_length)
+ fft_window = torch.from_numpy(fft_window).float()
+
+ # window the bases
+ forward_basis *= fft_window
+ inverse_basis *= fft_window
+
+ self.register_buffer('forward_basis', forward_basis.float())
+ self.register_buffer('inverse_basis', inverse_basis.float())
+
+ def transform(self, input_data):
+ num_batches = input_data.size(0)
+ num_samples = input_data.size(1)
+
+ self.num_samples = num_samples
+
+ # similar to librosa, reflect-pad the input
+ input_data = input_data.view(num_batches, 1, num_samples)
+ input_data = F.pad(
+ input_data.unsqueeze(1),
+ (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
+ mode='reflect')
+ input_data = input_data.squeeze(1)
+
+ forward_transform = F.conv1d(
+ input_data.cuda(),
+ Variable(self.forward_basis, requires_grad=False).cuda(),
+ stride=self.hop_length,
+ padding=0).cpu()
+
+ cutoff = int((self.filter_length / 2) + 1)
+ real_part = forward_transform[:, :cutoff, :]
+ imag_part = forward_transform[:, cutoff:, :]
+
+ magnitude = torch.sqrt(real_part**2 + imag_part**2)
+ phase = torch.autograd.Variable(
+ torch.atan2(imag_part.data, real_part.data))
+
+ return magnitude, phase
+
+ def inverse(self, magnitude, phase):
+ recombine_magnitude_phase = torch.cat(
+ [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1)
+
+ inverse_transform = F.conv_transpose1d(
+ recombine_magnitude_phase,
+ Variable(self.inverse_basis, requires_grad=False),
+ stride=self.hop_length,
+ padding=0)
+
+ if self.window is not None:
+ window_sum = window_sumsquare(
+ self.window, magnitude.size(-1), hop_length=self.hop_length,
+ win_length=self.win_length, n_fft=self.filter_length,
+ dtype=np.float32)
+ # remove modulation effects
+ approx_nonzero_indices = torch.from_numpy(
+ np.where(window_sum > tiny(window_sum))[0])
+ window_sum = torch.autograd.Variable(
+ torch.from_numpy(window_sum), requires_grad=False)
+ window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum
+ inverse_transform[:, :,
+ approx_nonzero_indices] /= window_sum[approx_nonzero_indices]
+
+ # scale by hop ratio
+ inverse_transform *= float(self.filter_length) / self.hop_length
+
+ inverse_transform = inverse_transform[:, :, int(self.filter_length/2):]
+        inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2)]
+
+ return inverse_transform
+
+ def forward(self, input_data):
+ self.magnitude, self.phase = self.transform(input_data)
+ reconstruction = self.inverse(self.magnitude, self.phase)
+ return reconstruction
+
+
+class TacotronSTFT(torch.nn.Module):
+ def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
+ n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
+ mel_fmax=8000.0):
+ super(TacotronSTFT, self).__init__()
+ self.n_mel_channels = n_mel_channels
+ self.sampling_rate = sampling_rate
+ self.stft_fn = STFT(filter_length, hop_length, win_length)
+ mel_basis = librosa_mel_fn(
+ sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
+ mel_basis = torch.from_numpy(mel_basis).float()
+ self.register_buffer('mel_basis', mel_basis)
+
+ def spectral_normalize(self, magnitudes):
+ output = dynamic_range_compression(magnitudes)
+ return output
+
+ def spectral_de_normalize(self, magnitudes):
+ output = dynamic_range_decompression(magnitudes)
+ return output
+
+ def mel_spectrogram(self, y):
+ """Computes mel-spectrograms from a batch of waves
+ PARAMS
+ ------
+ y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
+
+ RETURNS
+ -------
+ mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
+ """
+ assert(torch.min(y.data) >= -1)
+ assert(torch.max(y.data) <= 1)
+
+ magnitudes, phases = self.stft_fn.transform(y)
+ magnitudes = magnitudes.data
+ mel_output = torch.matmul(self.mel_basis, magnitudes)
+ mel_output = self.spectral_normalize(mel_output)
+ return mel_output
diff --git a/FastSpeech/audio/tools.py b/FastSpeech/audio/tools.py
new file mode 100644
index 0000000..d0dcbd4
--- /dev/null
+++ b/FastSpeech/audio/tools.py
@@ -0,0 +1,66 @@
+import torch
+import numpy as np
+from scipy.io.wavfile import read
+from scipy.io.wavfile import write
+
+import audio.stft as stft
+import audio.hparams as hparams
+from audio.audio_processing import griffin_lim
+
+_stft = stft.TacotronSTFT(
+ hparams.filter_length, hparams.hop_length, hparams.win_length,
+ hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
+ hparams.mel_fmax)
+
+
+def load_wav_to_torch(full_path):
+ sampling_rate, data = read(full_path)
+ return torch.FloatTensor(data.astype(np.float32)), sampling_rate
+
+
+def get_mel(filename):
+ audio, sampling_rate = load_wav_to_torch(filename)
+ if sampling_rate != _stft.sampling_rate:
+ raise ValueError("{} {} SR doesn't match target {} SR".format(
+ sampling_rate, _stft.sampling_rate))
+ audio_norm = audio / hparams.max_wav_value
+ audio_norm = audio_norm.unsqueeze(0)
+ audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
+ melspec = _stft.mel_spectrogram(audio_norm)
+ melspec = torch.squeeze(melspec, 0)
+ # melspec = torch.from_numpy(_normalize(melspec.numpy()))
+
+ return melspec
+
+
+def get_mel_from_wav(audio):
+ sampling_rate = hparams.sampling_rate
+ if sampling_rate != _stft.sampling_rate:
+ raise ValueError("{} {} SR doesn't match target {} SR".format(
+ sampling_rate, _stft.sampling_rate))
+ audio_norm = audio / hparams.max_wav_value
+ audio_norm = audio_norm.unsqueeze(0)
+ audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
+ melspec = _stft.mel_spectrogram(audio_norm)
+ melspec = torch.squeeze(melspec, 0)
+
+ return melspec
+
+
+def inv_mel_spec(mel, out_filename, griffin_iters=60):
+ mel = torch.stack([mel])
+ # mel = torch.stack([torch.from_numpy(_denormalize(mel.numpy()))])
+ mel_decompress = _stft.spectral_de_normalize(mel)
+ mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
+ spec_from_mel_scaling = 1000
+ spec_from_mel = torch.mm(mel_decompress[0], _stft.mel_basis)
+ spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
+ spec_from_mel = spec_from_mel * spec_from_mel_scaling
+
+ audio = griffin_lim(torch.autograd.Variable(
+ spec_from_mel[:, :, :-1]), _stft.stft_fn, griffin_iters)
+
+ audio = audio.squeeze()
+ audio = audio.cpu().numpy()
+ audio_path = out_filename
+ write(audio_path, hparams.sampling_rate, audio)
diff --git a/FastSpeech/data/ljspeech.py b/FastSpeech/data/ljspeech.py
new file mode 100644
index 0000000..d6abd5d
--- /dev/null
+++ b/FastSpeech/data/ljspeech.py
@@ -0,0 +1,34 @@
+import numpy as np
+import os
+import audio as Audio
+
+
+def build_from_path(in_dir, out_dir):
+ index = 1
+ out = list()
+
+ with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f:
+ for line in f:
+ parts = line.strip().split('|')
+ wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0])
+ text = parts[2]
+ out.append(_process_utterance(out_dir, index, wav_path, text))
+
+ if index % 100 == 0:
+ print("Done %d" % index)
+ index = index + 1
+
+ return out
+
+
+def _process_utterance(out_dir, index, wav_path, text):
+ # Compute a mel-scale spectrogram from the wav:
+ mel_spectrogram = Audio.tools.get_mel(wav_path).numpy().astype(np.float32)
+ # print(mel_spectrogram)
+
+ # Write the spectrograms to disk:
+ mel_filename = 'ljspeech-mel-%05d.npy' % index
+ np.save(os.path.join(out_dir, mel_filename),
+ mel_spectrogram.T, allow_pickle=False)
+
+ return text
diff --git a/FastSpeech/dataset.py b/FastSpeech/dataset.py
new file mode 100644
index 0000000..3e2f376
--- /dev/null
+++ b/FastSpeech/dataset.py
@@ -0,0 +1,124 @@
+import torch
+from torch.nn import functional as F
+from torch.utils.data import Dataset, DataLoader
+
+import numpy as np
+import math
+import os
+
+import hparams
+import audio as Audio
+from text import text_to_sequence
+from utils import process_text, pad_1D, pad_2D
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+
+class FastSpeechDataset(Dataset):
+ """ LJSpeech """
+
+ def __init__(self):
+ self.text = process_text(os.path.join("data", "train.txt"))
+
+ def __len__(self):
+ return len(self.text)
+
+ def __getitem__(self, idx):
+ mel_gt_name = os.path.join(
+ hparams.mel_ground_truth, "ljspeech-mel-%05d.npy" % (idx+1))
+ mel_gt_target = np.load(mel_gt_name)
+ D = np.load(os.path.join(hparams.alignment_path, str(idx)+".npy"))
+
+ character = self.text[idx][0:len(self.text[idx])-1]
+ character = np.array(text_to_sequence(
+ character, hparams.text_cleaners))
+
+ sample = {"text": character,
+ "mel_target": mel_gt_target,
+ "D": D}
+
+ return sample
+
+
+def reprocess(batch, cut_list):
+ texts = [batch[ind]["text"] for ind in cut_list]
+ mel_targets = [batch[ind]["mel_target"] for ind in cut_list]
+ Ds = [batch[ind]["D"] for ind in cut_list]
+
+ length_text = np.array([])
+ for text in texts:
+ length_text = np.append(length_text, text.shape[0])
+
+ src_pos = list()
+ max_len = int(max(length_text))
+ for length_src_row in length_text:
+ src_pos.append(np.pad([i+1 for i in range(int(length_src_row))],
+ (0, max_len-int(length_src_row)), 'constant'))
+ src_pos = np.array(src_pos)
+
+ length_mel = np.array(list())
+ for mel in mel_targets:
+ length_mel = np.append(length_mel, mel.shape[0])
+
+ mel_pos = list()
+ max_mel_len = int(max(length_mel))
+ for length_mel_row in length_mel:
+ mel_pos.append(np.pad([i+1 for i in range(int(length_mel_row))],
+ (0, max_mel_len-int(length_mel_row)), 'constant'))
+ mel_pos = np.array(mel_pos)
+
+ texts = pad_1D(texts)
+ Ds = pad_1D(Ds)
+ mel_targets = pad_2D(mel_targets)
+
+ out = {"text": texts,
+ "mel_target": mel_targets,
+ "D": Ds,
+ "mel_pos": mel_pos,
+ "src_pos": src_pos,
+ "mel_max_len": max_mel_len}
+
+ return out
+
+
+def collate_fn(batch):
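+    # The DataLoader delivers a "big" batch; samples are sorted by text length
+    # (descending) and split into sqrt(batchsize) sub-batches so that sequences
+    # of similar length are padded together.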
+ len_arr = np.array([d["text"].shape[0] for d in batch])
+ index_arr = np.argsort(-len_arr)
+ batchsize = len(batch)
+ real_batchsize = int(math.sqrt(batchsize))
+
+ cut_list = list()
+ for i in range(real_batchsize):
+ cut_list.append(index_arr[i*real_batchsize:(i+1)*real_batchsize])
+
+ output = list()
+ for i in range(real_batchsize):
+ output.append(reprocess(batch, cut_list[i]))
+
+ return output
+
+
+if __name__ == "__main__":
+ # Test
+ dataset = FastSpeechDataset()
+ training_loader = DataLoader(dataset,
+ batch_size=1,
+ shuffle=False,
+ collate_fn=collate_fn,
+ drop_last=True,
+ num_workers=0)
+ total_step = hparams.epochs * len(training_loader) * hparams.batch_size
+
+ cnt = 0
+ for i, batchs in enumerate(training_loader):
+ for j, data_of_batch in enumerate(batchs):
+ mel_target = torch.from_numpy(
+ data_of_batch["mel_target"]).float().to(device)
+ D = torch.from_numpy(data_of_batch["D"]).int().to(device)
+ # print(mel_target.size())
+ # print(D.sum())
+ print(cnt)
+ if mel_target.size(1) == D.sum().item():
+ cnt += 1
+
+ print(cnt)
diff --git a/FastSpeech/fastspeech.py b/FastSpeech/fastspeech.py
new file mode 100644
index 0000000..cf14d81
--- /dev/null
+++ b/FastSpeech/fastspeech.py
@@ -0,0 +1,54 @@
+import torch
+import torch.nn as nn
+
+from transformer.Models import Encoder, Decoder
+from transformer.Layers import Linear, PostNet
+from modules import LengthRegulator
+import hparams as hp
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+
+class FastSpeech(nn.Module):
+ """ FastSpeech """
+
+ def __init__(self):
+ super(FastSpeech, self).__init__()
+
+ self.encoder = Encoder()
+ self.length_regulator = LengthRegulator()
+ self.decoder = Decoder()
+
+ self.mel_linear = Linear(hp.decoder_output_size, hp.num_mels)
+ self.postnet = PostNet()
+
+ def forward(self, src_seq, src_pos, mel_pos=None, mel_max_length=None, length_target=None, alpha=1.0):
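+        # Training uses the ground-truth durations (length_target) to drive the
+        # Length Regulator; inference uses the Duration Predictor's own output.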
+ encoder_output, _ = self.encoder(src_seq, src_pos)
+
+ if self.training:
+ length_regulator_output, duration_predictor_output = self.length_regulator(encoder_output,
+ target=length_target,
+ alpha=alpha,
+ mel_max_length=mel_max_length)
+ decoder_output = self.decoder(length_regulator_output, mel_pos)
+
+ mel_output = self.mel_linear(decoder_output)
+ mel_output_postnet = self.postnet(mel_output) + mel_output
+
+ return mel_output, mel_output_postnet, duration_predictor_output
+ else:
+ length_regulator_output, decoder_pos = self.length_regulator(encoder_output,
+ alpha=alpha)
+
+ decoder_output = self.decoder(length_regulator_output, decoder_pos)
+
+ mel_output = self.mel_linear(decoder_output)
+ mel_output_postnet = self.postnet(mel_output) + mel_output
+
+ return mel_output, mel_output_postnet
+
+
+if __name__ == "__main__":
+ # Test
+ model = FastSpeech()
+ print(sum(param.numel() for param in model.parameters()))
diff --git a/FastSpeech/glow.py b/FastSpeech/glow.py
new file mode 100644
index 0000000..749160b
--- /dev/null
+++ b/FastSpeech/glow.py
@@ -0,0 +1,317 @@
+# *****************************************************************************
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of the NVIDIA CORPORATION nor the
+# names of its contributors may be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *****************************************************************************
+import copy
+import torch
+from torch.autograd import Variable
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+ n_channels_int = n_channels[0]
+ in_act = input_a+input_b
+ t_act = torch.nn.functional.tanh(in_act[:, :n_channels_int, :])
+ s_act = torch.nn.functional.sigmoid(in_act[:, n_channels_int:, :])
+ acts = t_act * s_act
+ return acts
+
+
+class WaveGlowLoss(torch.nn.Module):
+ def __init__(self, sigma=1.0):
+ super(WaveGlowLoss, self).__init__()
+ self.sigma = sigma
+
+ def forward(self, model_output):
+ z, log_s_list, log_det_W_list = model_output
+ for i, log_s in enumerate(log_s_list):
+ if i == 0:
+ log_s_total = torch.sum(log_s)
+ log_det_W_total = log_det_W_list[i]
+ else:
+ log_s_total = log_s_total + torch.sum(log_s)
+ log_det_W_total += log_det_W_list[i]
+
+ loss = torch.sum(z*z)/(2*self.sigma*self.sigma) - \
+ log_s_total - log_det_W_total
+ return loss/(z.size(0)*z.size(1)*z.size(2))
+
+
+class Invertible1x1Conv(torch.nn.Module):
+ """
+ The layer outputs both the convolution, and the log determinant
+ of its weight matrix. If reverse=True it does convolution with
+ inverse
+ """
+
+ def __init__(self, c):
+ super(Invertible1x1Conv, self).__init__()
+ self.conv = torch.nn.Conv1d(c, c, kernel_size=1, stride=1, padding=0,
+ bias=False)
+
+ # Sample a random orthonormal matrix to initialize weights
+ W = torch.qr(torch.FloatTensor(c, c).normal_())[0]
+
+ # Ensure determinant is 1.0 not -1.0
+ if torch.det(W) < 0:
+ W[:, 0] = -1*W[:, 0]
+ W = W.view(c, c, 1)
+ self.conv.weight.data = W
+
+ def forward(self, z, reverse=False):
+ # shape
+ batch_size, group_size, n_of_groups = z.size()
+
+ W = self.conv.weight.squeeze()
+
+ if reverse:
+ if not hasattr(self, 'W_inverse'):
+ # Reverse computation
+ W_inverse = W.inverse()
+ W_inverse = Variable(W_inverse[..., None])
+ if z.type() == 'torch.cuda.HalfTensor':
+ W_inverse = W_inverse.half()
+ self.W_inverse = W_inverse
+ z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0)
+ return z
+ else:
+ # Forward computation
+ log_det_W = batch_size * n_of_groups * torch.logdet(W)
+ z = self.conv(z)
+ return z, log_det_W
+
+
+class WN(torch.nn.Module):
+ """
+ This is the WaveNet like layer for the affine coupling. The primary difference
+ from WaveNet is the convolutions need not be causal. There is also no dilation
+ size reset. The dilation only doubles on each layer
+ """
+
+ def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels,
+ kernel_size):
+ super(WN, self).__init__()
+ assert(kernel_size % 2 == 1)
+ assert(n_channels % 2 == 0)
+ self.n_layers = n_layers
+ self.n_channels = n_channels
+ self.in_layers = torch.nn.ModuleList()
+ self.res_skip_layers = torch.nn.ModuleList()
+ self.cond_layers = torch.nn.ModuleList()
+
+ start = torch.nn.Conv1d(n_in_channels, n_channels, 1)
+ start = torch.nn.utils.weight_norm(start, name='weight')
+ self.start = start
+
+ # Initializing last layer to 0 makes the affine coupling layers
+ # do nothing at first. This helps with training stability
+ end = torch.nn.Conv1d(n_channels, 2*n_in_channels, 1)
+ end.weight.data.zero_()
+ end.bias.data.zero_()
+ self.end = end
+
+ for i in range(n_layers):
+ dilation = 2 ** i
+ padding = int((kernel_size*dilation - dilation)/2)
+ in_layer = torch.nn.Conv1d(n_channels, 2*n_channels, kernel_size,
+ dilation=dilation, padding=padding)
+ in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
+ self.in_layers.append(in_layer)
+
+ cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels, 1)
+ cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
+ self.cond_layers.append(cond_layer)
+
+ # last one is not necessary
+ if i < n_layers - 1:
+ res_skip_channels = 2*n_channels
+ else:
+ res_skip_channels = n_channels
+ res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1)
+ res_skip_layer = torch.nn.utils.weight_norm(
+ res_skip_layer, name='weight')
+ self.res_skip_layers.append(res_skip_layer)
+
+ def forward(self, forward_input):
+ audio, spect = forward_input
+ audio = self.start(audio)
+
+ for i in range(self.n_layers):
+ acts = fused_add_tanh_sigmoid_multiply(
+ self.in_layers[i](audio),
+ self.cond_layers[i](spect),
+ torch.IntTensor([self.n_channels]))
+
+ res_skip_acts = self.res_skip_layers[i](acts)
+ if i < self.n_layers - 1:
+ audio = res_skip_acts[:, :self.n_channels, :] + audio
+ skip_acts = res_skip_acts[:, self.n_channels:, :]
+ else:
+ skip_acts = res_skip_acts
+
+ if i == 0:
+ output = skip_acts
+ else:
+ output = skip_acts + output
+ return self.end(output)
+
+
+class WaveGlow(torch.nn.Module):
+ def __init__(self, n_mel_channels, n_flows, n_group, n_early_every,
+ n_early_size, WN_config):
+ super(WaveGlow, self).__init__()
+
+ self.upsample = torch.nn.ConvTranspose1d(n_mel_channels,
+ n_mel_channels,
+ 1024, stride=256)
+ assert(n_group % 2 == 0)
+ self.n_flows = n_flows
+ self.n_group = n_group
+ self.n_early_every = n_early_every
+ self.n_early_size = n_early_size
+ self.WN = torch.nn.ModuleList()
+ self.convinv = torch.nn.ModuleList()
+
+ n_half = int(n_group/2)
+
+ # Set up layers with the right sizes based on how many dimensions
+ # have been output already
+ n_remaining_channels = n_group
+ for k in range(n_flows):
+ if k % self.n_early_every == 0 and k > 0:
+ n_half = n_half - int(self.n_early_size/2)
+ n_remaining_channels = n_remaining_channels - self.n_early_size
+ self.convinv.append(Invertible1x1Conv(n_remaining_channels))
+ self.WN.append(WN(n_half, n_mel_channels*n_group, **WN_config))
+ self.n_remaining_channels = n_remaining_channels # Useful during inference
+
+ def forward(self, forward_input):
+ """
+ forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames
+ forward_input[1] = audio: batch x time
+ """
+ spect, audio = forward_input
+
+ # Upsample spectrogram to size of audio
+ spect = self.upsample(spect)
+ assert(spect.size(2) >= audio.size(1))
+ if spect.size(2) > audio.size(1):
+ spect = spect[:, :, :audio.size(1)]
+
+ spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
+ spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1)
+
+ audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1)
+ output_audio = []
+ log_s_list = []
+ log_det_W_list = []
+
+ for k in range(self.n_flows):
+ if k % self.n_early_every == 0 and k > 0:
+ output_audio.append(audio[:, :self.n_early_size, :])
+ audio = audio[:, self.n_early_size:, :]
+
+ audio, log_det_W = self.convinv[k](audio)
+ log_det_W_list.append(log_det_W)
+
+ n_half = int(audio.size(1)/2)
+ audio_0 = audio[:, :n_half, :]
+ audio_1 = audio[:, n_half:, :]
+
+ output = self.WN[k]((audio_0, spect))
+ log_s = output[:, n_half:, :]
+ b = output[:, :n_half, :]
+ audio_1 = torch.exp(log_s)*audio_1 + b
+ log_s_list.append(log_s)
+
+ audio = torch.cat([audio_0, audio_1], 1)
+
+ output_audio.append(audio)
+ return torch.cat(output_audio, 1), log_s_list, log_det_W_list
+
+ def infer(self, spect, sigma=1.0):
+ spect = self.upsample(spect)
+ # trim conv artifacts. maybe pad spec to kernel multiple
+ time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0]
+ spect = spect[:, :, :-time_cutoff]
+
+ spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
+ spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1)
+
+ if spect.type() == 'torch.cuda.HalfTensor':
+ audio = torch.cuda.HalfTensor(spect.size(0),
+ self.n_remaining_channels,
+ spect.size(2)).normal_()
+ else:
+ audio = torch.cuda.FloatTensor(spect.size(0),
+ self.n_remaining_channels,
+ spect.size(2)).normal_()
+
+ audio = torch.autograd.Variable(sigma*audio)
+
+ for k in reversed(range(self.n_flows)):
+ n_half = int(audio.size(1)/2)
+ audio_0 = audio[:, :n_half, :]
+ audio_1 = audio[:, n_half:, :]
+
+ output = self.WN[k]((audio_0, spect))
+ s = output[:, n_half:, :]
+ b = output[:, :n_half, :]
+ audio_1 = (audio_1 - b)/torch.exp(s)
+ audio = torch.cat([audio_0, audio_1], 1)
+
+ audio = self.convinv[k](audio, reverse=True)
+
+ if k % self.n_early_every == 0 and k > 0:
+ if spect.type() == 'torch.cuda.HalfTensor':
+ z = torch.cuda.HalfTensor(spect.size(
+ 0), self.n_early_size, spect.size(2)).normal_()
+ else:
+ z = torch.cuda.FloatTensor(spect.size(
+ 0), self.n_early_size, spect.size(2)).normal_()
+ audio = torch.cat((sigma*z, audio), 1)
+
+ audio = audio.permute(0, 2, 1).contiguous().view(
+ audio.size(0), -1).data
+ return audio
+
+ @staticmethod
+ def remove_weightnorm(model):
+ waveglow = model
+ for WN in waveglow.WN:
+ WN.start = torch.nn.utils.remove_weight_norm(WN.start)
+ WN.in_layers = remove(WN.in_layers)
+ WN.cond_layers = remove(WN.cond_layers)
+ WN.res_skip_layers = remove(WN.res_skip_layers)
+ return waveglow
+
+
+def remove(conv_list):
+ new_conv_list = torch.nn.ModuleList()
+ for old_conv in conv_list:
+ old_conv = torch.nn.utils.remove_weight_norm(old_conv)
+ new_conv_list.append(old_conv)
+ return new_conv_list
diff --git a/FastSpeech/hparams.py b/FastSpeech/hparams.py
new file mode 100644
index 0000000..e4d5d5c
--- /dev/null
+++ b/FastSpeech/hparams.py
@@ -0,0 +1,52 @@
+from text import symbols
+
+# Text
+text_cleaners = ['english_cleaners']
+
+# Mel
+n_mel_channels = 80
+num_mels = 80
+
+# FastSpeech
+vocab_size = 1024
+N = 6
+Head = 2
+d_model = 384
+duration_predictor_filter_size = 256
+duration_predictor_kernel_size = 3
+dropout = 0.1
+
+word_vec_dim = 384
+encoder_n_layer = 6
+encoder_head = 2
+encoder_conv1d_filter_size = 1536
+max_sep_len = 2048
+encoder_output_size = 384
+decoder_n_layer = 6
+decoder_head = 2
+decoder_conv1d_filter_size = 1536
+decoder_output_size = 384
+fft_conv1d_kernel = 3
+fft_conv1d_padding = 1
+
+# Train
+alignment_path = "./alignments"
+checkpoint_path = "./model_new"
+logger_path = "./logger"
+mel_ground_truth = "./mels"
+
+batch_size = 64
+epochs = 1000
+n_warm_up_step = 4000
+
+learning_rate = 1e-3
+weight_decay = 1e-6
+grad_clip_thresh = 1.0
+decay_step = [500000, 1000000, 2000000]
+
+save_step = 1000
+log_step = 5
+clear_Time = 20
diff --git a/FastSpeech/img/model.png b/FastSpeech/img/model.png
new file mode 100644
index 0000000..3a1f2b1
Binary files /dev/null and b/FastSpeech/img/model.png differ
diff --git a/FastSpeech/img/model_test.jpg b/FastSpeech/img/model_test.jpg
new file mode 100644
index 0000000..868916a
Binary files /dev/null and b/FastSpeech/img/model_test.jpg differ
diff --git a/FastSpeech/img/tacotron2_outputs.jpg b/FastSpeech/img/tacotron2_outputs.jpg
new file mode 100644
index 0000000..b13276c
Binary files /dev/null and b/FastSpeech/img/tacotron2_outputs.jpg differ
diff --git a/FastSpeech/loss.py b/FastSpeech/loss.py
new file mode 100644
index 0000000..ae114c8
--- /dev/null
+++ b/FastSpeech/loss.py
@@ -0,0 +1,29 @@
+import torch
+import torch.nn as nn
+
+
+class FastSpeechLoss(nn.Module):
+ """ FastSPeech Loss """
+
+ def __init__(self):
+ super(FastSpeechLoss, self).__init__()
+ self.mse_loss = nn.MSELoss()
+ self.l1_loss = nn.L1Loss()
+
+ def forward(self, mel, mel_postnet, duration_predicted, mel_target, duration_predictor_target):
+ mel_target.requires_grad = False
+ mel_loss = self.mse_loss(mel, mel_target)
+ mel_postnet_loss = self.mse_loss(mel_postnet, mel_target)
+
+ duration_predictor_target.requires_grad = False
+ # duration_predictor_target = duration_predictor_target + 1
+ # duration_predictor_target = torch.log(
+ # duration_predictor_target.float())
+
+ # print(duration_predictor_target)
+ # print(duration_predicted)
+
+ duration_predictor_loss = self.l1_loss(
+ duration_predicted, duration_predictor_target.float())
+
+ return mel_loss, mel_postnet_loss, duration_predictor_loss
diff --git a/FastSpeech/modules.py b/FastSpeech/modules.py
new file mode 100644
index 0000000..fac8c37
--- /dev/null
+++ b/FastSpeech/modules.py
@@ -0,0 +1,404 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from collections import OrderedDict
+import numpy as np
+import copy
+import math
+
+import hparams as hp
+import utils
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+
+def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
+ ''' Sinusoid position encoding table '''
+
+ def cal_angle(position, hid_idx):
+ return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)
+
+ def get_posi_angle_vec(position):
+ return [cal_angle(position, hid_j) for hid_j in range(d_hid)]
+
+ sinusoid_table = np.array([get_posi_angle_vec(pos_i)
+ for pos_i in range(n_position)])
+
+ sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
+ sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
+
+ if padding_idx is not None:
+ # zero vector for padding dimension
+ sinusoid_table[padding_idx] = 0.
+
+ return torch.FloatTensor(sinusoid_table)
+
+
+def clones(module, N):
+ return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
+
+
+class LengthRegulator(nn.Module):
+ """ Length Regulator """
+
+ def __init__(self):
+ super(LengthRegulator, self).__init__()
+ self.duration_predictor = DurationPredictor()
+
+ def LR(self, x, duration_predictor_output, alpha=1.0, mel_max_length=None):
+ output = list()
+
+ for batch, expand_target in zip(x, duration_predictor_output):
+ output.append(self.expand(batch, expand_target, alpha))
+
+ if mel_max_length:
+ output = utils.pad(output, mel_max_length)
+ else:
+ output = utils.pad(output)
+
+ return output
+
+ def expand(self, batch, predicted, alpha):
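+        # Repeat each phoneme-level encoder vector predicted[i] * alpha times
+        # so the expanded sequence length matches the number of mel frames.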
+ out = list()
+
+ for i, vec in enumerate(batch):
+ expand_size = predicted[i].item()
+ out.append(vec.expand(int(expand_size*alpha), -1))
+ out = torch.cat(out, 0)
+
+ return out
+
+ def rounding(self, num):
+ if num - int(num) >= 0.5:
+ return int(num) + 1
+ else:
+ return int(num)
+
+ def forward(self, x, alpha=1.0, target=None, mel_max_length=None):
+ duration_predictor_output = self.duration_predictor(x)
+
+ if self.training:
+ output = self.LR(x, target, mel_max_length=mel_max_length)
+ return output, duration_predictor_output
+ else:
+ for idx, ele in enumerate(duration_predictor_output[0]):
+ duration_predictor_output[0][idx] = self.rounding(ele)
+ output = self.LR(x, duration_predictor_output, alpha)
+ mel_pos = torch.stack(
+ [torch.Tensor([i+1 for i in range(output.size(1))])]).long().to(device)
+
+ return output, mel_pos
+
+
+class DurationPredictor(nn.Module):
+ """ Duration Predictor """
+
+ def __init__(self):
+ super(DurationPredictor, self).__init__()
+
+ self.input_size = hp.d_model
+ self.filter_size = hp.duration_predictor_filter_size
+ self.kernel = hp.duration_predictor_kernel_size
+ self.conv_output_size = hp.duration_predictor_filter_size
+ self.dropout = hp.dropout
+
+ self.conv_layer = nn.Sequential(OrderedDict([
+ ("conv1d_1", Conv(self.input_size,
+ self.filter_size,
+ kernel_size=self.kernel,
+ padding=1)),
+ ("layer_norm_1", nn.LayerNorm(self.filter_size)),
+ ("relu_1", nn.ReLU()),
+ ("dropout_1", nn.Dropout(self.dropout)),
+ ("conv1d_2", Conv(self.filter_size,
+ self.filter_size,
+ kernel_size=self.kernel,
+ padding=1)),
+ ("layer_norm_2", nn.LayerNorm(self.filter_size)),
+ ("relu_2", nn.ReLU()),
+ ("dropout_2", nn.Dropout(self.dropout))
+ ]))
+
+ self.linear_layer = Linear(self.conv_output_size, 1)
+ self.relu = nn.ReLU()
+
+ def forward(self, encoder_output):
+ out = self.conv_layer(encoder_output)
+ out = self.linear_layer(out)
+
+ out = self.relu(out)
+
+ out = out.squeeze()
+
+ if not self.training:
+ out = out.unsqueeze(0)
+
+ return out
+
+
+class Conv(nn.Module):
+ """
+ Convolution Module
+ """
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ dilation=1,
+ bias=True,
+ w_init='linear'):
+ """
+ :param in_channels: dimension of input
+ :param out_channels: dimension of output
+ :param kernel_size: size of kernel
+ :param stride: size of stride
+ :param padding: size of padding
+ :param dilation: dilation rate
+ :param bias: boolean. if True, bias is included.
+ :param w_init: str. weight inits with xavier initialization.
+ """
+ super(Conv, self).__init__()
+
+ self.conv = nn.Conv1d(in_channels,
+ out_channels,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ bias=bias)
+
+ nn.init.xavier_uniform_(
+ self.conv.weight, gain=nn.init.calculate_gain(w_init))
+
+ def forward(self, x):
+ x = x.contiguous().transpose(1, 2)
+ x = self.conv(x)
+ x = x.contiguous().transpose(1, 2)
+
+ return x
+
+
+class Linear(nn.Module):
+ """
+ Linear Module
+ """
+
+ def __init__(self, in_dim, out_dim, bias=True, w_init='linear'):
+ """
+ :param in_dim: dimension of input
+ :param out_dim: dimension of output
+ :param bias: boolean. if True, bias is included.
+ :param w_init: str. weight inits with xavier initialization.
+ """
+ super(Linear, self).__init__()
+ self.linear_layer = nn.Linear(in_dim, out_dim, bias=bias)
+
+ nn.init.xavier_uniform_(
+ self.linear_layer.weight,
+ gain=nn.init.calculate_gain(w_init))
+
+ def forward(self, x):
+ return self.linear_layer(x)
+
+
+class FFN(nn.Module):
+ """
+ Positionwise Feed-Forward Network
+ """
+
+ def __init__(self, num_hidden):
+ """
+ :param num_hidden: dimension of hidden
+ """
+ super(FFN, self).__init__()
+ self.w_1 = Conv(num_hidden, num_hidden * 4,
+ kernel_size=3, padding=1, w_init='relu')
+ self.w_2 = Conv(num_hidden * 4, num_hidden, kernel_size=3, padding=1)
+ self.dropout = nn.Dropout(p=0.1)
+ self.layer_norm = nn.LayerNorm(num_hidden)
+
+ def forward(self, input_):
+ # FFN Network
+ x = input_
+ x = self.w_2(torch.relu(self.w_1(x)))
+
+ # residual connection
+ x = x + input_
+
+ # dropout
+ x = self.dropout(x)
+
+ # layer normalization
+ x = self.layer_norm(x)
+
+ return x
+
+
+class MultiheadAttention(nn.Module):
+ """
+ Multihead attention mechanism (dot attention)
+ """
+
+ def __init__(self, num_hidden_k):
+ """
+ :param num_hidden_k: dimension of hidden
+ """
+ super(MultiheadAttention, self).__init__()
+
+ self.num_hidden_k = num_hidden_k
+ self.attn_dropout = nn.Dropout(p=0.1)
+
+ def forward(self, key, value, query, mask=None, query_mask=None):
+ # Get attention score
+ attn = torch.bmm(query, key.transpose(1, 2))
+ attn = attn / math.sqrt(self.num_hidden_k)
+
+ # Masking to ignore padding (key side)
+ if mask is not None:
+ attn = attn.masked_fill(mask, -2 ** 32 + 1)
+ attn = torch.softmax(attn, dim=-1)
+ else:
+ attn = torch.softmax(attn, dim=-1)
+
+ # Masking to ignore padding (query side)
+ if query_mask is not None:
+ attn = attn * query_mask
+
+ # Dropout
+ attn = self.attn_dropout(attn)
+
+ # Get Context Vector
+ result = torch.bmm(attn, value)
+
+ return result, attn
+
+
+class Attention(nn.Module):
+ """
+ Attention Network
+ """
+
+ def __init__(self, num_hidden, h=2):
+ """
+ :param num_hidden: dimension of hidden
+ :param h: num of heads
+ """
+ super(Attention, self).__init__()
+
+ self.num_hidden = num_hidden
+ self.num_hidden_per_attn = num_hidden // h
+ self.h = h
+
+ self.key = Linear(num_hidden, num_hidden, bias=False)
+ self.value = Linear(num_hidden, num_hidden, bias=False)
+ self.query = Linear(num_hidden, num_hidden, bias=False)
+
+ self.multihead = MultiheadAttention(self.num_hidden_per_attn)
+
+ self.residual_dropout = nn.Dropout(p=0.1)
+
+ self.final_linear = Linear(num_hidden * 2, num_hidden)
+
+ self.layer_norm_1 = nn.LayerNorm(num_hidden)
+
+ def forward(self, memory, decoder_input, mask=None, query_mask=None):
+
+ batch_size = memory.size(0)
+ seq_k = memory.size(1)
+ seq_q = decoder_input.size(1)
+
+ # Repeat masks h times
+ if query_mask is not None:
+ query_mask = query_mask.unsqueeze(-1).repeat(1, 1, seq_k)
+ query_mask = query_mask.repeat(self.h, 1, 1)
+ if mask is not None:
+ mask = mask.repeat(self.h, 1, 1)
+
+ # Make multihead
+ key = self.key(memory).view(batch_size,
+ seq_k,
+ self.h,
+ self.num_hidden_per_attn)
+ value = self.value(memory).view(batch_size,
+ seq_k,
+ self.h,
+ self.num_hidden_per_attn)
+ query = self.query(decoder_input).view(batch_size,
+ seq_q,
+ self.h,
+ self.num_hidden_per_attn)
+
+ key = key.permute(2, 0, 1, 3).contiguous().view(-1,
+ seq_k,
+ self.num_hidden_per_attn)
+ value = value.permute(2, 0, 1, 3).contiguous().view(-1,
+ seq_k,
+ self.num_hidden_per_attn)
+ query = query.permute(2, 0, 1, 3).contiguous().view(-1,
+ seq_q,
+ self.num_hidden_per_attn)
+
+ # Get context vector
+ result, attns = self.multihead(
+ key, value, query, mask=mask, query_mask=query_mask)
+
+ # Concatenate all multihead context vector
+ result = result.view(self.h, batch_size, seq_q,
+ self.num_hidden_per_attn)
+ result = result.permute(1, 2, 0, 3).contiguous().view(
+ batch_size, seq_q, -1)
+
+ # Concatenate context vector with input (most important)
+ result = torch.cat([decoder_input, result], dim=-1)
+
+ # Final linear
+ result = self.final_linear(result)
+
+ # Residual dropout & connection
+ result = self.residual_dropout(result)
+ result = result + decoder_input
+
+ # Layer normalization
+ result = self.layer_norm_1(result)
+
+ return result, attns
+
+
+class FFTBlock(torch.nn.Module):
+ """FFT Block"""
+
+ def __init__(self,
+ d_model,
+ n_head=hp.Head):
+ super(FFTBlock, self).__init__()
+ self.slf_attn = clones(Attention(d_model), hp.N)
+ self.pos_ffn = clones(FFN(d_model), hp.N)
+
+ self.pos_emb = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(1024,
+ d_model,
+ padding_idx=0), freeze=True)
+
+ def forward(self, x, pos, return_attns=False):
+ # Get character mask
+ if self.training:
+ c_mask = pos.ne(0).type(torch.float)
+ mask = pos.eq(0).unsqueeze(1).repeat(1, x.size(1), 1)
+ else:
+ c_mask, mask = None, None
+
+ # Get positional embedding, apply alpha and add
+ pos = self.pos_emb(pos)
+ x = x + pos
+
+ # Attention encoder-encoder
+ attns = list()
+ for slf_attn, ffn in zip(self.slf_attn, self.pos_ffn):
+ x, attn = slf_attn(x, x, mask=mask, query_mask=c_mask)
+ x = ffn(x)
+ attns.append(attn)
+
+ return x, attns
diff --git a/FastSpeech/optimizer.py b/FastSpeech/optimizer.py
new file mode 100644
index 0000000..3da2f4f
--- /dev/null
+++ b/FastSpeech/optimizer.py
@@ -0,0 +1,44 @@
+import numpy as np
+
+
+class ScheduledOptim():
+ ''' A simple wrapper class for learning rate scheduling '''
+
+ def __init__(self, optimizer, d_model, n_warmup_steps, current_steps):
+ self._optimizer = optimizer
+ self.n_warmup_steps = n_warmup_steps
+ self.n_current_steps = current_steps
+ self.init_lr = np.power(d_model, -0.5)
+
+ def step_and_update_lr_frozen(self, learning_rate_frozen):
+ for param_group in self._optimizer.param_groups:
+ param_group['lr'] = learning_rate_frozen
+ self._optimizer.step()
+
+ def step_and_update_lr(self):
+ self._update_learning_rate()
+ self._optimizer.step()
+
+ def get_learning_rate(self):
+ learning_rate = 0.0
+ for param_group in self._optimizer.param_groups:
+ learning_rate = param_group['lr']
+
+ return learning_rate
+
+ def zero_grad(self):
+ # print(self.init_lr)
+ self._optimizer.zero_grad()
+
+ def _get_lr_scale(self):
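+        # Transformer ("Noam") schedule: the scale grows linearly during the
+        # warmup steps and then decays proportionally to step^-0.5.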
+ return np.min([
+ np.power(self.n_current_steps, -0.5),
+ np.power(self.n_warmup_steps, -1.5) * self.n_current_steps])
+
+ def _update_learning_rate(self):
+ ''' Learning rate scheduling per step '''
+ self.n_current_steps += 1
+ lr = self.init_lr * self._get_lr_scale()
+
+ for param_group in self._optimizer.param_groups:
+ param_group['lr'] = lr
diff --git a/FastSpeech/preprocess.py b/FastSpeech/preprocess.py
new file mode 100644
index 0000000..b7dbfc3
--- /dev/null
+++ b/FastSpeech/preprocess.py
@@ -0,0 +1,61 @@
+import torch
+import numpy as np
+import shutil
+import os
+
+from utils import load_data, process_text, get_Tacotron2, get_WaveGlow
+from data import ljspeech
+import hparams as hp
+import waveglow
+import audio as Audio
+
+
+def preprocess_ljspeech(filename):
+ in_dir = filename
+ out_dir = hp.mel_ground_truth
+ if not os.path.exists(out_dir):
+ os.makedirs(out_dir, exist_ok=True)
+ metadata = ljspeech.build_from_path(in_dir, out_dir)
+ write_metadata(metadata, out_dir)
+
+ shutil.move(os.path.join(hp.mel_ground_truth, "train.txt"),
+ os.path.join("data", "train.txt"))
+
+
+def write_metadata(metadata, out_dir):
+ with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
+ for m in metadata:
+ f.write(m + '\n')
+
+
+def main():
+ path = os.path.join("data", "LJSpeech-1.1")
+ preprocess_ljspeech(path)
+
+ text_path = os.path.join("data", "train.txt")
+ texts = process_text(text_path)
+
+ if not os.path.exists(hp.alignment_path):
+ os.mkdir(hp.alignment_path)
+ else:
+ return
+
+ tacotron2 = get_Tacotron2()
+
+ num = 0
+ for ind, text in enumerate(texts[num:]):
+ print(ind)
+
+ character = text[0:len(text)-1]
+ mel_gt_name = os.path.join(
+ hp.mel_ground_truth, "ljspeech-mel-%05d.npy" % (ind+num+1))
+ mel_gt_target = np.load(mel_gt_name)
+ _, _, D = load_data(character, mel_gt_target, tacotron2)
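+        # D holds the per-character durations (in mel frames) derived from the
+        # Tacotron2 attention alignment; they are the duration-predictor targets.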
+
+ np.save(os.path.join(hp.alignment_path, str(
+ ind+num) + ".npy"), D, allow_pickle=False)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/FastSpeech/results/0.wav b/FastSpeech/results/0.wav
new file mode 100644
index 0000000..636b985
Binary files /dev/null and b/FastSpeech/results/0.wav differ
diff --git a/FastSpeech/results/1.wav b/FastSpeech/results/1.wav
new file mode 100644
index 0000000..28ea831
Binary files /dev/null and b/FastSpeech/results/1.wav differ
diff --git a/FastSpeech/results/2.wav b/FastSpeech/results/2.wav
new file mode 100644
index 0000000..902933b
Binary files /dev/null and b/FastSpeech/results/2.wav differ
diff --git a/FastSpeech/synthesis.py b/FastSpeech/synthesis.py
new file mode 100644
index 0000000..809fc1b
--- /dev/null
+++ b/FastSpeech/synthesis.py
@@ -0,0 +1,74 @@
+import torch
+import torch.nn as nn
+import matplotlib
+import matplotlib.pyplot as plt
+import numpy as np
+import time
+import os
+
+from fastspeech import FastSpeech
+from text import text_to_sequence
+import hparams as hp
+import utils
+import audio as Audio
+import glow
+import waveglow
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+
+def get_FastSpeech(num):
+ checkpoint_path = "checkpoint_" + str(num) + ".pth.tar"
+ model = nn.DataParallel(FastSpeech()).to(device)
+ model.load_state_dict(torch.load(os.path.join(
+ hp.checkpoint_path, checkpoint_path))['model'])
+ model.eval()
+
+ return model
+
+
+def synthesis(model, text, alpha=1.0):
+ text = np.array(text_to_sequence(text, hp.text_cleaners))
+ text = np.stack([text])
+
+ src_pos = np.array([i+1 for i in range(text.shape[1])])
+ src_pos = np.stack([src_pos])
+ with torch.no_grad():
+ sequence = torch.autograd.Variable(
+ torch.from_numpy(text)).cuda().long()
+ src_pos = torch.autograd.Variable(
+ torch.from_numpy(src_pos)).cuda().long()
+
+ mel, mel_postnet = model.module.forward(sequence, src_pos, alpha=alpha)
+
+ return mel[0].cpu().transpose(0, 1), \
+ mel_postnet[0].cpu().transpose(0, 1), \
+ mel.transpose(1, 2), \
+ mel_postnet.transpose(1, 2)
+
+
+if __name__ == "__main__":
+ # Test
+ num = 112000
+ alpha = 1.0
+ model = get_FastSpeech(num)
+ words = "Let’s go out to the airport. The plane landed ten minutes ago."
+
+ mel, mel_postnet, mel_torch, mel_postnet_torch = synthesis(
+ model, words, alpha=alpha)
+
+ if not os.path.exists("results"):
+ os.mkdir("results")
+ Audio.tools.inv_mel_spec(mel_postnet, os.path.join(
+ "results", words + "_" + str(num) + "_griffin_lim.wav"))
+
+ wave_glow = utils.get_WaveGlow()
+ waveglow.inference.inference(mel_postnet_torch, wave_glow, os.path.join(
+ "results", words + "_" + str(num) + "_waveglow.wav"))
+
+ tacotron2 = utils.get_Tacotron2()
+ mel_tac2, _, _ = utils.load_data_from_tacotron2(words, tacotron2)
+ waveglow.inference.inference(torch.stack([torch.from_numpy(
+ mel_tac2).cuda()]), wave_glow, os.path.join("results", "tacotron2.wav"))
+
+ utils.plot_data([mel.numpy(), mel_postnet.numpy(), mel_tac2])
diff --git a/FastSpeech/tacotron2/__init__.py b/FastSpeech/tacotron2/__init__.py
new file mode 100644
index 0000000..8cc67d7
--- /dev/null
+++ b/FastSpeech/tacotron2/__init__.py
@@ -0,0 +1,3 @@
+import tacotron2.hparams
+import tacotron2.model
+import tacotron2.layers
diff --git a/FastSpeech/tacotron2/hparams.py b/FastSpeech/tacotron2/hparams.py
new file mode 100644
index 0000000..f4f8fb5
--- /dev/null
+++ b/FastSpeech/tacotron2/hparams.py
@@ -0,0 +1,92 @@
+from text import symbols
+
+
+class Hparams:
+ """ hyper parameters """
+
+ def __init__(self):
+ ################################
+ # Experiment Parameters #
+ ################################
+ self.epochs = 500
+ self.iters_per_checkpoint = 1000
+ self.seed = 1234
+ self.dynamic_loss_scaling = True
+ self.fp16_run = False
+ self.distributed_run = False
+ self.dist_backend = "nccl"
+ self.dist_url = "tcp://localhost:54321"
+ self.cudnn_enabled = True
+ self.cudnn_benchmark = False
+ self.ignore_layers = ['embedding.weight']
+
+ ################################
+ # Data Parameters #
+ ################################
+ self.load_mel_from_disk = False
+ self.training_files = 'filelists/ljs_audio_text_train_filelist.txt'
+ self.validation_files = 'filelists/ljs_audio_text_val_filelist.txt'
+ self.text_cleaners = ['english_cleaners']
+
+ ################################
+ # Audio Parameters #
+ ################################
+ self.max_wav_value = 32768.0
+ self.sampling_rate = 22050
+ self.filter_length = 1024
+ self.hop_length = 256
+ self.win_length = 1024
+ self.n_mel_channels = 80
+ self.mel_fmin = 0.0
+ self.mel_fmax = 8000.0
+
+ ################################
+ # Model Parameters #
+ ################################
+ self.n_symbols = len(symbols)
+ self.symbols_embedding_dim = 512
+
+ # Encoder parameters
+ self.encoder_kernel_size = 5
+ self.encoder_n_convolutions = 3
+ self.encoder_embedding_dim = 512
+
+ # Decoder parameters
+ self.n_frames_per_step = 1 # currently only 1 is supported
+ self.decoder_rnn_dim = 1024
+ self.prenet_dim = 256
+ self.max_decoder_steps = 1000
+ self.gate_threshold = 0.5
+ self.p_attention_dropout = 0.1
+ self.p_decoder_dropout = 0.1
+
+ # Attention parameters
+ self.attention_rnn_dim = 1024
+ self.attention_dim = 128
+
+ # Location Layer parameters
+ self.attention_location_n_filters = 32
+ self.attention_location_kernel_size = 31
+
+ # Mel-post processing network parameters
+ self.postnet_embedding_dim = 512
+ self.postnet_kernel_size = 5
+ self.postnet_n_convolutions = 5
+
+ ################################
+ # Optimization Hyperparameters #
+ ################################
+ self.use_saved_learning_rate = False
+ self.learning_rate = 1e-3
+ self.weight_decay = 1e-6
+ self.grad_clip_thresh = 1.0
+ self.batch_size = 64
+ self.mask_padding = True # set model's padded outputs to padded values
+
+ def return_self(self):
+ return self
+
+
+def create_hparams():
+ hparams = Hparams()
+ return hparams.return_self()
diff --git a/FastSpeech/tacotron2/layers.py b/FastSpeech/tacotron2/layers.py
new file mode 100644
index 0000000..d8bf8f8
--- /dev/null
+++ b/FastSpeech/tacotron2/layers.py
@@ -0,0 +1,36 @@
+import torch
+from librosa.filters import mel as librosa_mel_fn
+
+
+class LinearNorm(torch.nn.Module):
+ def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
+ super(LinearNorm, self).__init__()
+ self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
+
+ torch.nn.init.xavier_uniform_(
+ self.linear_layer.weight,
+ gain=torch.nn.init.calculate_gain(w_init_gain))
+
+ def forward(self, x):
+ return self.linear_layer(x)
+
+
+class ConvNorm(torch.nn.Module):
+ def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
+ padding=None, dilation=1, bias=True, w_init_gain='linear'):
+ super(ConvNorm, self).__init__()
+ if padding is None:
+ assert(kernel_size % 2 == 1)
+ padding = int(dilation * (kernel_size - 1) / 2)
+
+ self.conv = torch.nn.Conv1d(in_channels, out_channels,
+ kernel_size=kernel_size, stride=stride,
+ padding=padding, dilation=dilation,
+ bias=bias)
+
+ torch.nn.init.xavier_uniform_(
+ self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
+
+ def forward(self, signal):
+ conv_signal = self.conv(signal)
+ return conv_signal
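+
+
+# Shape sketch (illustrative, not part of the original file; run from the project
+# root, e.g. `python -m tacotron2.layers`): LinearNorm acts on the last dimension,
+# ConvNorm on 1-d sequences laid out as (batch, channels, time).
+if __name__ == "__main__":
+    lin = LinearNorm(80, 256)
+    conv = ConvNorm(80, 512, kernel_size=5)
+    x = torch.randn(2, 100, 80)            # (batch, time, channels)
+    print(lin(x).shape)                    # torch.Size([2, 100, 256])
+    print(conv(x.transpose(1, 2)).shape)   # torch.Size([2, 512, 100])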
diff --git a/FastSpeech/tacotron2/model.py b/FastSpeech/tacotron2/model.py
new file mode 100644
index 0000000..3594c40
--- /dev/null
+++ b/FastSpeech/tacotron2/model.py
@@ -0,0 +1,533 @@
+from math import sqrt
+import torch
+from torch.autograd import Variable
+from torch import nn
+from torch.nn import functional as F
+from tacotron2.layers import ConvNorm, LinearNorm
+from tacotron2.utils import to_gpu, get_mask_from_lengths
+
+
+class LocationLayer(nn.Module):
+ def __init__(self, attention_n_filters, attention_kernel_size,
+ attention_dim):
+ super(LocationLayer, self).__init__()
+ padding = int((attention_kernel_size - 1) / 2)
+ self.location_conv = ConvNorm(2, attention_n_filters,
+ kernel_size=attention_kernel_size,
+ padding=padding, bias=False, stride=1,
+ dilation=1)
+ self.location_dense = LinearNorm(attention_n_filters, attention_dim,
+ bias=False, w_init_gain='tanh')
+
+ def forward(self, attention_weights_cat):
+ processed_attention = self.location_conv(attention_weights_cat)
+ processed_attention = processed_attention.transpose(1, 2)
+ processed_attention = self.location_dense(processed_attention)
+ return processed_attention
+
+
+class Attention(nn.Module):
+ def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
+ attention_location_n_filters, attention_location_kernel_size):
+ super(Attention, self).__init__()
+ self.query_layer = LinearNorm(attention_rnn_dim, attention_dim,
+ bias=False, w_init_gain='tanh')
+ self.memory_layer = LinearNorm(embedding_dim, attention_dim, bias=False,
+ w_init_gain='tanh')
+ self.v = LinearNorm(attention_dim, 1, bias=False)
+ self.location_layer = LocationLayer(attention_location_n_filters,
+ attention_location_kernel_size,
+ attention_dim)
+ self.score_mask_value = -float("inf")
+
+ def get_alignment_energies(self, query, processed_memory,
+ attention_weights_cat):
+ """
+ PARAMS
+ ------
+ query: decoder output (batch, n_mel_channels * n_frames_per_step)
+ processed_memory: processed encoder outputs (B, T_in, attention_dim)
+ attention_weights_cat: cumulative and prev. att weights (B, 2, max_time)
+
+ RETURNS
+ -------
+ alignment (batch, max_time)
+ """
+
+ processed_query = self.query_layer(query.unsqueeze(1))
+ processed_attention_weights = self.location_layer(
+ attention_weights_cat)
+ energies = self.v(torch.tanh(
+ processed_query + processed_attention_weights + processed_memory))
+
+ energies = energies.squeeze(-1)
+ return energies
+
+ def forward(self, attention_hidden_state, memory, processed_memory,
+ attention_weights_cat, mask):
+ """
+ PARAMS
+ ------
+ attention_hidden_state: attention rnn last output
+ memory: encoder outputs
+ processed_memory: processed encoder outputs
+            attention_weights_cat: previous and cumulative attention weights
+ mask: binary mask for padded data
+ """
+ alignment = self.get_alignment_energies(
+ attention_hidden_state, processed_memory, attention_weights_cat)
+
+ if mask is not None:
+ alignment.data.masked_fill_(mask, self.score_mask_value)
+
+ attention_weights = F.softmax(alignment, dim=1)
+ attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
+ attention_context = attention_context.squeeze(1)
+
+ return attention_context, attention_weights
+
+
+class Prenet(nn.Module):
+ def __init__(self, in_dim, sizes):
+ super(Prenet, self).__init__()
+ in_sizes = [in_dim] + sizes[:-1]
+ self.layers = nn.ModuleList(
+ [LinearNorm(in_size, out_size, bias=False)
+ for (in_size, out_size) in zip(in_sizes, sizes)])
+
+ def forward(self, x):
+ for linear in self.layers:
+ x = F.dropout(F.relu(linear(x)), p=0.5, training=True)
+ return x
+
+
+class Postnet(nn.Module):
+ """Postnet
+    - Five 1-d convolutions with 512 channels and kernel size 5
+ """
+
+ def __init__(self, hparams):
+ super(Postnet, self).__init__()
+ self.convolutions = nn.ModuleList()
+
+ self.convolutions.append(
+ nn.Sequential(
+ ConvNorm(hparams.n_mel_channels, hparams.postnet_embedding_dim,
+ kernel_size=hparams.postnet_kernel_size, stride=1,
+ padding=int((hparams.postnet_kernel_size - 1) / 2),
+ dilation=1, w_init_gain='tanh'),
+ nn.BatchNorm1d(hparams.postnet_embedding_dim))
+ )
+
+ for i in range(1, hparams.postnet_n_convolutions - 1):
+ self.convolutions.append(
+ nn.Sequential(
+ ConvNorm(hparams.postnet_embedding_dim,
+ hparams.postnet_embedding_dim,
+ kernel_size=hparams.postnet_kernel_size, stride=1,
+ padding=int(
+ (hparams.postnet_kernel_size - 1) / 2),
+ dilation=1, w_init_gain='tanh'),
+ nn.BatchNorm1d(hparams.postnet_embedding_dim))
+ )
+
+ self.convolutions.append(
+ nn.Sequential(
+ ConvNorm(hparams.postnet_embedding_dim, hparams.n_mel_channels,
+ kernel_size=hparams.postnet_kernel_size, stride=1,
+ padding=int((hparams.postnet_kernel_size - 1) / 2),
+ dilation=1, w_init_gain='linear'),
+ nn.BatchNorm1d(hparams.n_mel_channels))
+ )
+
+ def forward(self, x):
+ for i in range(len(self.convolutions) - 1):
+ x = F.dropout(torch.tanh(
+ self.convolutions[i](x)), 0.5, self.training)
+ x = F.dropout(self.convolutions[-1](x), 0.5, self.training)
+
+ return x
+
+
+class Encoder(nn.Module):
+ """Encoder module:
+ - Three 1-d convolution banks
+ - Bidirectional LSTM
+ """
+
+ def __init__(self, hparams):
+ super(Encoder, self).__init__()
+
+ convolutions = []
+ for _ in range(hparams.encoder_n_convolutions):
+ conv_layer = nn.Sequential(
+ ConvNorm(hparams.encoder_embedding_dim,
+ hparams.encoder_embedding_dim,
+ kernel_size=hparams.encoder_kernel_size, stride=1,
+ padding=int((hparams.encoder_kernel_size - 1) / 2),
+ dilation=1, w_init_gain='relu'),
+ nn.BatchNorm1d(hparams.encoder_embedding_dim))
+ convolutions.append(conv_layer)
+ self.convolutions = nn.ModuleList(convolutions)
+
+ self.lstm = nn.LSTM(hparams.encoder_embedding_dim,
+ int(hparams.encoder_embedding_dim / 2), 1,
+ batch_first=True, bidirectional=True)
+
+ def forward(self, x, input_lengths):
+ for conv in self.convolutions:
+ x = F.dropout(F.relu(conv(x)), 0.5, self.training)
+
+ x = x.transpose(1, 2)
+
+        # pytorch tensors are not reversible, hence the conversion
+ input_lengths = input_lengths.cpu().numpy()
+ x = nn.utils.rnn.pack_padded_sequence(
+ x, input_lengths, batch_first=True)
+
+ self.lstm.flatten_parameters()
+ outputs, _ = self.lstm(x)
+
+ outputs, _ = nn.utils.rnn.pad_packed_sequence(
+ outputs, batch_first=True)
+
+ return outputs
+
+ def inference(self, x):
+ for conv in self.convolutions:
+ x = F.dropout(F.relu(conv(x)), 0.5, self.training)
+
+ x = x.transpose(1, 2)
+
+ self.lstm.flatten_parameters()
+ outputs, _ = self.lstm(x)
+
+ return outputs
+
+
+class Decoder(nn.Module):
+ def __init__(self, hparams):
+ super(Decoder, self).__init__()
+ self.n_mel_channels = hparams.n_mel_channels
+ self.n_frames_per_step = hparams.n_frames_per_step
+ self.encoder_embedding_dim = hparams.encoder_embedding_dim
+ self.attention_rnn_dim = hparams.attention_rnn_dim
+ self.decoder_rnn_dim = hparams.decoder_rnn_dim
+ self.prenet_dim = hparams.prenet_dim
+ self.max_decoder_steps = hparams.max_decoder_steps
+ self.gate_threshold = hparams.gate_threshold
+ self.p_attention_dropout = hparams.p_attention_dropout
+ self.p_decoder_dropout = hparams.p_decoder_dropout
+
+ self.prenet = Prenet(
+ hparams.n_mel_channels * hparams.n_frames_per_step,
+ [hparams.prenet_dim, hparams.prenet_dim])
+
+ self.attention_rnn = nn.LSTMCell(
+ hparams.prenet_dim + hparams.encoder_embedding_dim,
+ hparams.attention_rnn_dim)
+
+ self.attention_layer = Attention(
+ hparams.attention_rnn_dim, hparams.encoder_embedding_dim,
+ hparams.attention_dim, hparams.attention_location_n_filters,
+ hparams.attention_location_kernel_size)
+
+ self.decoder_rnn = nn.LSTMCell(
+ hparams.attention_rnn_dim + hparams.encoder_embedding_dim,
+ hparams.decoder_rnn_dim, 1)
+
+ self.linear_projection = LinearNorm(
+ hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
+ hparams.n_mel_channels * hparams.n_frames_per_step)
+
+ self.gate_layer = LinearNorm(
+ hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, 1,
+ bias=True, w_init_gain='sigmoid')
+
+ def get_go_frame(self, memory):
+ """ Gets all zeros frames to use as first decoder input
+ PARAMS
+ ------
+        memory: encoder outputs
+
+ RETURNS
+ -------
+ decoder_input: all zeros frames
+ """
+ B = memory.size(0)
+ decoder_input = Variable(memory.data.new(
+ B, self.n_mel_channels * self.n_frames_per_step).zero_())
+ return decoder_input
+
+ def initialize_decoder_states(self, memory, mask):
+ """ Initializes attention rnn states, decoder rnn states, attention
+ weights, attention cumulative weights, attention context, stores memory
+ and stores processed memory
+ PARAMS
+ ------
+ memory: Encoder outputs
+ mask: Mask for padded data if training, expects None for inference
+ """
+ B = memory.size(0)
+ MAX_TIME = memory.size(1)
+
+ self.attention_hidden = Variable(memory.data.new(
+ B, self.attention_rnn_dim).zero_())
+ self.attention_cell = Variable(memory.data.new(
+ B, self.attention_rnn_dim).zero_())
+
+ self.decoder_hidden = Variable(memory.data.new(
+ B, self.decoder_rnn_dim).zero_())
+ self.decoder_cell = Variable(memory.data.new(
+ B, self.decoder_rnn_dim).zero_())
+
+ self.attention_weights = Variable(memory.data.new(
+ B, MAX_TIME).zero_())
+ self.attention_weights_cum = Variable(memory.data.new(
+ B, MAX_TIME).zero_())
+ self.attention_context = Variable(memory.data.new(
+ B, self.encoder_embedding_dim).zero_())
+
+ self.memory = memory
+ self.processed_memory = self.attention_layer.memory_layer(memory)
+ self.mask = mask
+
+ def parse_decoder_inputs(self, decoder_inputs):
+ """ Prepares decoder inputs, i.e. mel outputs
+ PARAMS
+ ------
+ decoder_inputs: inputs used for teacher-forced training, i.e. mel-specs
+
+ RETURNS
+ -------
+ inputs: processed decoder inputs
+
+ """
+ # (B, n_mel_channels, T_out) -> (B, T_out, n_mel_channels)
+ decoder_inputs = decoder_inputs.transpose(1, 2)
+ decoder_inputs = decoder_inputs.view(
+ decoder_inputs.size(0),
+ int(decoder_inputs.size(1)/self.n_frames_per_step), -1)
+ # (B, T_out, n_mel_channels) -> (T_out, B, n_mel_channels)
+ decoder_inputs = decoder_inputs.transpose(0, 1)
+ return decoder_inputs
+
+ def parse_decoder_outputs(self, mel_outputs, gate_outputs, alignments):
+ """ Prepares decoder outputs for output
+ PARAMS
+ ------
+ mel_outputs:
+ gate_outputs: gate output energies
+ alignments:
+
+ RETURNS
+ -------
+ mel_outputs:
+        gate_outputs: gate output energies
+ alignments:
+ """
+ # (T_out, B) -> (B, T_out)
+ alignments = torch.stack(alignments).transpose(0, 1)
+ # (T_out, B) -> (B, T_out)
+ gate_outputs = torch.stack(gate_outputs).transpose(0, 1)
+ gate_outputs = gate_outputs.contiguous()
+ # (T_out, B, n_mel_channels) -> (B, T_out, n_mel_channels)
+ mel_outputs = torch.stack(mel_outputs).transpose(0, 1).contiguous()
+ # decouple frames per step
+ mel_outputs = mel_outputs.view(
+ mel_outputs.size(0), -1, self.n_mel_channels)
+ # (B, T_out, n_mel_channels) -> (B, n_mel_channels, T_out)
+ mel_outputs = mel_outputs.transpose(1, 2)
+
+ return mel_outputs, gate_outputs, alignments
+
+ def decode(self, decoder_input):
+ """ Decoder step using stored states, attention and memory
+ PARAMS
+ ------
+ decoder_input: previous mel output
+
+ RETURNS
+ -------
+ mel_output:
+ gate_output: gate output energies
+ attention_weights:
+ """
+ cell_input = torch.cat((decoder_input, self.attention_context), -1)
+ self.attention_hidden, self.attention_cell = self.attention_rnn(
+ cell_input, (self.attention_hidden, self.attention_cell))
+ self.attention_hidden = F.dropout(
+ self.attention_hidden, self.p_attention_dropout, self.training)
+
+ attention_weights_cat = torch.cat(
+ (self.attention_weights.unsqueeze(1),
+ self.attention_weights_cum.unsqueeze(1)), dim=1)
+ self.attention_context, self.attention_weights = self.attention_layer(
+ self.attention_hidden, self.memory, self.processed_memory,
+ attention_weights_cat, self.mask)
+
+ self.attention_weights_cum += self.attention_weights
+ decoder_input = torch.cat(
+ (self.attention_hidden, self.attention_context), -1)
+ self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
+ decoder_input, (self.decoder_hidden, self.decoder_cell))
+ self.decoder_hidden = F.dropout(
+ self.decoder_hidden, self.p_decoder_dropout, self.training)
+
+ decoder_hidden_attention_context = torch.cat(
+ (self.decoder_hidden, self.attention_context), dim=1)
+ decoder_output = self.linear_projection(
+ decoder_hidden_attention_context)
+
+ gate_prediction = self.gate_layer(decoder_hidden_attention_context)
+ return decoder_output, gate_prediction, self.attention_weights
+
+ def forward(self, memory, decoder_inputs, memory_lengths):
+ """ Decoder forward pass for training
+ PARAMS
+ ------
+ memory: Encoder outputs
+ decoder_inputs: Decoder inputs for teacher forcing. i.e. mel-specs
+ memory_lengths: Encoder output lengths for attention masking.
+
+ RETURNS
+ -------
+ mel_outputs: mel outputs from the decoder
+ gate_outputs: gate outputs from the decoder
+ alignments: sequence of attention weights from the decoder
+ """
+
+ decoder_input = self.get_go_frame(memory).unsqueeze(0)
+ decoder_inputs = self.parse_decoder_inputs(decoder_inputs)
+ decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0)
+ decoder_inputs = self.prenet(decoder_inputs)
+
+ self.initialize_decoder_states(
+ memory, mask=~get_mask_from_lengths(memory_lengths))
+
+ mel_outputs, gate_outputs, alignments = [], [], []
+ while len(mel_outputs) < decoder_inputs.size(0) - 1:
+ decoder_input = decoder_inputs[len(mel_outputs)]
+ mel_output, gate_output, attention_weights = self.decode(
+ decoder_input)
+ mel_outputs += [mel_output.squeeze(1)]
+            gate_outputs += [gate_output.squeeze(1)]
+ alignments += [attention_weights]
+
+ mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
+ mel_outputs, gate_outputs, alignments)
+
+ return mel_outputs, gate_outputs, alignments
+
+ def inference(self, memory):
+ """ Decoder inference
+ PARAMS
+ ------
+ memory: Encoder outputs
+
+ RETURNS
+ -------
+ mel_outputs: mel outputs from the decoder
+ gate_outputs: gate outputs from the decoder
+ alignments: sequence of attention weights from the decoder
+ """
+ decoder_input = self.get_go_frame(memory)
+
+ self.initialize_decoder_states(memory, mask=None)
+
+ mel_outputs, gate_outputs, alignments = [], [], []
+ while True:
+ decoder_input = self.prenet(decoder_input)
+ mel_output, gate_output, alignment = self.decode(decoder_input)
+
+ mel_outputs += [mel_output.squeeze(1)]
+ gate_outputs += [gate_output]
+ alignments += [alignment]
+
+ if torch.sigmoid(gate_output.data) > self.gate_threshold:
+ break
+ elif len(mel_outputs) == self.max_decoder_steps:
+ # print("Warning! Reached max decoder steps")
+ break
+
+ decoder_input = mel_output
+
+ mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
+ mel_outputs, gate_outputs, alignments)
+
+ return mel_outputs, gate_outputs, alignments
+
+
+class Tacotron2(nn.Module):
+ def __init__(self, hparams):
+ super(Tacotron2, self).__init__()
+ self.mask_padding = hparams.mask_padding
+ self.fp16_run = hparams.fp16_run
+ self.n_mel_channels = hparams.n_mel_channels
+ self.n_frames_per_step = hparams.n_frames_per_step
+ self.embedding = nn.Embedding(
+ hparams.n_symbols, hparams.symbols_embedding_dim)
+ std = sqrt(2.0 / (hparams.n_symbols + hparams.symbols_embedding_dim))
+ val = sqrt(3.0) * std # uniform bounds for std
+ self.embedding.weight.data.uniform_(-val, val)
+ self.encoder = Encoder(hparams)
+ self.decoder = Decoder(hparams)
+ self.postnet = Postnet(hparams)
+
+ def parse_batch(self, batch):
+ text_padded, input_lengths, mel_padded, gate_padded, \
+ output_lengths = batch
+ text_padded = to_gpu(text_padded).long()
+ input_lengths = to_gpu(input_lengths).long()
+ max_len = torch.max(input_lengths.data).item()
+ mel_padded = to_gpu(mel_padded).float()
+ gate_padded = to_gpu(gate_padded).float()
+ output_lengths = to_gpu(output_lengths).long()
+
+ return (
+ (text_padded, input_lengths, mel_padded, max_len, output_lengths),
+ (mel_padded, gate_padded))
+
+ def parse_output(self, outputs, output_lengths=None):
+ if self.mask_padding and output_lengths is not None:
+ mask = ~get_mask_from_lengths(output_lengths)
+ mask = mask.expand(self.n_mel_channels, mask.size(0), mask.size(1))
+ mask = mask.permute(1, 0, 2)
+
+ outputs[0].data.masked_fill_(mask, 0.0)
+ outputs[1].data.masked_fill_(mask, 0.0)
+ outputs[2].data.masked_fill_(mask[:, 0, :], 1e3) # gate energies
+
+ return outputs
+
+ def forward(self, inputs):
+ text_inputs, text_lengths, mels, max_len, output_lengths = inputs
+ text_lengths, output_lengths = text_lengths.data, output_lengths.data
+
+ embedded_inputs = self.embedding(text_inputs).transpose(1, 2)
+
+ encoder_outputs = self.encoder(embedded_inputs, text_lengths)
+
+ mel_outputs, gate_outputs, alignments = self.decoder(
+ encoder_outputs, mels, memory_lengths=text_lengths)
+
+ mel_outputs_postnet = self.postnet(mel_outputs)
+ mel_outputs_postnet = mel_outputs + mel_outputs_postnet
+
+ return self.parse_output(
+ [mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
+ output_lengths), encoder_outputs
+
+ def inference(self, inputs):
+ embedded_inputs = self.embedding(inputs).transpose(1, 2)
+ encoder_outputs = self.encoder.inference(embedded_inputs)
+ mel_outputs, gate_outputs, alignments = self.decoder.inference(
+ encoder_outputs)
+
+ mel_outputs_postnet = self.postnet(mel_outputs)
+ mel_outputs_postnet = mel_outputs + mel_outputs_postnet
+
+ outputs = self.parse_output(
+ [mel_outputs, mel_outputs_postnet, gate_outputs, alignments])
+
+ return outputs, encoder_outputs
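+
+
+# Inference sketch (illustrative, not part of the original file; run from the project
+# root, e.g. `python -m tacotron2.model`). It assumes a CUDA device and, for useful
+# output, a pretrained checkpoint such as "tacotron2_statedict.pt"; adapt the path.
+if __name__ == "__main__":
+    from tacotron2.hparams import create_hparams
+    from text import text_to_sequence
+
+    hparams = create_hparams()
+    tacotron2 = Tacotron2(hparams).cuda().eval()
+    # tacotron2.load_state_dict(torch.load("tacotron2_statedict.pt")['state_dict'])
+    sequence = torch.LongTensor(
+        text_to_sequence("Hello world.", hparams.text_cleaners)).unsqueeze(0).cuda()
+    with torch.no_grad():
+        (mel, mel_postnet, gates, alignments), encoder_outputs = tacotron2.inference(sequence)
+    print(mel_postnet.shape)  # (1, n_mel_channels, T_out)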
diff --git a/FastSpeech/tacotron2/utils.py b/FastSpeech/tacotron2/utils.py
new file mode 100644
index 0000000..c843d95
--- /dev/null
+++ b/FastSpeech/tacotron2/utils.py
@@ -0,0 +1,29 @@
+import numpy as np
+from scipy.io.wavfile import read
+import torch
+
+
+def get_mask_from_lengths(lengths):
+ max_len = torch.max(lengths).item()
+ ids = torch.arange(0, max_len, out=torch.cuda.LongTensor(max_len))
+ mask = (ids < lengths.unsqueeze(1)).byte()
+ return mask
+
+
+def load_wav_to_torch(full_path):
+ sampling_rate, data = read(full_path)
+ return torch.FloatTensor(data.astype(np.float32)), sampling_rate
+
+
+def load_filepaths_and_text(filename, split="|"):
+ with open(filename, encoding='utf-8') as f:
+ filepaths_and_text = [line.strip().split(split) for line in f]
+ return filepaths_and_text
+
+
+def to_gpu(x):
+ x = x.contiguous()
+
+ if torch.cuda.is_available():
+ x = x.cuda(non_blocking=True)
+ return torch.autograd.Variable(x)
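+
+
+# Usage sketch (illustrative, not part of the original file; assumes a CUDA device,
+# run from the project root, e.g. `python -m tacotron2.utils`): the mask is 1 inside
+# each sequence and 0 over the padding.
+if __name__ == "__main__":
+    lengths = torch.cuda.LongTensor([3, 5, 2])
+    mask = get_mask_from_lengths(lengths)
+    print(mask.shape)  # torch.Size([3, 5])
+    print(mask)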
diff --git a/FastSpeech/text/__init__.py b/FastSpeech/text/__init__.py
new file mode 100644
index 0000000..eb428df
--- /dev/null
+++ b/FastSpeech/text/__init__.py
@@ -0,0 +1,75 @@
+""" from https://github.com/keithito/tacotron """
+import re
+from text import cleaners
+from text.symbols import symbols
+
+
+# Mappings from symbol to numeric ID and vice versa:
+_symbol_to_id = {s: i for i, s in enumerate(symbols)}
+_id_to_symbol = {i: s for i, s in enumerate(symbols)}
+
+# Regular expression matching text enclosed in curly braces:
+_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
+
+
+def text_to_sequence(text, cleaner_names):
+ '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+
+ The text can optionally have ARPAbet sequences enclosed in curly braces embedded
+ in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
+
+ Args:
+ text: string to convert to a sequence
+ cleaner_names: names of the cleaner functions to run the text through
+
+ Returns:
+ List of integers corresponding to the symbols in the text
+ '''
+ sequence = []
+
+ # Check for curly braces and treat their contents as ARPAbet:
+ while len(text):
+ m = _curly_re.match(text)
+ if not m:
+ sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
+ break
+ sequence += _symbols_to_sequence(
+ _clean_text(m.group(1), cleaner_names))
+ sequence += _arpabet_to_sequence(m.group(2))
+ text = m.group(3)
+
+ return sequence
+
+
+def sequence_to_text(sequence):
+ '''Converts a sequence of IDs back to a string'''
+ result = ''
+ for symbol_id in sequence:
+ if symbol_id in _id_to_symbol:
+ s = _id_to_symbol[symbol_id]
+ # Enclose ARPAbet back in curly braces:
+ if len(s) > 1 and s[0] == '@':
+ s = '{%s}' % s[1:]
+ result += s
+ return result.replace('}{', ' ')
+
+
+def _clean_text(text, cleaner_names):
+ for name in cleaner_names:
+ cleaner = getattr(cleaners, name)
+ if not cleaner:
+ raise Exception('Unknown cleaner: %s' % name)
+ text = cleaner(text)
+ return text
+
+
+def _symbols_to_sequence(symbols):
+ return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
+
+
+def _arpabet_to_sequence(text):
+ return _symbols_to_sequence(['@' + s for s in text.split()])
+
+
+def _should_keep_symbol(s):
+    return s in _symbol_to_id and s != '_' and s != '~'
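+
+
+# Round-trip sketch (illustrative, not part of the original file); from the project
+# root:
+#
+#     >>> from text import text_to_sequence, sequence_to_text
+#     >>> seq = text_to_sequence("Turn left on {HH AW1 S S T AH0 N} Street.",
+#     ...                        ['english_cleaners'])
+#     >>> sequence_to_text(seq)
+#     'turn left on {HH AW1 S S T AH0 N} street.'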
diff --git a/FastSpeech/text/cleaners.py b/FastSpeech/text/cleaners.py
new file mode 100644
index 0000000..7bd4d8d
--- /dev/null
+++ b/FastSpeech/text/cleaners.py
@@ -0,0 +1,89 @@
+""" from https://github.com/keithito/tacotron """
+
+'''
+Cleaners are transformations that run over the input text at both training and eval time.
+
+Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
+hyperparameter. Some cleaners are English-specific. You'll typically want to use:
+ 1. "english_cleaners" for English text
+ 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
+ the Unidecode library (https://pypi.python.org/pypi/Unidecode)
+ 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
+ the symbols in symbols.py to match your data).
+'''
+
+
+import re
+from unidecode import unidecode
+from .numbers import normalize_numbers
+
+# Regular expression matching whitespace:
+_whitespace_re = re.compile(r'\s+')
+
+# List of (regular expression, replacement) pairs for abbreviations:
+_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
+ ('mrs', 'misess'),
+ ('mr', 'mister'),
+ ('dr', 'doctor'),
+ ('st', 'saint'),
+ ('co', 'company'),
+ ('jr', 'junior'),
+ ('maj', 'major'),
+ ('gen', 'general'),
+ ('drs', 'doctors'),
+ ('rev', 'reverend'),
+ ('lt', 'lieutenant'),
+ ('hon', 'honorable'),
+ ('sgt', 'sergeant'),
+ ('capt', 'captain'),
+ ('esq', 'esquire'),
+ ('ltd', 'limited'),
+ ('col', 'colonel'),
+ ('ft', 'fort'),
+]]
+
+
+def expand_abbreviations(text):
+ for regex, replacement in _abbreviations:
+ text = re.sub(regex, replacement, text)
+ return text
+
+
+def expand_numbers(text):
+ return normalize_numbers(text)
+
+
+def lowercase(text):
+ return text.lower()
+
+
+def collapse_whitespace(text):
+ return re.sub(_whitespace_re, ' ', text)
+
+
+def convert_to_ascii(text):
+ return unidecode(text)
+
+
+def basic_cleaners(text):
+ '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
+ text = lowercase(text)
+ text = collapse_whitespace(text)
+ return text
+
+
+def transliteration_cleaners(text):
+ '''Pipeline for non-English text that transliterates to ASCII.'''
+ text = convert_to_ascii(text)
+ text = lowercase(text)
+ text = collapse_whitespace(text)
+ return text
+
+
+def english_cleaners(text):
+ '''Pipeline for English text, including number and abbreviation expansion.'''
+ text = convert_to_ascii(text)
+ text = lowercase(text)
+ text = expand_numbers(text)
+ text = expand_abbreviations(text)
+ text = collapse_whitespace(text)
+ return text
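+
+
+# Example (illustrative, not part of the original file; run from the project root,
+# e.g. `python -m text.cleaners`, with the unidecode and inflect packages installed).
+if __name__ == "__main__":
+    print(english_cleaners("Dr. Smith bought 2 books for $3.50."))
+    # -> "doctor smith bought two books for three dollars, fifty cents."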
diff --git a/FastSpeech/text/cmudict.py b/FastSpeech/text/cmudict.py
new file mode 100644
index 0000000..b45a53a
--- /dev/null
+++ b/FastSpeech/text/cmudict.py
@@ -0,0 +1,64 @@
+""" from https://github.com/keithito/tacotron """
+
+import re
+
+
+valid_symbols = [
+ 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
+ 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
+ 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
+ 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
+ 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
+ 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
+ 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'
+]
+
+_valid_symbol_set = set(valid_symbols)
+
+
+class CMUDict:
+ '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''
+
+ def __init__(self, file_or_path, keep_ambiguous=True):
+ if isinstance(file_or_path, str):
+ with open(file_or_path, encoding='latin-1') as f:
+ entries = _parse_cmudict(f)
+ else:
+ entries = _parse_cmudict(file_or_path)
+ if not keep_ambiguous:
+ entries = {word: pron for word,
+ pron in entries.items() if len(pron) == 1}
+ self._entries = entries
+
+ def __len__(self):
+ return len(self._entries)
+
+ def lookup(self, word):
+ '''Returns list of ARPAbet pronunciations of the given word.'''
+ return self._entries.get(word.upper())
+
+
+_alt_re = re.compile(r'\([0-9]+\)')
+
+
+def _parse_cmudict(file):
+ cmudict = {}
+ for line in file:
+ if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"):
+            parts = line.split('  ')  # word and pronunciation are separated by two spaces
+ word = re.sub(_alt_re, '', parts[0])
+ pronunciation = _get_pronunciation(parts[1])
+ if pronunciation:
+ if word in cmudict:
+ cmudict[word].append(pronunciation)
+ else:
+ cmudict[word] = [pronunciation]
+ return cmudict
+
+
+def _get_pronunciation(s):
+ parts = s.strip().split(' ')
+ for part in parts:
+ if part not in _valid_symbol_set:
+ return None
+ return ' '.join(parts)
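+
+
+# Usage sketch (illustrative, not part of the original file): entries follow the CMU
+# dictionary format, i.e. WORD and pronunciation separated by two spaces.
+if __name__ == "__main__":
+    import io
+    sample = io.StringIO("HELLO  HH AH0 L OW1\nWORLD  W ER1 L D\n")
+    cmu = CMUDict(sample)
+    print(len(cmu))             # 2
+    print(cmu.lookup("hello"))  # ['HH AH0 L OW1']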
diff --git a/FastSpeech/text/numbers.py b/FastSpeech/text/numbers.py
new file mode 100644
index 0000000..f3e4966
--- /dev/null
+++ b/FastSpeech/text/numbers.py
@@ -0,0 +1,71 @@
+""" from https://github.com/keithito/tacotron """
+
+import inflect
+import re
+
+
+_inflect = inflect.engine()
+_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
+_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
+_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
+_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
+_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
+_number_re = re.compile(r'[0-9]+')
+
+
+def _remove_commas(m):
+ return m.group(1).replace(',', '')
+
+
+def _expand_decimal_point(m):
+ return m.group(1).replace('.', ' point ')
+
+
+def _expand_dollars(m):
+ match = m.group(1)
+ parts = match.split('.')
+ if len(parts) > 2:
+ return match + ' dollars' # Unexpected format
+ dollars = int(parts[0]) if parts[0] else 0
+ cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
+ if dollars and cents:
+ dollar_unit = 'dollar' if dollars == 1 else 'dollars'
+ cent_unit = 'cent' if cents == 1 else 'cents'
+ return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
+ elif dollars:
+ dollar_unit = 'dollar' if dollars == 1 else 'dollars'
+ return '%s %s' % (dollars, dollar_unit)
+ elif cents:
+ cent_unit = 'cent' if cents == 1 else 'cents'
+ return '%s %s' % (cents, cent_unit)
+ else:
+ return 'zero dollars'
+
+
+def _expand_ordinal(m):
+ return _inflect.number_to_words(m.group(0))
+
+
+def _expand_number(m):
+ num = int(m.group(0))
+ if num > 1000 and num < 3000:
+ if num == 2000:
+ return 'two thousand'
+ elif num > 2000 and num < 2010:
+ return 'two thousand ' + _inflect.number_to_words(num % 100)
+ elif num % 100 == 0:
+ return _inflect.number_to_words(num // 100) + ' hundred'
+ else:
+ return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
+ else:
+ return _inflect.number_to_words(num, andword='')
+
+
+def normalize_numbers(text):
+ text = re.sub(_comma_number_re, _remove_commas, text)
+ text = re.sub(_pounds_re, r'\1 pounds', text)
+ text = re.sub(_dollars_re, _expand_dollars, text)
+ text = re.sub(_decimal_number_re, _expand_decimal_point, text)
+ text = re.sub(_ordinal_re, _expand_ordinal, text)
+ text = re.sub(_number_re, _expand_number, text)
+ return text
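+
+
+# Example (illustrative, not part of the original file; requires the inflect package):
+# currency, ordinals and plain numbers are spelled out before the text reaches the
+# symbol table.
+if __name__ == "__main__":
+    print(normalize_numbers("On March 21st I paid $15 for 2 tickets."))
+    # -> "On March twenty-first I paid fifteen dollars for two tickets."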
diff --git a/FastSpeech/text/symbols.py b/FastSpeech/text/symbols.py
new file mode 100644
index 0000000..0a7d6d6
--- /dev/null
+++ b/FastSpeech/text/symbols.py
@@ -0,0 +1,19 @@
+""" from https://github.com/keithito/tacotron """
+
+'''
+Defines the set of symbols used in text input to the model.
+
+The default is a set of ASCII characters that works well for English or for text that has been run through Unidecode. For other data, you can modify _letters below.
+'''
+
+from text import cmudict
+_pad = '_'
+_punctuation = '!\'(),.:;? '
+_special = '-'
+_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
+
+# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
+_arpabet = ['@' + s for s in cmudict.valid_symbols]
+
+# Export all symbols:
+symbols = [_pad] + list(_special) + list(_punctuation) + \
+ list(_letters) + _arpabet
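+
+
+# Quick check (illustrative, not part of the original file; run from the project root,
+# e.g. `python -m text.symbols`): the table starts with the pad symbol '_', then '-',
+# punctuation and letters, and ends with the '@'-prefixed ARPAbet symbols.
+if __name__ == "__main__":
+    print(len(symbols))
+    print(symbols[:8], symbols[-3:])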
diff --git a/FastSpeech/train.py b/FastSpeech/train.py
new file mode 100644
index 0000000..d4307c5
--- /dev/null
+++ b/FastSpeech/train.py
@@ -0,0 +1,194 @@
+import torch
+import torch.nn as nn
+
+from multiprocessing import cpu_count
+import numpy as np
+import argparse
+import os
+import time
+import math
+
+from fastspeech import FastSpeech
+from loss import FastSpeechLoss
+from dataset import FastSpeechDataset, collate_fn, DataLoader
+from optimizer import ScheduledOptim
+import hparams as hp
+import utils
+
+
+def main(args):
+ # Get device
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+ # Define model
+ model = nn.DataParallel(FastSpeech()).to(device)
+ print("Model Has Been Defined")
+ num_param = utils.get_param_num(model)
+ print('Number of FastSpeech Parameters:', num_param)
+
+ # Get dataset
+ dataset = FastSpeechDataset()
+
+ # Optimizer and loss
+ optimizer = torch.optim.Adam(
+ model.parameters(), betas=(0.9, 0.98), eps=1e-9)
+ scheduled_optim = ScheduledOptim(optimizer,
+ hp.d_model,
+ hp.n_warm_up_step,
+ args.restore_step)
+ fastspeech_loss = FastSpeechLoss().to(device)
+ print("Defined Optimizer and Loss Function.")
+
+ # Load checkpoint if exists
+ try:
+ checkpoint = torch.load(os.path.join(
+ hp.checkpoint_path, 'checkpoint_%d.pth.tar' % args.restore_step))
+ model.load_state_dict(checkpoint['model'])
+ optimizer.load_state_dict(checkpoint['optimizer'])
+ print("\n---Model Restored at Step %d---\n" % args.restore_step)
+    except Exception:
+ print("\n---Start New Training---\n")
+ if not os.path.exists(hp.checkpoint_path):
+ os.mkdir(hp.checkpoint_path)
+
+ # Init logger
+ if not os.path.exists(hp.logger_path):
+ os.mkdir(hp.logger_path)
+
+ # Define Some Information
+ Time = np.array([])
+    Start = time.perf_counter()
+
+ # Training
+ model = model.train()
+
+ for epoch in range(hp.epochs):
+ # Get Training Loader
+ training_loader = DataLoader(dataset,
+ batch_size=hp.batch_size**2,
+ shuffle=True,
+ collate_fn=collate_fn,
+ drop_last=True,
+ num_workers=0)
+ total_step = hp.epochs * len(training_loader) * hp.batch_size
+
+ for i, batchs in enumerate(training_loader):
+ for j, data_of_batch in enumerate(batchs):
+                start_time = time.perf_counter()
+
+ current_step = i * hp.batch_size + j + args.restore_step + \
+ epoch * len(training_loader)*hp.batch_size + 1
+
+ # Init
+ scheduled_optim.zero_grad()
+
+ # Get Data
+ character = torch.from_numpy(
+ data_of_batch["text"]).long().to(device)
+ mel_target = torch.from_numpy(
+ data_of_batch["mel_target"]).float().to(device)
+ D = torch.from_numpy(data_of_batch["D"]).int().to(device)
+ mel_pos = torch.from_numpy(
+ data_of_batch["mel_pos"]).long().to(device)
+ src_pos = torch.from_numpy(
+ data_of_batch["src_pos"]).long().to(device)
+ max_mel_len = data_of_batch["mel_max_len"]
+
+ # Forward
+ mel_output, mel_postnet_output, duration_predictor_output = model(character,
+ src_pos,
+ mel_pos=mel_pos,
+ mel_max_length=max_mel_len,
+ length_target=D)
+
+ # print(mel_target.size())
+ # print(mel_output.size())
+
+ # Cal Loss
+ mel_loss, mel_postnet_loss, duration_loss = fastspeech_loss(mel_output,
+ mel_postnet_output,
+ duration_predictor_output,
+ mel_target,
+ D)
+ total_loss = mel_loss + mel_postnet_loss + duration_loss
+
+ # Logger
+ t_l = total_loss.item()
+ m_l = mel_loss.item()
+ m_p_l = mel_postnet_loss.item()
+ d_l = duration_loss.item()
+
+ with open(os.path.join("logger", "total_loss.txt"), "a") as f_total_loss:
+ f_total_loss.write(str(t_l)+"\n")
+
+ with open(os.path.join("logger", "mel_loss.txt"), "a") as f_mel_loss:
+ f_mel_loss.write(str(m_l)+"\n")
+
+ with open(os.path.join("logger", "mel_postnet_loss.txt"), "a") as f_mel_postnet_loss:
+ f_mel_postnet_loss.write(str(m_p_l)+"\n")
+
+ with open(os.path.join("logger", "duration_loss.txt"), "a") as f_d_loss:
+ f_d_loss.write(str(d_l)+"\n")
+
+ # Backward
+ total_loss.backward()
+
+ # Clipping gradients to avoid gradient explosion
+ nn.utils.clip_grad_norm_(
+ model.parameters(), hp.grad_clip_thresh)
+
+ # Update weights
+ if args.frozen_learning_rate:
+ scheduled_optim.step_and_update_lr_frozen(
+ args.learning_rate_frozen)
+ else:
+ scheduled_optim.step_and_update_lr()
+
+ # Print
+ if current_step % hp.log_step == 0:
+                    Now = time.perf_counter()
+
+ str1 = "Epoch [{}/{}], Step [{}/{}]:".format(
+ epoch+1, hp.epochs, current_step, total_step)
+ str2 = "Mel Loss: {:.4f}, Mel PostNet Loss: {:.4f}, Duration Loss: {:.4f};".format(
+ m_l, m_p_l, d_l)
+ str3 = "Current Learning Rate is {:.6f}.".format(
+ scheduled_optim.get_learning_rate())
+ str4 = "Time Used: {:.3f}s, Estimated Time Remaining: {:.3f}s.".format(
+ (Now-Start), (total_step-current_step)*np.mean(Time))
+
+ print("\n" + str1)
+ print(str2)
+ print(str3)
+ print(str4)
+
+ with open(os.path.join("logger", "logger.txt"), "a") as f_logger:
+ f_logger.write(str1 + "\n")
+ f_logger.write(str2 + "\n")
+ f_logger.write(str3 + "\n")
+ f_logger.write(str4 + "\n")
+ f_logger.write("\n")
+
+ if current_step % hp.save_step == 0:
+ torch.save({'model': model.state_dict(), 'optimizer': optimizer.state_dict(
+ )}, os.path.join(hp.checkpoint_path, 'checkpoint_%d.pth.tar' % current_step))
+ print("save model at step %d ..." % current_step)
+
+                end_time = time.perf_counter()
+ Time = np.append(Time, end_time - start_time)
+ if len(Time) == hp.clear_Time:
+ temp_value = np.mean(Time)
+ Time = np.delete(
+ Time, [i for i in range(len(Time))], axis=None)
+ Time = np.append(Time, temp_value)
+
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--restore_step', type=int, default=0)
+    parser.add_argument('--frozen_learning_rate', action='store_true')
+ parser.add_argument("--learning_rate_frozen", type=float, default=1e-3)
+ args = parser.parse_args()
+
+ main(args)
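+
+    # Example invocations (illustrative): start a fresh run, or resume from a saved
+    # checkpoint with the scheduled or a frozen learning rate.
+    #   python train.py
+    #   python train.py --restore_step 20000
+    #   python train.py --restore_step 20000 --frozen_learning_rate --learning_rate_frozen 1e-3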
diff --git a/FastSpeech/transformer/Beam.py b/FastSpeech/transformer/Beam.py
new file mode 100644
index 0000000..ddad45d
--- /dev/null
+++ b/FastSpeech/transformer/Beam.py
@@ -0,0 +1,100 @@
+import torch
+import numpy as np
+import transformer.Constants as Constants
+
+
+class Beam():
+ ''' Beam search '''
+
+ def __init__(self, size, device=False):
+
+ self.size = size
+ self._done = False
+
+ # The score for each translation on the beam.
+ self.scores = torch.zeros((size,), dtype=torch.float, device=device)
+ self.all_scores = []
+
+ # The backpointers at each time-step.
+ self.prev_ks = []
+
+ # The outputs at each time-step.
+ self.next_ys = [torch.full(
+ (size,), Constants.PAD, dtype=torch.long, device=device)]
+ self.next_ys[0][0] = Constants.BOS
+
+ def get_current_state(self):
+ "Get the outputs for the current timestep."
+ return self.get_tentative_hypothesis()
+
+ def get_current_origin(self):
+ "Get the backpointers for the current timestep."
+ return self.prev_ks[-1]
+
+ @property
+ def done(self):
+ return self._done
+
+ def advance(self, word_prob):
+ "Update beam status and check if finished or not."
+ num_words = word_prob.size(1)
+
+ # Sum the previous scores.
+ if len(self.prev_ks) > 0:
+ beam_lk = word_prob + self.scores.unsqueeze(1).expand_as(word_prob)
+ else:
+ beam_lk = word_prob[0]
+
+ flat_beam_lk = beam_lk.view(-1)
+
+        best_scores, best_scores_id = flat_beam_lk.topk(
+            self.size, 0, True, True)
+
+ self.all_scores.append(self.scores)
+ self.scores = best_scores
+
+ # bestScoresId is flattened as a (beam x word) array,
+ # so we need to calculate which word and beam each score came from
+        prev_k = best_scores_id // num_words
+ self.prev_ks.append(prev_k)
+ self.next_ys.append(best_scores_id - prev_k * num_words)
+
+ # End condition is when top-of-beam is EOS.
+ if self.next_ys[-1][0].item() == Constants.EOS:
+ self._done = True
+ self.all_scores.append(self.scores)
+
+ return self._done
+
+ def sort_scores(self):
+ "Sort the scores."
+ return torch.sort(self.scores, 0, True)
+
+ def get_the_best_score_and_idx(self):
+ "Get the score of the best in the beam."
+ scores, ids = self.sort_scores()
+ return scores[1], ids[1]
+
+ def get_tentative_hypothesis(self):
+ "Get the decoded sequence for the current timestep."
+
+ if len(self.next_ys) == 1:
+ dec_seq = self.next_ys[0].unsqueeze(1)
+ else:
+ _, keys = self.sort_scores()
+ hyps = [self.get_hypothesis(k) for k in keys]
+ hyps = [[Constants.BOS] + h for h in hyps]
+ dec_seq = torch.LongTensor(hyps)
+
+ return dec_seq
+
+ def get_hypothesis(self, k):
+ """ Walk back to construct the full hypothesis. """
+ hyp = []
+ for j in range(len(self.prev_ks) - 1, -1, -1):
+ hyp.append(self.next_ys[j+1][k])
+ k = self.prev_ks[j][k]
+
+ return list(map(lambda x: x.item(), hyp[::-1]))
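+
+
+# Toy walkthrough (illustrative, not part of the original file; run from the project
+# root, e.g. `python -m transformer.Beam`): feed per-step log probabilities of shape
+# (beam_size, vocab_size) and read back the partial hypotheses.
+if __name__ == "__main__":
+    beam = Beam(size=3, device='cpu')
+    for _ in range(2):
+        word_prob = torch.log_softmax(torch.randn(3, 10), dim=-1)
+        if beam.advance(word_prob):
+            break
+    print(beam.get_current_state())  # (beam_size, current_length) token IDs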
diff --git a/FastSpeech/transformer/Constants.py b/FastSpeech/transformer/Constants.py
new file mode 100644
index 0000000..e524491
--- /dev/null
+++ b/FastSpeech/transformer/Constants.py
@@ -0,0 +1,9 @@
+PAD = 0
+UNK = 1
+BOS = 2
+EOS = 3
+
+PAD_WORD = '<blank>'
+UNK_WORD = '<unk>'
+BOS_WORD = '<s>'
+EOS_WORD = '</s>'
diff --git a/FastSpeech/transformer/Layers.py b/FastSpeech/transformer/Layers.py
new file mode 100644
index 0000000..a2db196
--- /dev/null
+++ b/FastSpeech/transformer/Layers.py
@@ -0,0 +1,230 @@
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+import numpy as np
+from collections import OrderedDict
+
+from transformer.SubLayers import MultiHeadAttention, PositionwiseFeedForward
+from text.symbols import symbols
+
+
+class Linear(nn.Module):
+ """
+ Linear Module
+ """
+
+ def __init__(self, in_dim, out_dim, bias=True, w_init='linear'):
+ """
+ :param in_dim: dimension of input
+ :param out_dim: dimension of output
+ :param bias: boolean. if True, bias is included.
+ :param w_init: str. weight inits with xavier initialization.
+ """
+ super(Linear, self).__init__()
+ self.linear_layer = nn.Linear(in_dim, out_dim, bias=bias)
+
+ nn.init.xavier_uniform_(
+ self.linear_layer.weight,
+ gain=nn.init.calculate_gain(w_init))
+
+ def forward(self, x):
+ return self.linear_layer(x)
+
+
+class PreNet(nn.Module):
+ """
+ Pre Net before passing through the network
+ """
+
+ def __init__(self, input_size, hidden_size, output_size, p=0.5):
+ """
+ :param input_size: dimension of input
+ :param hidden_size: dimension of hidden unit
+ :param output_size: dimension of output
+ """
+ super(PreNet, self).__init__()
+ self.input_size = input_size
+ self.output_size = output_size
+ self.hidden_size = hidden_size
+ self.layer = nn.Sequential(OrderedDict([
+ ('fc1', Linear(self.input_size, self.hidden_size)),
+ ('relu1', nn.ReLU()),
+ ('dropout1', nn.Dropout(p)),
+ ('fc2', Linear(self.hidden_size, self.output_size)),
+ ('relu2', nn.ReLU()),
+ ('dropout2', nn.Dropout(p)),
+ ]))
+
+ def forward(self, input_):
+
+ out = self.layer(input_)
+
+ return out
+
+
+class Conv(nn.Module):
+ """
+ Convolution Module
+ """
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ dilation=1,
+ bias=True,
+ w_init='linear'):
+ """
+ :param in_channels: dimension of input
+ :param out_channels: dimension of output
+ :param kernel_size: size of kernel
+ :param stride: size of stride
+ :param padding: size of padding
+ :param dilation: dilation rate
+ :param bias: boolean. if True, bias is included.
+ :param w_init: str. weight inits with xavier initialization.
+ """
+ super(Conv, self).__init__()
+
+ self.conv = nn.Conv1d(in_channels,
+ out_channels,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ bias=bias)
+
+ nn.init.xavier_uniform_(
+ self.conv.weight, gain=nn.init.calculate_gain(w_init))
+
+ def forward(self, x):
+ x = self.conv(x)
+ return x
+
+
+class FFTBlock(torch.nn.Module):
+ """FFT Block"""
+
+ def __init__(self,
+ d_model,
+ d_inner,
+ n_head,
+ d_k,
+ d_v,
+ dropout=0.1):
+ super(FFTBlock, self).__init__()
+ self.slf_attn = MultiHeadAttention(
+ n_head, d_model, d_k, d_v, dropout=dropout)
+ self.pos_ffn = PositionwiseFeedForward(
+ d_model, d_inner, dropout=dropout)
+
+ def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
+ enc_output, enc_slf_attn = self.slf_attn(
+ enc_input, enc_input, enc_input, mask=slf_attn_mask)
+ enc_output *= non_pad_mask
+
+ enc_output = self.pos_ffn(enc_output)
+ enc_output *= non_pad_mask
+
+ return enc_output, enc_slf_attn
+
+
+class ConvNorm(torch.nn.Module):
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size=1,
+ stride=1,
+ padding=None,
+ dilation=1,
+ bias=True,
+ w_init_gain='linear'):
+ super(ConvNorm, self).__init__()
+
+ if padding is None:
+ assert(kernel_size % 2 == 1)
+ padding = int(dilation * (kernel_size - 1) / 2)
+
+ self.conv = torch.nn.Conv1d(in_channels,
+ out_channels,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ bias=bias)
+
+ torch.nn.init.xavier_uniform_(
+ self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
+
+ def forward(self, signal):
+ conv_signal = self.conv(signal)
+
+ return conv_signal
+
+
+class PostNet(nn.Module):
+ """
+    PostNet: Five 1-d convolutions with 512 channels and kernel size 5
+ """
+
+ def __init__(self,
+ n_mel_channels=80,
+ postnet_embedding_dim=512,
+ postnet_kernel_size=5,
+ postnet_n_convolutions=5):
+
+ super(PostNet, self).__init__()
+ self.convolutions = nn.ModuleList()
+
+ self.convolutions.append(
+ nn.Sequential(
+ ConvNorm(n_mel_channels,
+ postnet_embedding_dim,
+ kernel_size=postnet_kernel_size,
+ stride=1,
+ padding=int((postnet_kernel_size - 1) / 2),
+ dilation=1,
+ w_init_gain='tanh'),
+
+ nn.BatchNorm1d(postnet_embedding_dim))
+ )
+
+ for i in range(1, postnet_n_convolutions - 1):
+ self.convolutions.append(
+ nn.Sequential(
+ ConvNorm(postnet_embedding_dim,
+ postnet_embedding_dim,
+ kernel_size=postnet_kernel_size,
+ stride=1,
+ padding=int((postnet_kernel_size - 1) / 2),
+ dilation=1,
+ w_init_gain='tanh'),
+
+ nn.BatchNorm1d(postnet_embedding_dim))
+ )
+
+ self.convolutions.append(
+ nn.Sequential(
+ ConvNorm(postnet_embedding_dim,
+ n_mel_channels,
+ kernel_size=postnet_kernel_size,
+ stride=1,
+ padding=int((postnet_kernel_size - 1) / 2),
+ dilation=1,
+ w_init_gain='linear'),
+
+ nn.BatchNorm1d(n_mel_channels))
+ )
+
+ def forward(self, x):
+ x = x.contiguous().transpose(1, 2)
+
+ for i in range(len(self.convolutions) - 1):
+ x = F.dropout(torch.tanh(
+ self.convolutions[i](x)), 0.5, self.training)
+ x = F.dropout(self.convolutions[-1](x), 0.5, self.training)
+
+ x = x.contiguous().transpose(1, 2)
+ return x
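+
+
+# Shape sketch (illustrative, not part of the original file; run from the project root,
+# e.g. `python -m transformer.Layers`): one FFT block keeps the (batch, time, d_model)
+# layout. It assumes hp.fft_conv1d_kernel / hp.fft_conv1d_padding are set so the
+# position-wise convolutions preserve the time dimension (e.g. kernel 3, padding 1),
+# and uses a byte mask to match the torch==1.1.0 pin.
+if __name__ == "__main__":
+    block = FFTBlock(d_model=256, d_inner=1024, n_head=2, d_k=64, d_v=64)
+    x = torch.randn(2, 50, 256)
+    non_pad_mask = torch.ones(2, 50, 1)
+    slf_attn_mask = torch.zeros(2, 50, 50).byte()   # nothing masked
+    out, attn = block(x, non_pad_mask=non_pad_mask, slf_attn_mask=slf_attn_mask)
+    print(out.shape, attn.shape)  # (2, 50, 256) and (n_head * batch, 50, 50)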
diff --git a/FastSpeech/transformer/Models.py b/FastSpeech/transformer/Models.py
new file mode 100644
index 0000000..37a2e42
--- /dev/null
+++ b/FastSpeech/transformer/Models.py
@@ -0,0 +1,145 @@
+import torch
+import torch.nn as nn
+import numpy as np
+
+import transformer.Constants as Constants
+from transformer.Layers import FFTBlock, PreNet, PostNet, Linear
+from text.symbols import symbols
+import hparams as hp
+
+
+def get_non_pad_mask(seq):
+ assert seq.dim() == 2
+ return seq.ne(Constants.PAD).type(torch.float).unsqueeze(-1)
+
+
+def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
+ ''' Sinusoid position encoding table '''
+
+ def cal_angle(position, hid_idx):
+ return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)
+
+ def get_posi_angle_vec(position):
+ return [cal_angle(position, hid_j) for hid_j in range(d_hid)]
+
+ sinusoid_table = np.array([get_posi_angle_vec(pos_i)
+ for pos_i in range(n_position)])
+
+ sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
+ sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
+
+ if padding_idx is not None:
+ # zero vector for padding dimension
+ sinusoid_table[padding_idx] = 0.
+
+ return torch.FloatTensor(sinusoid_table)
+
+
+def get_attn_key_pad_mask(seq_k, seq_q):
+ ''' For masking out the padding part of key sequence. '''
+
+ # Expand to fit the shape of key query attention matrix.
+ len_q = seq_q.size(1)
+ padding_mask = seq_k.eq(Constants.PAD)
+ padding_mask = padding_mask.unsqueeze(
+ 1).expand(-1, len_q, -1) # b x lq x lk
+
+ return padding_mask
+
+
+class Encoder(nn.Module):
+ ''' Encoder '''
+
+ def __init__(self,
+ n_src_vocab=len(symbols)+1,
+ len_max_seq=hp.max_sep_len,
+ d_word_vec=hp.word_vec_dim,
+ n_layers=hp.encoder_n_layer,
+ n_head=hp.encoder_head,
+ d_k=64,
+ d_v=64,
+ d_model=hp.word_vec_dim,
+ d_inner=hp.encoder_conv1d_filter_size,
+ dropout=hp.dropout):
+
+ super(Encoder, self).__init__()
+
+ n_position = len_max_seq + 1
+
+ self.src_word_emb = nn.Embedding(
+ n_src_vocab, d_word_vec, padding_idx=Constants.PAD)
+
+ self.position_enc = nn.Embedding.from_pretrained(
+ get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0),
+ freeze=True)
+
+ self.layer_stack = nn.ModuleList([FFTBlock(
+ d_model, d_inner, n_head, d_k, d_v, dropout=dropout) for _ in range(n_layers)])
+
+ def forward(self, src_seq, src_pos, return_attns=False):
+
+ enc_slf_attn_list = []
+
+ # -- Prepare masks
+ slf_attn_mask = get_attn_key_pad_mask(seq_k=src_seq, seq_q=src_seq)
+ non_pad_mask = get_non_pad_mask(src_seq)
+
+ # -- Forward
+ enc_output = self.src_word_emb(src_seq) + self.position_enc(src_pos)
+
+ for enc_layer in self.layer_stack:
+ enc_output, enc_slf_attn = enc_layer(
+ enc_output,
+ non_pad_mask=non_pad_mask,
+ slf_attn_mask=slf_attn_mask)
+ if return_attns:
+ enc_slf_attn_list += [enc_slf_attn]
+
+ return enc_output, non_pad_mask
+
+
+class Decoder(nn.Module):
+ """ Decoder """
+
+ def __init__(self,
+ len_max_seq=hp.max_sep_len,
+ d_word_vec=hp.word_vec_dim,
+ n_layers=hp.decoder_n_layer,
+ n_head=hp.decoder_head,
+ d_k=64,
+ d_v=64,
+ d_model=hp.word_vec_dim,
+ d_inner=hp.decoder_conv1d_filter_size,
+ dropout=hp.dropout):
+
+ super(Decoder, self).__init__()
+
+ n_position = len_max_seq + 1
+
+ self.position_enc = nn.Embedding.from_pretrained(
+ get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0),
+ freeze=True)
+
+ self.layer_stack = nn.ModuleList([FFTBlock(
+ d_model, d_inner, n_head, d_k, d_v, dropout=dropout) for _ in range(n_layers)])
+
+ def forward(self, enc_seq, enc_pos, return_attns=False):
+
+ dec_slf_attn_list = []
+
+ # -- Prepare masks
+ slf_attn_mask = get_attn_key_pad_mask(seq_k=enc_pos, seq_q=enc_pos)
+ non_pad_mask = get_non_pad_mask(enc_pos)
+
+ # -- Forward
+ dec_output = enc_seq + self.position_enc(enc_pos)
+
+ for dec_layer in self.layer_stack:
+ dec_output, dec_slf_attn = dec_layer(
+ dec_output,
+ non_pad_mask=non_pad_mask,
+ slf_attn_mask=slf_attn_mask)
+ if return_attns:
+ dec_slf_attn_list += [dec_slf_attn]
+
+ return dec_output
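+
+
+# Mask sketch (illustrative, not part of the original file; run from the project root,
+# e.g. `python -m transformer.Models`): positions equal to Constants.PAD are excluded
+# from self-attention and zeroed in the block outputs.
+if __name__ == "__main__":
+    src_seq = torch.LongTensor([[5, 7, 9, 0, 0]])   # two trailing PAD positions
+    print(get_non_pad_mask(src_seq).squeeze(-1))    # tensor([[1., 1., 1., 0., 0.]])
+    print(get_attn_key_pad_mask(seq_k=src_seq, seq_q=src_seq).shape)  # (1, 5, 5)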
diff --git a/FastSpeech/transformer/Modules.py b/FastSpeech/transformer/Modules.py
new file mode 100644
index 0000000..a9101a5
--- /dev/null
+++ b/FastSpeech/transformer/Modules.py
@@ -0,0 +1,27 @@
+import torch
+import torch.nn as nn
+import numpy as np
+
+
+class ScaledDotProductAttention(nn.Module):
+ ''' Scaled Dot-Product Attention '''
+
+ def __init__(self, temperature, attn_dropout=0.1):
+ super().__init__()
+ self.temperature = temperature
+ self.dropout = nn.Dropout(attn_dropout)
+ self.softmax = nn.Softmax(dim=2)
+
+ def forward(self, q, k, v, mask=None):
+
+ attn = torch.bmm(q, k.transpose(1, 2))
+ attn = attn / self.temperature
+
+ if mask is not None:
+ attn = attn.masked_fill(mask, -np.inf)
+
+ attn = self.softmax(attn)
+ attn = self.dropout(attn)
+ output = torch.bmm(attn, v)
+
+ return output, attn
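+
+
+# Shape sketch (illustrative, not part of the original file; run e.g.
+# `python -m transformer.Modules` from the project root): queries, keys and values are
+# batched as (batch, length, dim) and the temperature is usually sqrt(d_k).
+if __name__ == "__main__":
+    attn_layer = ScaledDotProductAttention(temperature=np.power(64, 0.5))
+    q = torch.randn(4, 10, 64)
+    k = torch.randn(4, 12, 64)
+    v = torch.randn(4, 12, 64)
+    output, attn = attn_layer(q, k, v)
+    print(output.shape, attn.shape)  # (4, 10, 64) and (4, 10, 12)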
diff --git a/FastSpeech/transformer/SubLayers.py b/FastSpeech/transformer/SubLayers.py
new file mode 100644
index 0000000..5a92810
--- /dev/null
+++ b/FastSpeech/transformer/SubLayers.py
@@ -0,0 +1,97 @@
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+
+from transformer.Modules import ScaledDotProductAttention
+import hparams as hp
+
+
+class MultiHeadAttention(nn.Module):
+ ''' Multi-Head Attention module '''
+
+ def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
+ super().__init__()
+
+ self.n_head = n_head
+ self.d_k = d_k
+ self.d_v = d_v
+
+ self.w_qs = nn.Linear(d_model, n_head * d_k)
+ self.w_ks = nn.Linear(d_model, n_head * d_k)
+ self.w_vs = nn.Linear(d_model, n_head * d_v)
+ nn.init.normal_(self.w_qs.weight, mean=0,
+ std=np.sqrt(2.0 / (d_model + d_k)))
+ nn.init.normal_(self.w_ks.weight, mean=0,
+ std=np.sqrt(2.0 / (d_model + d_k)))
+ nn.init.normal_(self.w_vs.weight, mean=0,
+ std=np.sqrt(2.0 / (d_model + d_v)))
+
+ self.attention = ScaledDotProductAttention(
+ temperature=np.power(d_k, 0.5))
+ self.layer_norm = nn.LayerNorm(d_model)
+
+ self.fc = nn.Linear(n_head * d_v, d_model)
+ nn.init.xavier_normal_(self.fc.weight)
+
+ self.dropout = nn.Dropout(dropout)
+
+ def forward(self, q, k, v, mask=None):
+
+ d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
+
+ sz_b, len_q, _ = q.size()
+ sz_b, len_k, _ = k.size()
+ sz_b, len_v, _ = v.size()
+
+ residual = q
+
+ q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
+ k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
+ v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)
+
+ q = q.permute(2, 0, 1, 3).contiguous().view(-1,
+ len_q, d_k) # (n*b) x lq x dk
+ k = k.permute(2, 0, 1, 3).contiguous().view(-1,
+ len_k, d_k) # (n*b) x lk x dk
+ v = v.permute(2, 0, 1, 3).contiguous().view(-1,
+ len_v, d_v) # (n*b) x lv x dv
+
+        if mask is not None:
+            mask = mask.repeat(n_head, 1, 1)  # (n*b) x .. x ..
+        output, attn = self.attention(q, k, v, mask=mask)
+
+ output = output.view(n_head, sz_b, len_q, d_v)
+ output = output.permute(1, 2, 0, 3).contiguous().view(
+ sz_b, len_q, -1) # b x lq x (n*dv)
+
+ output = self.dropout(self.fc(output))
+ output = self.layer_norm(output + residual)
+
+ return output, attn
+
+
+class PositionwiseFeedForward(nn.Module):
+ ''' A two-feed-forward-layer module '''
+
+ def __init__(self, d_in, d_hid, dropout=0.1):
+ super().__init__()
+
+ # Use Conv1D
+ # position-wise
+ self.w_1 = nn.Conv1d(
+ d_in, d_hid, kernel_size=hp.fft_conv1d_kernel, padding=hp.fft_conv1d_padding)
+ # position-wise
+ self.w_2 = nn.Conv1d(
+ d_hid, d_in, kernel_size=hp.fft_conv1d_kernel, padding=hp.fft_conv1d_padding)
+
+ self.layer_norm = nn.LayerNorm(d_in)
+ self.dropout = nn.Dropout(dropout)
+
+ def forward(self, x):
+ residual = x
+ output = x.transpose(1, 2)
+ output = self.w_2(F.relu(self.w_1(output)))
+ output = output.transpose(1, 2)
+ output = self.dropout(output)
+ output = self.layer_norm(output + residual)
+
+ return output
diff --git a/FastSpeech/transformer/__init__.py b/FastSpeech/transformer/__init__.py
new file mode 100644
index 0000000..068f707
--- /dev/null
+++ b/FastSpeech/transformer/__init__.py
@@ -0,0 +1,6 @@
+import transformer.Constants
+import transformer.Modules
+import transformer.Layers
+import transformer.SubLayers
+import transformer.Models
+import transformer.Beam
diff --git a/FastSpeech/utils.py b/FastSpeech/utils.py
new file mode 100644
index 0000000..25eada1
--- /dev/null
+++ b/FastSpeech/utils.py
@@ -0,0 +1,183 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+import matplotlib
+import matplotlib.pyplot as plt
+import os
+
+import tacotron2.hparams
+import tacotron2.model
+import tacotron2 as Tacotron2
+import text
+import hparams
+
+
+def process_text(train_text_path):
+ with open(train_text_path, "r", encoding="utf-8") as f:
+ txt = []
+ for line in f.readlines():
+ txt.append(line)
+
+ return txt
+
+
+def get_param_num(model):
+ num_param = sum(param.numel() for param in model.parameters())
+ return num_param
+
+
+def plot_data(data, figsize=(12, 4)):
+ _, axes = plt.subplots(1, len(data), figsize=figsize)
+ for i in range(len(data)):
+ axes[i].imshow(data[i], aspect='auto',
+                       origin='lower', interpolation='none')
+
+ if not os.path.exists("img"):
+ os.mkdir("img")
+ plt.savefig(os.path.join("img", "model_test.jpg"))
+
+
+def get_mask_from_lengths(lengths, max_len=None):
+    if max_len is None:
+ max_len = torch.max(lengths).item()
+
+ ids = torch.arange(0, max_len, out=torch.cuda.LongTensor(max_len))
+ mask = (ids < lengths.unsqueeze(1)).byte()
+
+ return mask
+
+
+def get_WaveGlow():
+ waveglow_path = os.path.join("waveglow", "pretrained_model")
+ waveglow_path = os.path.join(waveglow_path, "waveglow_256channels.pt")
+ wave_glow = torch.load(waveglow_path)['model']
+ wave_glow = wave_glow.remove_weightnorm(wave_glow)
+ wave_glow.cuda().eval()
+ for m in wave_glow.modules():
+ if 'Conv' in str(type(m)):
+ setattr(m, 'padding_mode', 'zeros')
+
+ return wave_glow
+
+
+def get_Tacotron2():
+ checkpoint_path = "tacotron2_statedict.pt"
+ checkpoint_path = os.path.join(os.path.join(
+ "Tacotron2", "pretrained_model"), checkpoint_path)
+
+ model = Tacotron2.model.Tacotron2(
+ Tacotron2.hparams.create_hparams()).cuda()
+ model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
+ _ = model.cuda().eval()
+
+ return model
+
+
+def get_D(alignment):
+ D = np.array([0 for _ in range(np.shape(alignment)[1])])
+
+ for i in range(np.shape(alignment)[0]):
+ max_index = alignment[i].tolist().index(alignment[i].max())
+ D[max_index] = D[max_index] + 1
+
+ return D
+
+
+def pad_1D(inputs, PAD=0):
+
+ def pad_data(x, length, PAD):
+ x_padded = np.pad(x, (0, length - x.shape[0]),
+ mode='constant',
+ constant_values=PAD)
+ return x_padded
+
+ max_len = max((len(x) for x in inputs))
+ padded = np.stack([pad_data(x, max_len, PAD) for x in inputs])
+
+ return padded
+
+
+def pad_2D(inputs, maxlen=None):
+
+ def pad(x, max_len):
+ PAD = 0
+ if np.shape(x)[0] > max_len:
+            raise ValueError("input length exceeds max_len")
+
+ s = np.shape(x)[1]
+ x_padded = np.pad(x, (0, max_len - np.shape(x)[0]),
+ mode='constant',
+ constant_values=PAD)
+ return x_padded[:, :s]
+
+ if maxlen:
+ output = np.stack([pad(x, maxlen) for x in inputs])
+ else:
+ max_len = max(np.shape(x)[0] for x in inputs)
+ output = np.stack([pad(x, max_len) for x in inputs])
+
+ return output
+
+
+def pad(input_ele, mel_max_length=None):
+ if mel_max_length:
+ out_list = list()
+ max_len = mel_max_length
+ for i, batch in enumerate(input_ele):
+ one_batch_padded = F.pad(
+ batch, (0, 0, 0, max_len-batch.size(0)), "constant", 0.0)
+ out_list.append(one_batch_padded)
+ out_padded = torch.stack(out_list)
+ return out_padded
+ else:
+ out_list = list()
+        max_len = max([input_ele[i].size(0) for i in range(len(input_ele))])
+
+ for i, batch in enumerate(input_ele):
+ one_batch_padded = F.pad(
+ batch, (0, 0, 0, max_len-batch.size(0)), "constant", 0.0)
+ out_list.append(one_batch_padded)
+ out_padded = torch.stack(out_list)
+ return out_padded
+
+
+def load_data(txt, mel, model):
+ character = text.text_to_sequence(txt, hparams.text_cleaners)
+ character = torch.from_numpy(np.stack([np.array(character)])).long().cuda()
+
+ text_length = torch.Tensor([character.size(1)]).long().cuda()
+ mel = torch.from_numpy(np.stack([mel.T])).float().cuda()
+ max_len = mel.size(2)
+ output_length = torch.Tensor([max_len]).long().cuda()
+
+ inputs = character, text_length, mel, max_len, output_length
+
+ with torch.no_grad():
+ [_, mel_tacotron2, _, alignment], cemb = model.forward(inputs)
+
+ alignment = alignment[0].cpu().numpy()
+ cemb = cemb[0].cpu().numpy()
+
+ D = get_D(alignment)
+ D = np.array(D)
+
+ mel_tacotron2 = mel_tacotron2[0].cpu().numpy()
+
+ return mel_tacotron2, cemb, D
+
+
+def load_data_from_tacotron2(txt, model):
+ character = text.text_to_sequence(txt, hparams.text_cleaners)
+ character = torch.from_numpy(np.stack([np.array(character)])).long().cuda()
+
+ with torch.no_grad():
+ [_, mel, _, alignment], cemb = model.inference(character)
+
+ alignment = alignment[0].cpu().numpy()
+ cemb = cemb[0].cpu().numpy()
+
+ D = get_D(alignment)
+ D = np.array(D)
+
+ mel = mel[0].cpu().numpy()
+
+ return mel, cemb, D
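+
+
+# Padding sketch (illustrative, not part of the original file; run from the project
+# root): pad_1D right-pads 1-d symbol sequences, pad_2D right-pads 2-d (time, mel)
+# arrays along the time axis.
+if __name__ == "__main__":
+    seqs = [np.array([1, 2, 3]), np.array([4, 5])]
+    print(pad_1D(seqs))            # [[1 2 3] [4 5 0]]
+    mels = [np.ones((3, 80)), np.ones((5, 80))]
+    print(pad_2D(mels).shape)      # (2, 5, 80)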
diff --git a/FastSpeech/waveglow/__init__.py b/FastSpeech/waveglow/__init__.py
new file mode 100644
index 0000000..2da378a
--- /dev/null
+++ b/FastSpeech/waveglow/__init__.py
@@ -0,0 +1,3 @@
+import waveglow.inference
+import waveglow.mel2samp
+import waveglow.glow
diff --git a/FastSpeech/waveglow/convert_model.py b/FastSpeech/waveglow/convert_model.py
new file mode 100644
index 0000000..2485e7e
--- /dev/null
+++ b/FastSpeech/waveglow/convert_model.py
@@ -0,0 +1,46 @@
+import sys
+import copy
+import torch
+
+def _check_model_old_version(model):
+    return hasattr(model.WN[0], 'res_layers')
+
+def update_model(old_model):
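+    # Convert an old-format WaveGlow checkpoint: the separate res_layers and
+    # skip_layers of each WN block are fused into a single res_skip_layers
+    # ModuleList by concatenating their (weight-norm-stripped) weights.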
+ if not _check_model_old_version(old_model):
+ return old_model
+ new_model = copy.deepcopy(old_model)
+ for idx in range(0, len(new_model.WN)):
+ wavenet = new_model.WN[idx]
+ wavenet.res_skip_layers = torch.nn.ModuleList()
+ n_channels = wavenet.n_channels
+ n_layers = wavenet.n_layers
+ for i in range(0, n_layers):
+ if i < n_layers - 1:
+ res_skip_channels = 2*n_channels
+ else:
+ res_skip_channels = n_channels
+ res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1)
+ skip_layer = torch.nn.utils.remove_weight_norm(wavenet.skip_layers[i])
+ if i < n_layers - 1:
+ res_layer = torch.nn.utils.remove_weight_norm(wavenet.res_layers[i])
+ res_skip_layer.weight = torch.nn.Parameter(torch.cat([res_layer.weight, skip_layer.weight]))
+ res_skip_layer.bias = torch.nn.Parameter(torch.cat([res_layer.bias, skip_layer.bias]))
+ else:
+ res_skip_layer.weight = torch.nn.Parameter(skip_layer.weight)
+ res_skip_layer.bias = torch.nn.Parameter(skip_layer.bias)
+ res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
+ wavenet.res_skip_layers.append(res_skip_layer)
+ del wavenet.res_layers
+ del wavenet.skip_layers
+ return new_model
+
+if __name__ == '__main__':
+ old_model_path = sys.argv[1]
+ new_model_path = sys.argv[2]
+ model = torch.load(old_model_path)
+ model['model'] = update_model(model['model'])
+ torch.save(model, new_model_path)
+
diff --git a/FastSpeech/waveglow/glow.py b/FastSpeech/waveglow/glow.py
new file mode 100644
index 0000000..a04afed
--- /dev/null
+++ b/FastSpeech/waveglow/glow.py
@@ -0,0 +1,310 @@
+# *****************************************************************************
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of the NVIDIA CORPORATION nor the
+# names of its contributors may be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *****************************************************************************
+import copy
+import torch
+from torch.autograd import Variable
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
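+    # WaveNet-style gated activation: the first n_channels of the summed
+    # activations pass through tanh, the remaining n_channels through sigmoid,
+    # and the two halves are multiplied element-wise.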
+ n_channels_int = n_channels[0]
+ in_act = input_a+input_b
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+ acts = t_act * s_act
+ return acts
+
+
+class WaveGlowLoss(torch.nn.Module):
+ def __init__(self, sigma=1.0):
+ super(WaveGlowLoss, self).__init__()
+ self.sigma = sigma
+
+ def forward(self, model_output):
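+        # Negative log-likelihood of z under an isotropic Gaussian with
+        # standard deviation sigma, minus the log-determinant terms contributed
+        # by the affine coupling layers (log_s) and the invertible 1x1
+        # convolutions (log_det_W), averaged over all elements of z.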
+ z, log_s_list, log_det_W_list = model_output
+ for i, log_s in enumerate(log_s_list):
+ if i == 0:
+ log_s_total = torch.sum(log_s)
+ log_det_W_total = log_det_W_list[i]
+ else:
+ log_s_total = log_s_total + torch.sum(log_s)
+ log_det_W_total += log_det_W_list[i]
+
+ loss = torch.sum(z*z)/(2*self.sigma*self.sigma) - log_s_total - log_det_W_total
+ return loss/(z.size(0)*z.size(1)*z.size(2))
+
+
+class Invertible1x1Conv(torch.nn.Module):
+ """
+ The layer outputs both the convolution, and the log determinant
+ of its weight matrix. If reverse=True it does convolution with
+ inverse
+ """
+ def __init__(self, c):
+ super(Invertible1x1Conv, self).__init__()
+ self.conv = torch.nn.Conv1d(c, c, kernel_size=1, stride=1, padding=0,
+ bias=False)
+
+ # Sample a random orthonormal matrix to initialize weights
+ W = torch.qr(torch.FloatTensor(c, c).normal_())[0]
+
+ # Ensure determinant is 1.0 not -1.0
+ if torch.det(W) < 0:
+ W[:,0] = -1*W[:,0]
+ W = W.view(c, c, 1)
+ self.conv.weight.data = W
+
+ def forward(self, z, reverse=False):
+ # shape
+ batch_size, group_size, n_of_groups = z.size()
+
+ W = self.conv.weight.squeeze()
+
+ if reverse:
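+            # The inverse weight is computed once on the first reverse pass
+            # and cached for subsequent calls.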
+ if not hasattr(self, 'W_inverse'):
+ # Reverse computation
+ W_inverse = W.float().inverse()
+ W_inverse = Variable(W_inverse[..., None])
+ if z.type() == 'torch.cuda.HalfTensor':
+ W_inverse = W_inverse.half()
+ self.W_inverse = W_inverse
+ z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0)
+ return z
+ else:
+ # Forward computation
+ log_det_W = batch_size * n_of_groups * torch.logdet(W)
+ z = self.conv(z)
+ return z, log_det_W
+
+
+class WN(torch.nn.Module):
+ """
+ This is the WaveNet like layer for the affine coupling. The primary difference
+ from WaveNet is the convolutions need not be causal. There is also no dilation
+ size reset. The dilation only doubles on each layer
+ """
+ def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels,
+ kernel_size):
+ super(WN, self).__init__()
+ assert(kernel_size % 2 == 1)
+ assert(n_channels % 2 == 0)
+ self.n_layers = n_layers
+ self.n_channels = n_channels
+ self.in_layers = torch.nn.ModuleList()
+ self.res_skip_layers = torch.nn.ModuleList()
+ self.cond_layers = torch.nn.ModuleList()
+
+ start = torch.nn.Conv1d(n_in_channels, n_channels, 1)
+ start = torch.nn.utils.weight_norm(start, name='weight')
+ self.start = start
+
+ # Initializing last layer to 0 makes the affine coupling layers
+ # do nothing at first. This helps with training stability
+ end = torch.nn.Conv1d(n_channels, 2*n_in_channels, 1)
+ end.weight.data.zero_()
+ end.bias.data.zero_()
+ self.end = end
+
+ for i in range(n_layers):
+ dilation = 2 ** i
+ padding = int((kernel_size*dilation - dilation)/2)
+ in_layer = torch.nn.Conv1d(n_channels, 2*n_channels, kernel_size,
+ dilation=dilation, padding=padding)
+ in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
+ self.in_layers.append(in_layer)
+
+ cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels, 1)
+ cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
+ self.cond_layers.append(cond_layer)
+
+ # last one is not necessary
+ if i < n_layers - 1:
+ res_skip_channels = 2*n_channels
+ else:
+ res_skip_channels = n_channels
+ res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1)
+ res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
+ self.res_skip_layers.append(res_skip_layer)
+
+ def forward(self, forward_input):
+ audio, spect = forward_input
+ audio = self.start(audio)
+
+ for i in range(self.n_layers):
+ acts = fused_add_tanh_sigmoid_multiply(
+ self.in_layers[i](audio),
+ self.cond_layers[i](spect),
+ torch.IntTensor([self.n_channels]))
+
+ res_skip_acts = self.res_skip_layers[i](acts)
+ if i < self.n_layers - 1:
+ audio = res_skip_acts[:,:self.n_channels,:] + audio
+ skip_acts = res_skip_acts[:,self.n_channels:,:]
+ else:
+ skip_acts = res_skip_acts
+
+ if i == 0:
+ output = skip_acts
+ else:
+ output = skip_acts + output
+ return self.end(output)
+
+
+class WaveGlow(torch.nn.Module):
+ def __init__(self, n_mel_channels, n_flows, n_group, n_early_every,
+ n_early_size, WN_config):
+ super(WaveGlow, self).__init__()
+
+ self.upsample = torch.nn.ConvTranspose1d(n_mel_channels,
+ n_mel_channels,
+ 1024, stride=256)
+ assert(n_group % 2 == 0)
+ self.n_flows = n_flows
+ self.n_group = n_group
+ self.n_early_every = n_early_every
+ self.n_early_size = n_early_size
+ self.WN = torch.nn.ModuleList()
+ self.convinv = torch.nn.ModuleList()
+
+ n_half = int(n_group/2)
+
+ # Set up layers with the right sizes based on how many dimensions
+ # have been output already
+ n_remaining_channels = n_group
+ for k in range(n_flows):
+ if k % self.n_early_every == 0 and k > 0:
+ n_half = n_half - int(self.n_early_size/2)
+ n_remaining_channels = n_remaining_channels - self.n_early_size
+ self.convinv.append(Invertible1x1Conv(n_remaining_channels))
+ self.WN.append(WN(n_half, n_mel_channels*n_group, **WN_config))
+ self.n_remaining_channels = n_remaining_channels # Useful during inference
+
+ def forward(self, forward_input):
+ """
+ forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames
+ forward_input[1] = audio: batch x time
+ """
+ spect, audio = forward_input
+
+ # Upsample spectrogram to size of audio
+ spect = self.upsample(spect)
+ assert(spect.size(2) >= audio.size(1))
+ if spect.size(2) > audio.size(1):
+ spect = spect[:, :, :audio.size(1)]
+
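+        # "Squeeze" step: fold every n_group consecutive upsampled mel frames
+        # and audio samples into the channel dimension, so the flows operate
+        # on tensors of shape (batch, channels, time / n_group).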
+ spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
+ spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1)
+
+ audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1)
+ output_audio = []
+ log_s_list = []
+ log_det_W_list = []
+
+ for k in range(self.n_flows):
+ if k % self.n_early_every == 0 and k > 0:
+ output_audio.append(audio[:,:self.n_early_size,:])
+ audio = audio[:,self.n_early_size:,:]
+
+ audio, log_det_W = self.convinv[k](audio)
+ log_det_W_list.append(log_det_W)
+
+ n_half = int(audio.size(1)/2)
+ audio_0 = audio[:,:n_half,:]
+ audio_1 = audio[:,n_half:,:]
+
+ output = self.WN[k]((audio_0, spect))
+ log_s = output[:, n_half:, :]
+ b = output[:, :n_half, :]
+ audio_1 = torch.exp(log_s)*audio_1 + b
+ log_s_list.append(log_s)
+
+ audio = torch.cat([audio_0, audio_1],1)
+
+ output_audio.append(audio)
+ return torch.cat(output_audio,1), log_s_list, log_det_W_list
+
+ def infer(self, spect, sigma=1.0):
+ spect = self.upsample(spect)
+ # trim conv artifacts. maybe pad spec to kernel multiple
+ time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0]
+ spect = spect[:, :, :-time_cutoff]
+
+ spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
+ spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1)
+
+ if spect.type() == 'torch.cuda.HalfTensor':
+ audio = torch.cuda.HalfTensor(spect.size(0),
+ self.n_remaining_channels,
+ spect.size(2)).normal_()
+ else:
+ audio = torch.cuda.FloatTensor(spect.size(0),
+ self.n_remaining_channels,
+ spect.size(2)).normal_()
+
+ audio = torch.autograd.Variable(sigma*audio)
+
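+        # Run the flow in reverse: starting from Gaussian noise scaled by
+        # sigma, invert each affine coupling and 1x1 convolution, re-injecting
+        # noise for the channels that were output early during training.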
+ for k in reversed(range(self.n_flows)):
+ n_half = int(audio.size(1)/2)
+ audio_0 = audio[:,:n_half,:]
+ audio_1 = audio[:,n_half:,:]
+
+ output = self.WN[k]((audio_0, spect))
+ s = output[:, n_half:, :]
+ b = output[:, :n_half, :]
+ audio_1 = (audio_1 - b)/torch.exp(s)
+ audio = torch.cat([audio_0, audio_1],1)
+
+ audio = self.convinv[k](audio, reverse=True)
+
+ if k % self.n_early_every == 0 and k > 0:
+ if spect.type() == 'torch.cuda.HalfTensor':
+ z = torch.cuda.HalfTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_()
+ else:
+ z = torch.cuda.FloatTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_()
+ audio = torch.cat((sigma*z, audio),1)
+
+ audio = audio.permute(0,2,1).contiguous().view(audio.size(0), -1).data
+ return audio
+
+ @staticmethod
+ def remove_weightnorm(model):
+ waveglow = model
+ for WN in waveglow.WN:
+ WN.start = torch.nn.utils.remove_weight_norm(WN.start)
+ WN.in_layers = remove(WN.in_layers)
+ WN.cond_layers = remove(WN.cond_layers)
+ WN.res_skip_layers = remove(WN.res_skip_layers)
+ return waveglow
+
+
+def remove(conv_list):
+ new_conv_list = torch.nn.ModuleList()
+ for old_conv in conv_list:
+ old_conv = torch.nn.utils.remove_weight_norm(old_conv)
+ new_conv_list.append(old_conv)
+ return new_conv_list
diff --git a/FastSpeech/waveglow/inference.py b/FastSpeech/waveglow/inference.py
new file mode 100644
index 0000000..0c07ac4
--- /dev/null
+++ b/FastSpeech/waveglow/inference.py
@@ -0,0 +1,57 @@
+# *****************************************************************************
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of the NVIDIA CORPORATION nor the
+# names of its contributors may be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *****************************************************************************
+import os
+from scipy.io.wavfile import write
+import torch
+from waveglow.mel2samp import files_to_list, MAX_WAV_VALUE
+# from denoiser import Denoiser
+
+
+def inference(mel, waveglow, audio_path, sigma=1.0, sampling_rate=22050):
+ with torch.no_grad():
+ audio = waveglow.infer(mel, sigma=sigma)
+ audio = audio * MAX_WAV_VALUE
+ audio = audio.squeeze()
+ audio = audio.cpu().numpy()
+ audio = audio.astype('int16')
+ write(audio_path, sampling_rate, audio)
+
+
+def test_speed(mel, waveglow, sigma=1.0, sampling_rate=22050):
+ with torch.no_grad():
+ audio = waveglow.infer(mel, sigma=sigma)
+ audio = audio * MAX_WAV_VALUE
+
+
+def get_wav(mel, waveglow, sigma=1.0, sampling_rate=22050):
+ with torch.no_grad():
+ audio = waveglow.infer(mel, sigma=sigma)
+ audio = audio * MAX_WAV_VALUE
+ audio = audio.squeeze()
+ audio = audio.cpu()
+
+ return audio
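+
+
+# Minimal usage sketch (assumptions: the WaveGlow checkpoint stores the model
+# under the 'model' key as in the NVIDIA repository, and `mel` is a
+# (1, n_mel_channels, frames) float tensor already on the GPU):
+#
+#     waveglow = torch.load("waveglow_256channels.pt")['model']
+#     waveglow = waveglow.remove_weightnorm(waveglow).cuda().eval()
+#     inference(mel, waveglow, "output.wav", sigma=0.6)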
diff --git a/FastSpeech/waveglow/mel2samp.py b/FastSpeech/waveglow/mel2samp.py
new file mode 100644
index 0000000..df3e45a
--- /dev/null
+++ b/FastSpeech/waveglow/mel2samp.py
@@ -0,0 +1,147 @@
+# *****************************************************************************
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of the NVIDIA CORPORATION nor the
+# names of its contributors may be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *****************************************************************************
+# from tacotron2.layers import TacotronSTFT
+import os
+import random
+import argparse
+import json
+import torch
+import torch.utils.data
+import sys
+from scipy.io.wavfile import read
+
+# We're using the audio processing from Tacotron2 to make sure it matches
+sys.path.insert(0, 'tacotron2')
+
+MAX_WAV_VALUE = 32768.0
+
+
+def files_to_list(filename):
+ """
+ Takes a text file of filenames and makes a list of filenames
+ """
+ with open(filename, encoding='utf-8') as f:
+ files = f.readlines()
+
+ files = [f.rstrip() for f in files]
+ return files
+
+
+# def load_wav_to_torch(full_path):
+# """
+# Loads wavdata into torch array
+# """
+# sampling_rate, data = read(full_path)
+# return torch.from_numpy(data).float(), sampling_rate
+
+
+# class Mel2Samp(torch.utils.data.Dataset):
+# """
+# This is the main class that calculates the spectrogram and returns the
+# spectrogram, audio pair.
+# """
+
+# def __init__(self, training_files, segment_length, filter_length,
+# hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
+# self.audio_files = files_to_list(training_files)
+# random.seed(1234)
+# random.shuffle(self.audio_files)
+# self.stft = TacotronSTFT(filter_length=filter_length,
+# hop_length=hop_length,
+# win_length=win_length,
+# sampling_rate=sampling_rate,
+# mel_fmin=mel_fmin, mel_fmax=mel_fmax)
+# self.segment_length = segment_length
+# self.sampling_rate = sampling_rate
+
+# def get_mel(self, audio):
+# audio_norm = audio / MAX_WAV_VALUE
+# audio_norm = audio_norm.unsqueeze(0)
+# audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
+# melspec = self.stft.mel_spectrogram(audio_norm)
+# melspec = torch.squeeze(melspec, 0)
+# return melspec
+
+# def __getitem__(self, index):
+# # Read audio
+# filename = self.audio_files[index]
+# audio, sampling_rate = load_wav_to_torch(filename)
+# if sampling_rate != self.sampling_rate:
+# raise ValueError("{} SR doesn't match target {} SR".format(
+# sampling_rate, self.sampling_rate))
+
+# # Take segment
+# if audio.size(0) >= self.segment_length:
+# max_audio_start = audio.size(0) - self.segment_length
+# audio_start = random.randint(0, max_audio_start)
+# audio = audio[audio_start:audio_start+self.segment_length]
+# else:
+# audio = torch.nn.functional.pad(
+# audio, (0, self.segment_length - audio.size(0)), 'constant').data
+
+# mel = self.get_mel(audio)
+# audio = audio / MAX_WAV_VALUE
+
+# return (mel, audio)
+
+# def __len__(self):
+# return len(self.audio_files)
+
+
+# # ===================================================================
+# # Takes directory of clean audio and makes directory of spectrograms
+# # Useful for making test sets
+# # ===================================================================
+# if __name__ == "__main__":
+# # Get defaults so it can work with no Sacred
+# parser = argparse.ArgumentParser()
+# parser.add_argument('-f', "--filelist_path", required=True)
+# parser.add_argument('-c', '--config', type=str,
+# help='JSON file for configuration')
+# parser.add_argument('-o', '--output_dir', type=str,
+# help='Output directory')
+# args = parser.parse_args()
+
+# with open(args.config) as f:
+# data = f.read()
+# data_config = json.loads(data)["data_config"]
+# mel2samp = Mel2Samp(**data_config)
+
+# filepaths = files_to_list(args.filelist_path)
+
+# # Make directory if it doesn't exist
+# if not os.path.isdir(args.output_dir):
+# os.makedirs(args.output_dir)
+# os.chmod(args.output_dir, 0o775)
+
+# for filepath in filepaths:
+# audio, sr = load_wav_to_torch(filepath)
+# melspectrogram = mel2samp.get_mel(audio)
+# filename = os.path.basename(filepath)
+# new_filepath = args.output_dir + '/' + filename + '.pt'
+# print(new_filepath)
+# torch.save(melspectrogram, new_filepath)
diff --git a/SqueezeWave/README.md b/SqueezeWave/README.md
new file mode 100644
index 0000000..a09bbc0
--- /dev/null
+++ b/SqueezeWave/README.md
@@ -0,0 +1,129 @@
+## SqueezeWave: Extremely Lightweight Vocoders for On-device Speech Synthesis
+By Bohan Zhai *, Tianren Gao *, Flora Xue, Daniel Rothchild, Bichen Wu, Joseph Gonzalez, and Kurt Keutzer (UC Berkeley)
+
+Automatic speech synthesis is a challenging task that is becoming increasingly important as edge devices begin to interact with users through speech. Typical text-to-speech pipelines include a vocoder, which translates intermediate audio representations into an audio waveform. Most existing vocoders are difficult to parallelize since each generated sample is conditioned on previous samples. WaveGlow is a flow-based feed-forward alternative to these auto-regressive models (Prenger et al., 2019). However, while WaveGlow can be easily parallelized, the model is too expensive for real-time speech synthesis on the edge. This paper presents SqueezeWave, a family of lightweight vocoders based on WaveGlow that can generate audio of similar quality to WaveGlow with 61x - 214x fewer MACs.
+
+Link to the paper: [paper]. If you find this work useful, please consider citing
+
+ ```
+ @inproceedings{squeezewave,
+ Author = {Bohan Zhai, Tianren Gao, Flora Xue, Daniel Rothchild, Bichen Wu, Joseph Gonzalez, Kurt Keutzer},
+ Title = {SqueezeWave: Extremely Lightweight Vocoders for On-device Speech Synthesis},
+ Journal = {arXiv:2001.05685},
+ Year = {2020}
+ }
+ ```
+
+### Audio samples generated by SqueezeWave
+Audio samples of SqueezeWave are here: https://tianrengao.github.io/SqueezeWaveDemo/
+
+### Results
+We introduce 4 variants of SqueezeWave in our paper. See the table below.
+
+
+ | Model | length | n_channels| MACs | Reduction | MOS |
+ | --------------- | ------ | --------- | ----- | --------- | --------- |
+ |WaveGlow | 2048 | 8 | 228.9 | 1x | 4.57±0.04 |
+ |SqueezeWave-128L | 128 | 256 | 3.78 | 60x | 4.07±0.06 |
+ |SqueezeWave-64L | 64 | 256 | 2.16 | 106x | 3.77±0.05 |
+ |SqueezeWave-128S | 128 | 128 | 1.06 | 214x | 3.79±0.05 |
+ |SqueezeWave-64S | 64 | 128 | 0.68 | 332x | 2.74±0.04 |
+
+### Model Complexity
+A detailed MAC calculation can be found [here](https://github.com/tianrengao/SqueezeWave/blob/master/SqueezeWave_computational_complexity.ipynb).
+
+## Setup
+0. (Optional) Create a virtual environment
+
+ ```
+ virtualenv env
+ source env/bin/activate
+ ```
+
+1. Clone our repo and initialize submodule
+
+ ```command
+ git clone https://github.com/tianrengao/SqueezeWave.git
+ cd SqueezeWave
+ git submodule init
+ git submodule update
+ ```
+
+2. Install requirements
+```pip3 install -r requirements.txt```
+
+3. Install [Apex]
+    ```command
+ cd ../
+ git clone https://www.github.com/nvidia/apex
+ cd apex
+ python setup.py install
+ ```
+
+## Generate audio with our pretrained model
+
+1. Download our [pretrained models]. We provide 4 pretrained models as described in the paper.
+2. Download [mel-spectrograms]
+3. Generate audio. Please replace `SqueezeWave.pt` with the name of the specific pretrained model.
+
+ ```python3 inference.py -f <(ls mel_spectrograms/*.pt) -w SqueezeWave.pt -o . --is_fp16 -s 0.6```
+
+
+## Train your own model
+
+1. Download [LJ Speech Data]. We assume all the audio files are stored in the directory `data/`
+
+2. Make a list of the file names to use for training/testing
+
+ ```command
+ ls data/*.wav | tail -n+10 > train_files.txt
+ ls data/*.wav | head -n10 > test_files.txt
+ ```
+
+3. We provide 4 model configurations, with the audio length and channel numbers given in the table above. The configuration files are under the ```configs/``` directory. To choose the model you want to train, select the corresponding configuration file, as mapped below.
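+
+   Based on the complexity notebook in this repository, the configuration files correspond to the table rows as follows (`a` in the file name is `n_audio_channel`, `c` is the WN `n_channels`; a larger `n_audio_channel` means a shorter temporal length):
+
+   - `configs/config_a128_c256.json`: SqueezeWave-128L
+   - `configs/config_a256_c256.json`: SqueezeWave-64L
+   - `configs/config_a128_c128.json`: SqueezeWave-128S
+   - `configs/config_a256_c128.json`: SqueezeWave-64S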
+
+4. Train your SqueezeWave model
+
+ ```command
+ mkdir checkpoints
+ python train.py -c configs/config_a256_c128.json
+ ```
+
+   For multi-GPU training, replace `train.py` with `distributed.py`. Only tested with a single node and the NCCL backend.
+
+   For mixed-precision training, set `"fp16_run": true` in the configuration file.
+
+5. Make test set mel-spectrograms
+
+ ```
+ mkdir -p eval/mels
+ python3 mel2samp.py -f test_files.txt -o eval/mels -c configs/config_a128_c256.json
+ ```
+
+6. Run inference on the test data.
+
+ ```command
+ ls eval/mels > eval/mel_files.txt
+ sed -i -e 's_.*_eval/mels/&_' eval/mel_files.txt
+ mkdir -p eval/output
+ python3 inference.py -f eval/mel_files.txt -w checkpoints/SqueezeWave_10000 -o eval/output --is_fp16 -s 0.6
+ ```
+ Replace `SqueezeWave_10000` with the checkpoint you want to test.
+
+## Credits
+The implementation of this work is based on WaveGlow: https://github.com/NVIDIA/waveglow
+
+
+[//]: # (TODO)
+[//]: # (PROVIDE INSTRUCTIONS FOR DOWNLOADING LJS)
+[pytorch 1.0]: https://github.com/pytorch/pytorch#installation
+[website]: https://nv-adlr.github.io/WaveGlow
+[paper]: https://arxiv.org/abs/2001.05685
+[WaveNet implementation]: https://github.com/r9y9/wavenet_vocoder
+[Glow]: https://blog.openai.com/glow/
+[WaveNet]: https://deepmind.com/blog/wavenet-generative-model-raw-audio/
+[PyTorch]: http://pytorch.org
+[pretrained models]: https://drive.google.com/file/d/1RyVMLY2l8JJGq_dCEAAd8rIRIn_k13UB/view?usp=sharing
+[mel-spectrograms]: https://drive.google.com/file/d/1g_VXK2lpP9J25dQFhQwx7doWl_p20fXA/view?usp=sharing
+[LJ Speech Data]: https://keithito.com/LJ-Speech-Dataset
+[Apex]: https://github.com/nvidia/apex
diff --git a/SqueezeWave/SqueezeWave_computational_complexity.ipynb b/SqueezeWave/SqueezeWave_computational_complexity.ipynb
new file mode 100644
index 0000000..15865c2
--- /dev/null
+++ b/SqueezeWave/SqueezeWave_computational_complexity.ipynb
@@ -0,0 +1,445 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "name": "SqueezeWave computational complexity.ipynb",
+ "provenance": []
+ },
+ "kernelspec": {
+ "name": "python2",
+ "display_name": "Python 2"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "s8VYGy15fwqN",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "import numpy as np"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "MDp5WalGf5Ji",
+ "colab_type": "text"
+ },
+ "source": [
+ "**WaveGlow**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "wrBBjKSYf89M",
+ "colab_type": "code",
+ "outputId": "4d77bc19-7a81-4f0b-bcad-65c42c4b2e9c",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 136
+ }
+ },
+ "source": [
+ "L = 2048 # audio length\n",
+ "n_audio_channel_init = 8 # initial audio channel \n",
+ "C_mel = 80 * 8 # After upsampling and unfolding \n",
+ "kernal_size = 3\n",
+ "C_wn = 256 # input channel size of in_layer\n",
+ "C_wn_middle = C_wn * 2 # output channel size of in_layer and cond_layer\n",
+ "n_flows = 12\n",
+ "n_layers = 8\n",
+ "n_early_output = 2\n",
+ "n_early_output_interval = 4\n",
+ "duration = 0.725\n",
+ "\n",
+ "n_audio_channels = []\n",
+ "n_audio = n_audio_channel_init\n",
+ "for i in range(n_flows):\n",
+ " if i % n_early_output_interval == 0 and i > 0:\n",
+ " n_audio -= n_early_output\n",
+ " n_audio_channels.append(n_audio) # audio channel after early output\n",
+ "\n",
+ "# in_layers\n",
+ "WN_in_layers = L * kernal_size * C_wn * C_wn_middle * n_layers * n_flows\n",
+ "print('MACs of in_layers', WN_in_layers / duration / 1e9)\n",
+ "# cond layers\n",
+ "WN_cond_layers = L * C_mel * C_wn_middle * n_layers * n_flows \n",
+ "print('MACs of cond_layers', WN_cond_layers / duration / 1e9)\n",
+ "# res skip layers\n",
+ "WN_res_layers = (L * C_wn * C_wn_middle * (n_layers - 1) + L * C_wn * C_wn) * n_flows\n",
+ "print('MACs of res_skip_layers', WN_res_layers / duration / 1e9)\n",
+ "# invertible convs\n",
+ "inv1x1 = np.sum([n**2 * L for n in n_audio_channels])\n",
+ "print('MACs of invertible conv layers', inv1x1 / duration / 1e9)\n",
+ "# start\n",
+ "starts = np.sum([n / 2 * C_wn * L for n in n_audio_channels])\n",
+ "print('MACs of start conv layers', starts / duration / 1e9)\n",
+ "# end\n",
+ "ends = np.sum([C_wn * n * L for n in n_audio_channels])\n",
+ "print('MACs of end conv layers', ends / duration / 1e9)\n",
+ "# total\n",
+ "WG_total = WN_in_layers + WN_cond_layers + WN_res_layers + inv1x1 + starts + ends\n",
+ "print('Total number of MACs is', WG_total / duration / 1e9)"
+ ],
+ "execution_count": 0,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "text": [
+ "('MACs of in_layers', 106.63367079724138)\n",
+ "('MACs of cond_layers', 88.86139233103448)\n",
+ "('MACs of res_skip_layers', 33.32302212413793)\n",
+ "('MACs of invertible conv layers', 0.00131072)\n",
+ "('MACs of start conv layers', 0.02603361103448276)\n",
+ "('MACs of end conv layers', 0.05206722206896552)\n",
+ "('Total number of MACs is', 228.89749680551725)\n"
+ ],
+ "name": "stdout"
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "QRQheCWjgC9D",
+ "colab_type": "text"
+ },
+ "source": [
+        "**SqueezeWave L=64, C=128**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "zSlwPlvUgJue",
+ "colab_type": "code",
+ "outputId": "18e282ea-a071-4117-ba08-6e6abdc36c68",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 153
+ }
+ },
+ "source": [
+ "L = 64 # audio length\n",
+ "n_audio_channel_init = 256 # initial audio channel \n",
+ "L_mel = 64 # mel-spectrogram length\n",
+ "C_mel =80 # mel-spectrogram channel \n",
+ "kernal_size = 3\n",
+ "C_wn = 128 # input channel size of in_layer\n",
+ "C_wn_middle = C_wn * 2 # output channel size of in_layer and cond_layer\n",
+ "n_flows = 12\n",
+ "n_layers = 8\n",
+ "n_early_output = 16\n",
+ "n_early_output_interval = 2\n",
+ "duration = 0.725\n",
+ "\n",
+ "n_audio_channels = []\n",
+ "n_audio = n_audio_channel_init\n",
+ "for i in range(n_flows):\n",
+ " if i % n_early_output_interval == 0 and i > 0:\n",
+ " n_audio -= n_early_output\n",
+ " n_audio_channels.append(n_audio) # audio channel after early output\n",
+ "\n",
+ "# in_layers\n",
+ "WN_in_layers = L * kernal_size * C_wn * n_layers * n_flows # depthwise\n",
+ "WN_in_layers += L * C_wn * C_wn_middle * n_layers * n_flows # pointwise\n",
+ "print('MACs of in_layers', WN_in_layers / duration / 1e9)\n",
+ "# cond_layers\n",
+ "WN_cond_layers = L_mel * C_mel * C_wn_middle * n_layers * n_flows\n",
+ "print('MACs of cond_layers', WN_cond_layers / duration / 1e9)\n",
+ "# res_skip_layers\n",
+ "WN_res_layers = L * C_wn * C_wn * n_layers * n_flows\n",
+ "print('MACs of res_skip_layers', WN_res_layers / duration / 1e9)\n",
+ "# invertible convs\n",
+ "inv1x1 = np.sum([n**2 * L for n in n_audio_channels])\n",
+ "print('MACs of invertible conv layers', inv1x1 / duration / 1e9)\n",
+ "# start\n",
+ "starts = np.sum([n / 2 * C_wn * L for n in n_audio_channels])\n",
+ "print('MACs of start conv layers', starts / duration / 1e9)\n",
+ "#end\n",
+ "ends = np.sum([C_wn * n * L for n in n_audio_channels])\n",
+ "print('MACs of end conv layers', ends / duration / 1e9)\n",
+ "# total\n",
+ "total = WN_in_layers + WN_cond_layers + WN_res_layers + inv1x1 + starts + ends\n",
+ "print('Total number of MACs is', total / duration / 1e9)\n",
+ "print('Reduction compared with WaveGlow', WG_total / total)"
+ ],
+ "execution_count": 0,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "text": [
+ "('MACs of in_layers', 0.2809460524137931)\n",
+ "('MACs of cond_layers', 0.17355740689655172)\n",
+ "('MACs of res_skip_layers', 0.1388459255172414)\n",
+ "('MACs of invertible conv layers', 0.0502141351724138)\n",
+ "('MACs of start conv layers', 0.014643906206896554)\n",
+ "('MACs of end conv layers', 0.029287812413793107)\n",
+ "('Total number of MACs is', 0.6874952386206896)\n",
+ "('Reduction compared with WaveGlow', 332)\n"
+ ],
+ "name": "stdout"
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "M6K8zJ6cugYj",
+ "colab_type": "text"
+ },
+ "source": [
+ "**SqueezeWave L=64, C=256**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "ju5Xa4oAhScO",
+ "colab_type": "code",
+ "outputId": "c91361be-ff73-4113-a584-6dda74c3690e",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 153
+ }
+ },
+ "source": [
+ "L = 64 # audio length\n",
+ "n_audio_channel_init = 256 # initial audio channel \n",
+ "L_mel = 64 # mel-spectrogram length\n",
+ "C_mel =80 # mel-spectrogram channel \n",
+ "kernal_size = 3\n",
+ "C_wn = 256 # input channel size of in_layer\n",
+ "C_wn_middle = C_wn * 2 # output channel size of in_layer and cond_layer\n",
+ "n_flows = 12\n",
+ "n_layers = 8\n",
+ "n_early_output = 16\n",
+ "n_early_output_interval = 2\n",
+ "duration = 0.725\n",
+ "\n",
+ "n_audio_channels = []\n",
+ "n_audio = n_audio_channel_init\n",
+ "for i in range(n_flows):\n",
+ " if i % n_early_output_interval == 0 and i > 0:\n",
+ " n_audio -= n_early_output\n",
+ " n_audio_channels.append(n_audio) # audio channel after early output\n",
+ "\n",
+ "# in_layers\n",
+ "WN_in_layers = L * kernal_size * C_wn * n_layers * n_flows # depthwise\n",
+ "WN_in_layers += L * C_wn * C_wn_middle * n_layers * n_flows # pointwise\n",
+ "print('MACs of in_layers', WN_in_layers / duration / 1e9)\n",
+ "# cond_layers\n",
+ "WN_cond_layers = L_mel * C_mel * C_wn_middle * n_layers * n_flows\n",
+ "print('MACs of cond_layers', WN_cond_layers / duration / 1e9)\n",
+ "# res_skip_layers\n",
+ "WN_res_layers = L * C_wn * C_wn * n_layers * n_flows\n",
+ "print('MACs of res_skip_layers', WN_res_layers / duration / 1e9)\n",
+ "# invertible convs\n",
+ "inv1x1 = np.sum([n**2 * L for n in n_audio_channels])\n",
+ "print('MACs of invertible conv layers', inv1x1 / duration / 1e9)\n",
+ "# start\n",
+ "starts = np.sum([n / 2 * C_wn * L for n in n_audio_channels])\n",
+ "print('MACs of start conv layers', starts / duration / 1e9)\n",
+ "#end\n",
+ "ends = np.sum([C_wn * n * L for n in n_audio_channels])\n",
+ "print('MACs of end conv layers', ends / duration / 1e9)\n",
+ "# total\n",
+ "total = WN_in_layers + WN_cond_layers + WN_res_layers + inv1x1 + starts + ends\n",
+ "print('Total number of MACs is', total / duration / 1e9)\n",
+ "print('Reduction compared with WaveGlow', WG_total / total)"
+ ],
+ "execution_count": 0,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "text": [
+ "('MACs of in_layers', 1.1172758068965518)\n",
+ "('MACs of cond_layers', 0.34711481379310344)\n",
+ "('MACs of res_skip_layers', 0.5553837020689656)\n",
+ "('MACs of invertible conv layers', 0.0502141351724138)\n",
+ "('MACs of start conv layers', 0.029287812413793107)\n",
+ "('MACs of end conv layers', 0.058575624827586215)\n",
+ "('Total number of MACs is', 2.157851895172414)\n",
+ "('Reduction compared with WaveGlow', 106)\n"
+ ],
+ "name": "stdout"
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "aIgnX6Yi4BFu",
+ "colab_type": "text"
+ },
+ "source": [
+ "**SqueezeWave L=128, C=128**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "W-3Q5jW84F_t",
+ "colab_type": "code",
+ "outputId": "436038c3-f3f8-4989-eeec-eb59c154b183",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 153
+ }
+ },
+ "source": [
+ "L = 128 # audio length\n",
+ "n_audio_channel_init = 128 # initial audio channel \n",
+ "L_mel = 64 # mel-spectrogram length\n",
+ "C_mel =80 # mel-spectrogram channel \n",
+ "kernal_size = 3\n",
+ "C_wn = 128 # input channel size of in_layer\n",
+ "C_wn_middle = C_wn * 2 # output channel size of in_layer and cond_layer\n",
+ "n_flows = 12\n",
+ "n_layers = 8\n",
+ "n_early_output = 16\n",
+ "n_early_output_interval = 2\n",
+ "duration = 0.725\n",
+ "\n",
+ "n_audio_channels = []\n",
+ "n_audio = n_audio_channel_init\n",
+ "for i in range(n_flows):\n",
+ " if i % n_early_output_interval == 0 and i > 0:\n",
+ " n_audio -= n_early_output\n",
+ " n_audio_channels.append(n_audio) # audio channel after early output\n",
+ "\n",
+ "# in_layers\n",
+ "WN_in_layers = L * kernal_size * C_wn * n_layers * n_flows # depthwise\n",
+ "WN_in_layers += L * C_wn * C_wn_middle * n_layers * n_flows # pointwise\n",
+ "print('MACs of in_layers', WN_in_layers / duration / 1e9)\n",
+ "# cond_layers\n",
+ "WN_cond_layers = L_mel * C_mel * C_wn_middle * n_layers * n_flows\n",
+ "print('MACs of cond_layers', WN_cond_layers / duration / 1e9)\n",
+ "# res_skip_layers\n",
+ "WN_res_layers = L * C_wn * C_wn * n_layers * n_flows\n",
+ "print('MACs of res_skip_layers', WN_res_layers / duration / 1e9)\n",
+ "# invertible convs\n",
+ "inv1x1 = np.sum([n**2 * L for n in n_audio_channels])\n",
+ "print('MACs of invertible conv layers', inv1x1 / duration / 1e9)\n",
+ "# start\n",
+ "starts = np.sum([n / 2 * C_wn * L for n in n_audio_channels])\n",
+ "print('MACs of start conv layers', starts / duration / 1e9)\n",
+ "#end\n",
+ "ends = np.sum([C_wn * n * L for n in n_audio_channels])\n",
+ "print('MACs of end conv layers', ends / duration / 1e9)\n",
+ "# total\n",
+ "total = WN_in_layers + WN_cond_layers + WN_res_layers + inv1x1 + starts + ends\n",
+ "print('Total number of MACs is', total / duration / 1e9)\n",
+ "print('Reduction compared with WaveGlow', WG_total / total)"
+ ],
+ "execution_count": 0,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "text": [
+ "('MACs of in_layers', 0.5618921048275862)\n",
+ "('MACs of cond_layers', 0.17355740689655172)\n",
+ "('MACs of res_skip_layers', 0.2776918510344828)\n",
+ "('MACs of invertible conv layers', 0.017988502068965517)\n",
+ "('MACs of start conv layers', 0.011932071724137933)\n",
+ "('MACs of end conv layers', 0.023864143448275865)\n",
+ "('Total number of MACs is', 1.06692608)\n",
+ "('Reduction compared with WaveGlow', 214)\n"
+ ],
+ "name": "stdout"
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "1kWvIBWU4Vwm",
+ "colab_type": "text"
+ },
+ "source": [
+ "**SqueezeWave L=128, C=256**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "6YM2bkC14WWc",
+ "colab_type": "code",
+ "outputId": "b1fd3d03-0135-400e-cfbc-28746c8d0cf0",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 153
+ }
+ },
+ "source": [
+ "L = 128 # audio length\n",
+ "n_audio_channel_init = 128 # initial audio channel \n",
+ "L_mel = 64 # mel-spectrogram length\n",
+ "C_mel =80 # mel-spectrogram channel \n",
+ "kernal_size = 3\n",
+ "C_wn = 256 # input channel size of in_layer\n",
+ "C_wn_middle = C_wn * 2 # output channel size of in_layer and cond_layer\n",
+ "n_flows = 12\n",
+ "n_layers = 8\n",
+ "n_early_output = 16\n",
+ "n_early_output_interval = 2\n",
+ "duration = 0.725\n",
+ "\n",
+ "n_audio_channels = []\n",
+ "n_audio = n_audio_channel_init\n",
+ "for i in range(n_flows):\n",
+ " if i % n_early_output_interval == 0 and i > 0:\n",
+ " n_audio -= n_early_output\n",
+ " n_audio_channels.append(n_audio) # audio channel after early output\n",
+ "\n",
+ "# in_layers\n",
+ "WN_in_layers = L * kernal_size * C_wn * n_layers * n_flows # depthwise\n",
+ "WN_in_layers += L * C_wn * C_wn_middle * n_layers * n_flows # pointwise\n",
+ "print('MACs of in_layers', WN_in_layers / duration / 1e9)\n",
+ "# cond_layers\n",
+ "WN_cond_layers = L_mel * C_mel * C_wn_middle * n_layers * n_flows\n",
+ "print('MACs of cond_layers', WN_cond_layers / duration / 1e9)\n",
+ "# res_skip_layers\n",
+ "WN_res_layers = L * C_wn * C_wn * n_layers * n_flows\n",
+ "print('MACs of res_skip_layers', WN_res_layers / duration / 1e9)\n",
+ "# invertible convs\n",
+ "inv1x1 = np.sum([n**2 * L for n in n_audio_channels])\n",
+ "print('MACs of invertible conv layers', inv1x1 / duration / 1e9)\n",
+ "# start\n",
+ "starts = np.sum([n / 2 * C_wn * L for n in n_audio_channels])\n",
+ "print('MACs of start conv layers', starts / duration / 1e9)\n",
+ "#end\n",
+ "ends = np.sum([C_wn * n * L for n in n_audio_channels])\n",
+ "print('MACs of end conv layers', ends / duration / 1e9)\n",
+ "# total\n",
+ "total = WN_in_layers + WN_cond_layers + WN_res_layers + inv1x1 + starts + ends\n",
+ "print('Total number of MACs is', total / duration / 1e9)\n",
+ "print('Reduction compared with WaveGlow', WG_total / total)"
+ ],
+ "execution_count": 0,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "text": [
+ "('MACs of in_layers', 2.2345516137931036)\n",
+ "('MACs of cond_layers', 0.34711481379310344)\n",
+ "('MACs of res_skip_layers', 1.1107674041379312)\n",
+ "('MACs of invertible conv layers', 0.017988502068965517)\n",
+ "('MACs of start conv layers', 0.023864143448275865)\n",
+ "('MACs of end conv layers', 0.04772828689655173)\n",
+ "('Total number of MACs is', 3.7820147641379314)\n",
+ "('Reduction compared with WaveGlow', 60)\n"
+ ],
+ "name": "stdout"
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/SqueezeWave/TacotronSTFT.py b/SqueezeWave/TacotronSTFT.py
new file mode 100644
index 0000000..ebfe6f8
--- /dev/null
+++ b/SqueezeWave/TacotronSTFT.py
@@ -0,0 +1,80 @@
+import torch
+from librosa.filters import mel as librosa_mel_fn
+from audio_processing import dynamic_range_compression
+from audio_processing import dynamic_range_decompression
+from stft import STFT
+
+
+class LinearNorm(torch.nn.Module):
+ def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
+ super(LinearNorm, self).__init__()
+ self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
+
+ torch.nn.init.xavier_uniform(
+ self.linear_layer.weight,
+ gain=torch.nn.init.calculate_gain(w_init_gain))
+
+ def forward(self, x):
+ return self.linear_layer(x)
+
+
+class ConvNorm(torch.nn.Module):
+ def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
+ padding=None, dilation=1, bias=True, w_init_gain='linear'):
+ super(ConvNorm, self).__init__()
+ if padding is None:
+ assert(kernel_size % 2 == 1)
+ padding = int(dilation * (kernel_size - 1) / 2)
+
+ self.conv = torch.nn.Conv1d(in_channels, out_channels,
+ kernel_size=kernel_size, stride=stride,
+ padding=padding, dilation=dilation,
+ bias=bias)
+
+ torch.nn.init.xavier_uniform(
+ self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
+
+ def forward(self, signal):
+ conv_signal = self.conv(signal)
+ return conv_signal
+
+
+class TacotronSTFT(torch.nn.Module):
+ def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
+ n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
+ mel_fmax=None, n_group=256):
+ super(TacotronSTFT, self).__init__()
+ self.n_mel_channels = n_mel_channels
+ self.sampling_rate = sampling_rate
+ self.stft_fn = STFT(filter_length, hop_length, win_length, n_group=n_group)
+ mel_basis = librosa_mel_fn(
+ sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
+ mel_basis = torch.from_numpy(mel_basis).float()
+ self.register_buffer('mel_basis', mel_basis)
+
+ def spectral_normalize(self, magnitudes):
+ output = dynamic_range_compression(magnitudes)
+ return output
+
+ def spectral_de_normalize(self, magnitudes):
+ output = dynamic_range_decompression(magnitudes)
+ return output
+
+ def mel_spectrogram(self, y):
+ """Computes mel-spectrograms from a batch of waves
+ PARAMS
+ ------
+ y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
+
+ RETURNS
+ -------
+ mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
+ """
+ assert(torch.min(y.data) >= -1)
+ assert(torch.max(y.data) <= 1)
+
+ magnitudes, phases = self.stft_fn.transform(y)
+ magnitudes = magnitudes.data
+ mel_output = torch.matmul(self.mel_basis, magnitudes)
+ mel_output = self.spectral_normalize(mel_output)
+ return mel_output
diff --git a/SqueezeWave/audio_processing.py b/SqueezeWave/audio_processing.py
new file mode 100644
index 0000000..b5af7f7
--- /dev/null
+++ b/SqueezeWave/audio_processing.py
@@ -0,0 +1,93 @@
+import torch
+import numpy as np
+from scipy.signal import get_window
+import librosa.util as librosa_util
+
+
+def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
+ n_fft=800, dtype=np.float32, norm=None):
+ """
+ # from librosa 0.6
+ Compute the sum-square envelope of a window function at a given hop length.
+
+ This is used to estimate modulation effects induced by windowing
+ observations in short-time fourier transforms.
+
+ Parameters
+ ----------
+ window : string, tuple, number, callable, or list-like
+ Window specification, as in `get_window`
+
+ n_frames : int > 0
+ The number of analysis frames
+
+ hop_length : int > 0
+ The number of samples to advance between frames
+
+ win_length : [optional]
+ The length of the window function. By default, this matches `n_fft`.
+
+ n_fft : int > 0
+ The length of each analysis frame.
+
+ dtype : np.dtype
+ The data type of the output
+
+ Returns
+ -------
+ wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
+ The sum-squared envelope of the window function
+ """
+ if win_length is None:
+ win_length = n_fft
+
+ n = n_fft + hop_length * (n_frames - 1)
+ x = np.zeros(n, dtype=dtype)
+
+ # Compute the squared window at the desired length
+ win_sq = get_window(window, win_length, fftbins=True)
+ win_sq = librosa_util.normalize(win_sq, norm=norm)**2
+ win_sq = librosa_util.pad_center(win_sq, n_fft)
+
+ # Fill the envelope
+ for i in range(n_frames):
+ sample = i * hop_length
+ x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
+ return x
+
+
+def griffin_lim(magnitudes, stft_fn, n_iters=30):
+ """
+ PARAMS
+ ------
+ magnitudes: spectrogram magnitudes
+ stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
+ """
+
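+    # Classic Griffin-Lim: start from random phases, then repeatedly invert to
+    # a waveform and re-analyse it while keeping the given magnitudes, so the
+    # phase estimate becomes self-consistent over the iterations.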
+ angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
+ angles = angles.astype(np.float32)
+ angles = torch.autograd.Variable(torch.from_numpy(angles))
+ signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
+
+ for i in range(n_iters):
+ _, angles = stft_fn.transform(signal)
+ signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
+ return signal
+
+
+def dynamic_range_compression(x, C=1, clip_val=1e-5):
+ """
+ PARAMS
+ ------
+ C: compression factor
+ """
+ return torch.log(torch.clamp(x, min=clip_val) * C)
+
+
+def dynamic_range_decompression(x, C=1):
+ """
+ PARAMS
+ ------
+ C: compression factor used to compress
+ """
+ return torch.exp(x) / C
diff --git a/SqueezeWave/configs/config_a128_c128.json b/SqueezeWave/configs/config_a128_c128.json
new file mode 100644
index 0000000..1e3273c
--- /dev/null
+++ b/SqueezeWave/configs/config_a128_c128.json
@@ -0,0 +1,40 @@
+{
+ "train_config": {
+ "fp16_run": true,
+ "output_directory": "checkpoints",
+ "epochs": 100000,
+ "learning_rate": 4e-4,
+ "sigma": 1.0,
+ "iters_per_checkpoint": 2000,
+ "batch_size": 96,
+ "seed": 1234,
+ "checkpoint_path": "",
+ "with_tensorboard": true
+ },
+ "data_config": {
+ "training_files": "train_files.txt",
+ "segment_length": 16384,
+ "sampling_rate": 22050,
+ "filter_length": 1024,
+ "hop_length": 256,
+ "win_length": 1024,
+ "mel_fmin": 0.0,
+ "mel_fmax": 8000.0
+ },
+ "dist_config": {
+ "dist_backend": "nccl",
+ "dist_url": "tcp://localhost:54321"
+ },
+ "squeezewave_config": {
+ "n_mel_channels": 80,
+ "n_flows": 12,
+ "n_audio_channel": 128,
+ "n_early_every": 2,
+ "n_early_size": 16,
+ "WN_config": {
+ "n_layers": 8,
+ "n_channels": 128,
+ "kernel_size": 3
+ }
+ }
+}
diff --git a/SqueezeWave/configs/config_a128_c256.json b/SqueezeWave/configs/config_a128_c256.json
new file mode 100644
index 0000000..c342590
--- /dev/null
+++ b/SqueezeWave/configs/config_a128_c256.json
@@ -0,0 +1,40 @@
+{
+ "train_config": {
+ "fp16_run": true,
+ "output_directory": "checkpoints",
+ "epochs": 100000,
+ "learning_rate": 4e-4,
+ "sigma": 1.0,
+ "iters_per_checkpoint": 2000,
+ "batch_size": 96,
+ "seed": 1234,
+ "checkpoint_path": "checkpoints/Squeeze_244000",
+ "with_tensorboard": true
+ },
+ "data_config": {
+ "training_files": "train_files.txt",
+ "segment_length": 16384,
+ "sampling_rate": 22050,
+ "filter_length": 1024,
+ "hop_length": 256,
+ "win_length": 1024,
+ "mel_fmin": 0.0,
+ "mel_fmax": 8000.0
+ },
+ "dist_config": {
+ "dist_backend": "nccl",
+ "dist_url": "tcp://localhost:54321"
+ },
+ "squeezewave_config": {
+ "n_mel_channels": 80,
+ "n_flows": 12,
+ "n_audio_channel": 128,
+ "n_early_every": 2,
+ "n_early_size": 16,
+ "WN_config": {
+ "n_layers": 8,
+ "n_channels": 256,
+ "kernel_size": 3
+ }
+ }
+}
diff --git a/SqueezeWave/configs/config_a256_c128.json b/SqueezeWave/configs/config_a256_c128.json
new file mode 100644
index 0000000..29d8f26
--- /dev/null
+++ b/SqueezeWave/configs/config_a256_c128.json
@@ -0,0 +1,40 @@
+{
+ "train_config": {
+ "fp16_run": true,
+ "output_directory": "checkpoints",
+ "epochs": 100000,
+ "learning_rate": 4e-4,
+ "sigma": 1.0,
+ "iters_per_checkpoint": 2000,
+ "batch_size": 96,
+ "seed": 1234,
+ "checkpoint_path": "",
+ "with_tensorboard": true
+ },
+ "data_config": {
+ "training_files": "train_files.txt",
+ "segment_length": 16384,
+ "sampling_rate": 22050,
+ "filter_length": 1024,
+ "hop_length": 256,
+ "win_length": 1024,
+ "mel_fmin": 0.0,
+ "mel_fmax": 8000.0
+ },
+ "dist_config": {
+ "dist_backend": "nccl",
+ "dist_url": "tcp://localhost:54321"
+ },
+ "squeezewave_config": {
+ "n_mel_channels": 80,
+ "n_flows": 12,
+ "n_audio_channel": 256,
+ "n_early_every": 2,
+ "n_early_size": 16,
+ "WN_config": {
+ "n_layers": 8,
+ "n_channels": 128,
+ "kernel_size": 3
+ }
+ }
+}
diff --git a/SqueezeWave/configs/config_a256_c256.json b/SqueezeWave/configs/config_a256_c256.json
new file mode 100644
index 0000000..5ce633a
--- /dev/null
+++ b/SqueezeWave/configs/config_a256_c256.json
@@ -0,0 +1,40 @@
+{
+ "train_config": {
+ "fp16_run": true,
+ "output_directory": "checkpoints",
+ "epochs": 100000,
+ "learning_rate": 4e-4,
+ "sigma": 1.0,
+ "iters_per_checkpoint": 2000,
+ "batch_size": 96,
+ "seed": 1234,
+ "checkpoint_path": "",
+ "with_tensorboard": true
+ },
+ "data_config": {
+ "training_files": "train_files.txt",
+ "segment_length": 16384,
+ "sampling_rate": 22050,
+ "filter_length": 1024,
+ "hop_length": 256,
+ "win_length": 1024,
+ "mel_fmin": 0.0,
+ "mel_fmax": 8000.0
+ },
+ "dist_config": {
+ "dist_backend": "nccl",
+ "dist_url": "tcp://localhost:54321"
+ },
+ "squeezewave_config": {
+ "n_mel_channels": 80,
+ "n_flows": 12,
+ "n_audio_channel": 256,
+ "n_early_every": 2,
+ "n_early_size": 16,
+ "WN_config": {
+ "n_layers": 8,
+ "n_channels": 256,
+ "kernel_size": 3
+ }
+ }
+}
diff --git a/SqueezeWave/convert_model.py b/SqueezeWave/convert_model.py
new file mode 100644
index 0000000..b725a17
--- /dev/null
+++ b/SqueezeWave/convert_model.py
@@ -0,0 +1,70 @@
+import sys
+import copy
+import torch
+
+def _check_model_old_version(model):
+    return (hasattr(model.WN[0], 'res_layers')
+            or hasattr(model.WN[0], 'cond_layers'))
+
+def _update_model_res_skip(old_model, new_model):
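+    # Fuse the old per-layer res_layers / skip_layers pairs into the single
+    # res_skip_layers convolution expected by the current WN implementation,
+    # concatenating their weights after stripping weight normalization.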
+ for idx in range(0, len(new_model.WN)):
+ wavenet = new_model.WN[idx]
+ n_channels = wavenet.n_channels
+ n_layers = wavenet.n_layers
+ wavenet.res_skip_layers = torch.nn.ModuleList()
+ for i in range(0, n_layers):
+ if i < n_layers - 1:
+ res_skip_channels = 2*n_channels
+ else:
+ res_skip_channels = n_channels
+ res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1)
+ skip_layer = torch.nn.utils.remove_weight_norm(wavenet.skip_layers[i])
+ if i < n_layers - 1:
+ res_layer = torch.nn.utils.remove_weight_norm(wavenet.res_layers[i])
+ res_skip_layer.weight = torch.nn.Parameter(torch.cat([res_layer.weight, skip_layer.weight]))
+ res_skip_layer.bias = torch.nn.Parameter(torch.cat([res_layer.bias, skip_layer.bias]))
+ else:
+ res_skip_layer.weight = torch.nn.Parameter(skip_layer.weight)
+ res_skip_layer.bias = torch.nn.Parameter(skip_layer.bias)
+ res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
+ wavenet.res_skip_layers.append(res_skip_layer)
+ del wavenet.res_layers
+ del wavenet.skip_layers
+
+def _update_model_cond(old_model, new_model):
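+    # Merge the per-layer cond_layers into a single cond_layer convolution
+    # whose output channels concatenate the conditioning activations of all
+    # layers.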
+ for idx in range(0, len(new_model.WN)):
+ wavenet = new_model.WN[idx]
+ n_channels = wavenet.n_channels
+ n_layers = wavenet.n_layers
+ n_mel_channels = wavenet.cond_layers[0].weight.shape[1]
+ cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels*n_layers, 1)
+ cond_layer_weight = []
+ cond_layer_bias = []
+ for i in range(0, n_layers):
+ _cond_layer = torch.nn.utils.remove_weight_norm(wavenet.cond_layers[i])
+ cond_layer_weight.append(_cond_layer.weight)
+ cond_layer_bias.append(_cond_layer.bias)
+ cond_layer.weight = torch.nn.Parameter(torch.cat(cond_layer_weight))
+ cond_layer.bias = torch.nn.Parameter(torch.cat(cond_layer_bias))
+ cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
+ wavenet.cond_layer = cond_layer
+ del wavenet.cond_layers
+
+def update_model(old_model):
+ if not _check_model_old_version(old_model):
+ return old_model
+ new_model = copy.deepcopy(old_model)
+ if hasattr(old_model.WN[0], 'res_layers'):
+ _update_model_res_skip(old_model, new_model)
+ if hasattr(old_model.WN[0], 'cond_layers'):
+ _update_model_cond(old_model, new_model)
+ return new_model
+
+if __name__ == '__main__':
+ old_model_path = sys.argv[1]
+ new_model_path = sys.argv[2]
+ model = torch.load(old_model_path)
+ model['model'] = update_model(model['model'])
+ torch.save(model, new_model_path)
diff --git a/SqueezeWave/denoiser.py b/SqueezeWave/denoiser.py
new file mode 100644
index 0000000..8f9ff57
--- /dev/null
+++ b/SqueezeWave/denoiser.py
@@ -0,0 +1,39 @@
+import sys
+import torch
+from stft import STFT
+
+
+class Denoiser(torch.nn.Module):
+ """ Removes model bias from audio produced with squeezewave"""
+
+ def __init__(self, squeezewave, filter_length=1024, n_overlap=4,
+ win_length=1024, mode='zeros'):
+ super(Denoiser, self).__init__()
+ self.stft = STFT(filter_length=filter_length,
+ hop_length=int(filter_length/n_overlap),
+ win_length=win_length).cuda()
+ if mode == 'zeros':
+ mel_input = torch.zeros(
+ (1, 80, 88),
+ dtype=squeezewave.upsample.weight.dtype,
+ device=squeezewave.upsample.weight.device)
+ elif mode == 'normal':
+ mel_input = torch.randn(
+ (1, 80, 88),
+ dtype=squeezewave.upsample.weight.dtype,
+ device=squeezewave.upsample.weight.device)
+ else:
+            raise Exception("Mode {} is not supported".format(mode))
+
+ with torch.no_grad():
+ bias_audio = squeezewave.infer(mel_input, sigma=0.0).float()
+ bias_spec, _ = self.stft.transform(bias_audio)
+
+ self.register_buffer('bias_spec', bias_spec[:, :, 0][:, :, None])
+
+ def forward(self, audio, strength=0.1):
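+        # Subtract a scaled copy of the model's bias spectrum (estimated in
+        # __init__ from an all-zero or noise mel input) from the STFT
+        # magnitudes of the generated audio, then invert back to a waveform.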
+ audio_spec, audio_angles = self.stft.transform(audio.cuda().float())
+ audio_spec_denoised = audio_spec - self.bias_spec * strength
+ audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0)
+ audio_denoised = self.stft.inverse(audio_spec_denoised, audio_angles)
+ return audio_denoised
diff --git a/SqueezeWave/distributed.py b/SqueezeWave/distributed.py
new file mode 100644
index 0000000..e1b9f55
--- /dev/null
+++ b/SqueezeWave/distributed.py
@@ -0,0 +1,191 @@
+# We retain the copyright notice by NVIDIA from the original code. However,
+# we reserve our rights on the modifications based on the original code.
+#
+# *****************************************************************************
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of the NVIDIA CORPORATION nor the
+# names of its contributors may be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *****************************************************************************
+import os
+import sys
+import time
+import subprocess
+import argparse
+
+import torch
+import torch.distributed as dist
+from torch.autograd import Variable
+
+def reduce_tensor(tensor, num_gpus):
+ rt = tensor.clone()
+ dist.all_reduce(rt, op=dist.reduce_op.SUM)
+# rt /= (num_gpus*2)
+    rt /= num_gpus
+ return rt
+
+def init_distributed(rank, num_gpus, group_name, dist_backend, dist_url):
+ assert torch.cuda.is_available(), "Distributed mode requires CUDA."
+ print("Initializing Distributed")
+
+ # Set cuda device so everything is done on the right GPU.
+ torch.cuda.set_device(rank % torch.cuda.device_count())
+
+ # Initialize distributed communication
+ dist.init_process_group(dist_backend, init_method=dist_url,
+ world_size=num_gpus, rank=rank,
+ group_name=group_name)
+
+def _flatten_dense_tensors(tensors):
+ """Flatten dense tensors into a contiguous 1D buffer. Assume tensors are of
+ same dense type.
+ Since inputs are dense, the resulting tensor will be a concatenated 1D
+ buffer. Element-wise operation on this buffer will be equivalent to
+ operating individually.
+ Arguments:
+ tensors (Iterable[Tensor]): dense tensors to flatten.
+ Returns:
+ A contiguous 1D buffer containing input tensors.
+ """
+ if len(tensors) == 1:
+ return tensors[0].contiguous().view(-1)
+ flat = torch.cat([t.contiguous().view(-1) for t in tensors], dim=0)
+ return flat
+
+def _unflatten_dense_tensors(flat, tensors):
+ """View a flat buffer using the sizes of tensors. Assume that tensors are of
+ same dense type, and that flat is given by _flatten_dense_tensors.
+ Arguments:
+ flat (Tensor): flattened dense tensors to unflatten.
+ tensors (Iterable[Tensor]): dense tensors whose sizes will be used to
+ unflatten flat.
+ Returns:
+ Unflattened dense tensors with sizes same as tensors and values from
+ flat.
+ """
+ outputs = []
+ offset = 0
+ for tensor in tensors:
+ numel = tensor.numel()
+ outputs.append(flat.narrow(0, offset, numel).view_as(tensor))
+ offset += numel
+ return tuple(outputs)
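+
+# Illustrative check (not part of the original code): the two helpers above
+# round-trip a list of dense tensors through one flat buffer. Shapes are
+# arbitrary example values.
+def _example_flatten_roundtrip():
+    tensors = [torch.randn(2, 3), torch.randn(5)]
+    flat = _flatten_dense_tensors(tensors)               # shape: (11,)
+    restored = _unflatten_dense_tensors(flat, tensors)   # same shapes as inputs
+    assert all(r.shape == t.shape for r, t in zip(restored, tensors))
+    return restored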
+
+def apply_gradient_allreduce(module):
+ """
+ Modifies existing model to do gradient allreduce, but doesn't change class
+ so you don't need "module"
+ """
+ if not hasattr(dist, '_backend'):
+ module.warn_on_half = True
+ else:
+ module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False
+
+ for p in module.state_dict().values():
+ if not torch.is_tensor(p):
+ continue
+ dist.broadcast(p, 0)
+
+ def allreduce_params():
+ if(module.needs_reduction):
+ module.needs_reduction = False
+ buckets = {}
+ for param in module.parameters():
+ if param.requires_grad and param.grad is not None:
+ tp = type(param.data)
+ if tp not in buckets:
+ buckets[tp] = []
+ buckets[tp].append(param)
+ if module.warn_on_half:
+ if torch.cuda.HalfTensor in buckets:
+ print("WARNING: gloo dist backend for half parameters may be extremely slow." +
+ " It is recommended to use the NCCL backend in this case. This currently requires" +
+ "PyTorch built from top of tree master.")
+ module.warn_on_half = False
+
+ for tp in buckets:
+ bucket = buckets[tp]
+ grads = [param.grad.data for param in bucket]
+ coalesced = _flatten_dense_tensors(grads)
+ dist.all_reduce(coalesced)
+ coalesced /= dist.get_world_size()
+ for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
+ buf.copy_(synced)
+
+ for param in list(module.parameters()):
+ def allreduce_hook(*unused):
+ Variable._execution_engine.queue_callback(allreduce_params)
+ if param.requires_grad:
+ param.register_hook(allreduce_hook)
+ dir(param)
+
+ def set_needs_reduction(self, input, output):
+ self.needs_reduction = True
+
+ module.register_forward_hook(set_needs_reduction)
+ return module
+
+
+def main(config, stdout_dir, args_str):
+ args_list = ['-u']
+ args_list.append('train.py')
+ args_list += args_str.split(' ') if len(args_str) > 0 else []
+
+ args_list.append('--config={}'.format(config))
+
+    num_gpus = torch.cuda.device_count()
+    # placeholder for --rank; it is overwritten per worker in the launch loop below
+    args_list.append('--rank={}'.format(0))
+    args_list.append("--group_name=group_{}".format(time.strftime("%Y_%m_%d-%H%M%S")))
+
+ if not os.path.isdir(stdout_dir):
+ os.makedirs(stdout_dir)
+ os.chmod(stdout_dir, 0o775)
+
+ workers = []
+
+ for i in range(num_gpus):
+ args_list[-2] = '--rank={}'.format(i)
+ stdout = None if i == 0 else open(
+ os.path.join(stdout_dir, "GPU_{}.log".format(i)), "w")
+ print(args_list)
+ p = subprocess.Popen([str(sys.executable)]+args_list, stdout=stdout)
+ workers.append(p)
+
+ for p in workers:
+ p.wait()
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-c', '--config', type=str, required=True,
+ help='JSON file for configuration')
+    parser.add_argument('-s', '--stdout_dir', type=str, default=".",
+                        help='directory to save stdout logs')
+ parser.add_argument(
+ '-a', '--args_str', type=str, default='',
+ help='double quoted string with space separated key value pairs')
+
+ args = parser.parse_args()
+ main(args.config, args.stdout_dir, args.args_str)
diff --git a/SqueezeWave/glow.py b/SqueezeWave/glow.py
new file mode 100644
index 0000000..f692103
--- /dev/null
+++ b/SqueezeWave/glow.py
@@ -0,0 +1,328 @@
+# We retain the copyright notice by NVIDIA from the original code. However,
+# we reserve our rights on the modifications based on the original code.
+#
+# *****************************************************************************
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of the NVIDIA CORPORATION nor the
+# names of its contributors may be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *****************************************************************************
+import torch
+from torch.autograd import Variable
+import torch.nn.functional as F
+import numpy as np
+
+
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+ n_channels_int = n_channels[0]
+ in_act = input_a+input_b
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+ acts = t_act * s_act
+ return acts
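+
+
+# Illustrative check (not part of the original code): the fused kernel above is
+# the WaveNet-style gated activation, tanh(.) * sigmoid(.), applied to the sum
+# of its inputs and split along the channel dimension. Shapes are example
+# values with n_channels = 4.
+def _example_gated_activation():
+    a = torch.randn(1, 8, 16)
+    b = torch.randn(1, 8, 16)
+    fused = fused_add_tanh_sigmoid_multiply(a, b, torch.IntTensor([4]))
+    manual = torch.tanh((a + b)[:, :4]) * torch.sigmoid((a + b)[:, 4:])
+    assert torch.allclose(fused, manual)
+    return fused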
+
+
+class Upsample1d(torch.nn.Module):
+ def __init__(self, scale=2):
+ super(Upsample1d, self).__init__()
+ self.scale = scale
+
+ def forward(self, x):
+ y = F.interpolate(
+ x, scale_factor=self.scale, mode='nearest')
+ return y
+
+
+class SqueezeWaveLoss(torch.nn.Module):
+ def __init__(self, sigma=1.0):
+ super(SqueezeWaveLoss, self).__init__()
+ self.sigma = sigma
+
+ def forward(self, model_output):
+ z, log_s_list, log_det_W_list = model_output
+ for i, log_s in enumerate(log_s_list):
+ if i == 0:
+ log_s_total = torch.sum(log_s)
+ log_det_W_total = log_det_W_list[i]
+ else:
+ log_s_total = log_s_total + torch.sum(log_s)
+ log_det_W_total += log_det_W_list[i]
+
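+        # This is the flow's negative log-likelihood (up to an additive
+        # constant): sum(z^2) / (2*sigma^2) comes from the spherical Gaussian
+        # prior on the latent z, while -sum(log s) and -sum(log|det W|) are the
+        # log-determinants of the affine couplings and the invertible 1x1
+        # convolutions; the total is averaged over every element of z.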
+ loss = torch.sum(z*z)/(2*self.sigma*self.sigma) - log_s_total - log_det_W_total
+ return loss/(z.size(0)*z.size(1)*z.size(2))
+
+
+class Invertible1x1Conv(torch.nn.Module):
+ """
+    The layer outputs both the convolution and the log determinant
+    of its weight matrix. If reverse=True it does convolution with the
+    inverse of its weight matrix.
+ """
+ def __init__(self, c):
+ super(Invertible1x1Conv, self).__init__()
+ self.conv = torch.nn.Conv1d(c, c, kernel_size=1, stride=1, padding=0,
+ bias=False)
+
+ # Sample a random orthonormal matrix to initialize weights
+ W = torch.qr(torch.FloatTensor(c, c).normal_())[0]
+
+ # Ensure determinant is 1.0 not -1.0
+ if torch.det(W) < 0:
+ W[:,0] = -1*W[:,0]
+ W = W.view(c, c, 1)
+ self.conv.weight.data = W
+
+ def forward(self, z, reverse=False):
+ # shape
+ batch_size, group_size, n_of_groups = z.size()
+ W = self.conv.weight.squeeze()
+
+ if reverse:
+ if not hasattr(self, 'W_inverse'):
+ # Reverse computation
+ W_inverse = W.float().inverse()
+ W_inverse = Variable(W_inverse[..., None])
+ if z.type() == 'torch.cuda.HalfTensor':
+ W_inverse = W_inverse.half()
+ self.W_inverse = W_inverse
+ z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0)
+ return z
+ else:
+ # Forward computation
+ log_det_W = batch_size * n_of_groups * torch.logdet(W)
+ z = self.conv(z)
+ return z, log_det_W
+
+
+class WN(torch.nn.Module):
+ """
+    This is the WaveNet-like layer used in the affine coupling. Unlike WaveNet,
+    the convolutions need not be causal. SqueezeWave fixes the dilation at 1 and
+    replaces each in-layer with a batch-normalized depthwise separable
+    convolution.
+ """
+ def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels,
+ kernel_size):
+ super(WN, self).__init__()
+ assert(kernel_size % 2 == 1)
+ assert(n_channels % 2 == 0)
+ self.n_layers = n_layers
+ self.n_channels = n_channels
+ self.in_layers = torch.nn.ModuleList()
+ self.res_skip_layers = torch.nn.ModuleList()
+ self.upsample = Upsample1d(2)
+ start = torch.nn.Conv1d(n_in_channels, n_channels, 1)
+ start = torch.nn.utils.weight_norm(start, name='weight')
+ self.start = start
+ end = torch.nn.Conv1d(n_channels, 2*n_in_channels, 1)
+ end.weight.data.zero_()
+ end.bias.data.zero_()
+ self.end = end
+
+ # cond_layer
+ cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels*n_layers, 1)
+ self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
+ for i in range(n_layers):
+ dilation = 1
+ padding = int((kernel_size*dilation - dilation)/2)
+ # depthwise separable convolution
+            depthwise = torch.nn.Conv1d(n_channels, n_channels, kernel_size,
+ dilation=dilation, padding=padding,
+ groups=n_channels).cuda()
+ pointwise = torch.nn.Conv1d(n_channels, 2*n_channels, 1).cuda()
+ bn = torch.nn.BatchNorm1d(n_channels)
+ self.in_layers.append(torch.nn.Sequential(bn, depthwise, pointwise))
+ # res_skip_layer
+ res_skip_layer = torch.nn.Conv1d(n_channels, n_channels, 1)
+ res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
+ self.res_skip_layers.append(res_skip_layer)
+
+ def forward(self, forward_input):
+ audio, spect = forward_input
+ audio = self.start(audio)
+ n_channels_tensor = torch.IntTensor([self.n_channels])
+ # pass all the mel_spectrograms to cond_layer
+ spect = self.cond_layer(spect)
+ for i in range(self.n_layers):
+ # split the corresponding mel_spectrogram
+ spect_offset = i*2*self.n_channels
+ spec = spect[:,spect_offset:spect_offset+2*self.n_channels,:]
+ if audio.size(2) > spec.size(2):
+ cond = self.upsample(spec)
+ else:
+ cond = spec
+ acts = fused_add_tanh_sigmoid_multiply(
+ self.in_layers[i](audio),
+ cond,
+ n_channels_tensor)
+ # res_skip
+ res_skip_acts = self.res_skip_layers[i](acts)
+ audio = audio + res_skip_acts
+ return self.end(audio)
+
+
+class SqueezeWave(torch.nn.Module):
+ def __init__(self, n_mel_channels, n_flows, n_audio_channel, n_early_every,
+ n_early_size, WN_config):
+ super(SqueezeWave, self).__init__()
+ assert(n_audio_channel % 2 == 0)
+ self.n_flows = n_flows
+ self.n_audio_channel = n_audio_channel
+ self.n_early_every = n_early_every
+ self.n_early_size = n_early_size
+ self.WN = torch.nn.ModuleList()
+ self.convinv = torch.nn.ModuleList()
+
+ n_half = int(n_audio_channel / 2)
+ # Set up layers with the right sizes based on how many dimensions
+ # have been output already
+ n_remaining_channels = n_audio_channel
+ for k in range(n_flows):
+ if k % self.n_early_every == 0 and k > 0:
+ n_half = n_half - int(self.n_early_size/2)
+ n_remaining_channels = n_remaining_channels - self.n_early_size
+ self.convinv.append(Invertible1x1Conv(n_remaining_channels))
+ self.WN.append(WN(n_half, n_mel_channels, **WN_config))
+
+ self.n_remaining_channels = n_remaining_channels # Useful during inference
+
+ def forward(self, forward_input):
+ """
+ forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames
+ forward_input[1] = audio: batch x time
+ """
+ spect, audio = forward_input
+
+ audio = audio.unfold(
+ 1, self.n_audio_channel, self.n_audio_channel).permute(0, 2, 1)
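+        # audio now has shape (batch, n_audio_channel, time // n_audio_channel):
+        # non-overlapping groups of n_audio_channel consecutive samples, so the
+        # flow treats each group as a vector of n_audio_channel "channels".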
+ output_audio = []
+ log_s_list = []
+ log_det_W_list = []
+
+ for k in range(self.n_flows):
+ if k % self.n_early_every == 0 and k > 0:
+ output_audio.append(audio[:,:self.n_early_size,:])
+ audio = audio[:,self.n_early_size:,:]
+
+ audio, log_det_W = self.convinv[k](audio)
+ log_det_W_list.append(log_det_W)
+
+ n_half = int(audio.size(1)/2)
+ audio_0 = audio[:,:n_half,:]
+ audio_1 = audio[:,n_half:,:]
+
+ output = self.WN[k]((audio_0, spect))
+ log_s = output[:, n_half:, :]
+ b = output[:, :n_half, :]
+
+ audio_1 = (torch.exp(log_s))*audio_1 + b
+ log_s_list.append(log_s)
+ audio = torch.cat([audio_0, audio_1], 1)
+
+ output_audio.append(audio)
+ return torch.cat(output_audio, 1), log_s_list, log_det_W_list
+
+ def infer(self, spect, sigma=1.0):
+ spect_size = spect.size()
+ l = spect.size(2)*(256 // self.n_audio_channel)
+ if spect.type() == 'torch.cuda.HalfTensor':
+ audio = torch.cuda.HalfTensor(spect.size(0),
+ self.n_remaining_channels,
+ l).normal_()
+ else:
+ audio = torch.cuda.FloatTensor(spect.size(0),
+ self.n_remaining_channels,
+ l).normal_()
+
+ for k in reversed(range(self.n_flows)):
+ n_half = int(audio.size(1)/2)
+ audio_0 = audio[:,:n_half,:]
+ audio_1 = audio[:,n_half:,:]
+ output = self.WN[k]((audio_0, spect))
+
+ s = output[:, n_half:, :]
+ b = output[:, :n_half, :]
+ audio_1 = (audio_1 - b)/torch.exp(s)
+ audio = torch.cat([audio_0, audio_1],1)
+
+ audio = self.convinv[k](audio, reverse=True)
+
+ if k % self.n_early_every == 0 and k > 0:
+ if spect.type() == 'torch.cuda.HalfTensor':
+ z = torch.cuda.HalfTensor(spect.size(0), self.n_early_size, l).normal_()
+ else:
+ z = torch.cuda.FloatTensor(spect.size(0), self.n_early_size, l).normal_()
+ audio = torch.cat((sigma*z, audio),1)
+
+ audio = audio.permute(0,2,1).contiguous().view(audio.size(0), -1).data
+ return audio
+
+
+ @staticmethod
+ def remove_weightnorm(model):
+ squeezewave = model
+ for WN in squeezewave.WN:
+ WN.start = torch.nn.utils.remove_weight_norm(WN.start)
+ WN.in_layers = remove_batch_norm(WN.in_layers)
+ WN.cond_layer = torch.nn.utils.remove_weight_norm(WN.cond_layer)
+ WN.res_skip_layers = remove(WN.res_skip_layers)
+ return squeezewave
+
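+# fuse_conv_and_bn folds a BatchNorm1d that runs *before* the depthwise Conv1d
+# (WN.in_layers uses Sequential(bn, depthwise, pointwise)) into the convolution
+# itself, so inference runs a single conv per in-layer. With
+#   bn(x) = gamma * (x - mean) / sqrt(var + eps) + beta,
+# the fused weight is W' = diag(gamma / sqrt(var + eps)) @ W and the bias gains
+# the per-channel term sum_k W[c, k] * (beta - gamma * mean / sqrt(var + eps))[c],
+# which is exact for the depthwise case (groups == in_channels) used here.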
+def fuse_conv_and_bn(conv, bn):
+ fusedconv = torch.nn.Conv1d(
+ conv.in_channels,
+ conv.out_channels,
+ kernel_size = conv.kernel_size,
+ padding=conv.padding,
+ bias=True,
+ groups=conv.groups)
+ w_conv = conv.weight.clone().view(conv.out_channels, -1)
+ w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps+bn.running_var)))
+ w_bn = w_bn.clone()
+ fusedconv.weight.data = torch.mm(w_bn, w_conv).view(fusedconv.weight.size())
+ if conv.bias is not None:
+ b_conv = conv.bias
+ else:
+ b_conv = torch.zeros( conv.weight.size(0) )
+ b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps))
+ b_bn = torch.unsqueeze(b_bn, 1)
+    bn_3 = b_bn.expand(-1, conv.kernel_size[0])
+ b = torch.matmul(w_conv, torch.transpose(bn_3, 0, 1))[range(b_bn.size()[0]), range(b_bn.size()[0])]
+ fusedconv.bias.data = ( b_conv + b )
+ return fusedconv
+
+def remove_batch_norm(conv_list):
+ new_conv_list = torch.nn.ModuleList()
+ for old_conv in conv_list:
+ depthwise = fuse_conv_and_bn(old_conv[1], old_conv[0])
+ pointwise = old_conv[2]
+ new_conv_list.append(torch.nn.Sequential(depthwise, pointwise))
+ return new_conv_list
+
+def remove(conv_list):
+ new_conv_list = torch.nn.ModuleList()
+ for old_conv in conv_list:
+ old_conv = torch.nn.utils.remove_weight_norm(old_conv)
+ new_conv_list.append(old_conv)
+ return new_conv_list
+
diff --git a/SqueezeWave/inference.py b/SqueezeWave/inference.py
new file mode 100644
index 0000000..568e6ce
--- /dev/null
+++ b/SqueezeWave/inference.py
@@ -0,0 +1,87 @@
+# We retain the copyright notice by NVIDIA from the original code. However,
+# we reserve our rights on the modifications based on the original code.
+#
+# *****************************************************************************
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of the NVIDIA CORPORATION nor the
+# names of its contributors may be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *****************************************************************************
+import os
+from scipy.io.wavfile import write
+import torch
+from mel2samp import files_to_list, MAX_WAV_VALUE
+from denoiser import Denoiser
+
+
+def main(mel_files, squeezewave_path, sigma, output_dir, sampling_rate, is_fp16,
+ denoiser_strength):
+ mel_files = files_to_list(mel_files)
+ squeezewave = torch.load(squeezewave_path)['model']
+ squeezewave = squeezewave.remove_weightnorm(squeezewave)
+ squeezewave.cuda().eval()
+ if is_fp16:
+ from apex import amp
+ squeezewave, _ = amp.initialize(squeezewave, [], opt_level="O3")
+
+ if denoiser_strength > 0:
+ denoiser = Denoiser(squeezewave).cuda()
+
+ for i, file_path in enumerate(mel_files):
+ file_name = os.path.splitext(os.path.basename(file_path))[0]
+ mel = torch.load(file_path)
+ mel = torch.autograd.Variable(mel.cuda())
+ mel = torch.unsqueeze(mel, 0)
+ mel = mel.half() if is_fp16 else mel
+ with torch.no_grad():
+ audio = squeezewave.infer(mel, sigma=sigma).float()
+ if denoiser_strength > 0:
+ audio = denoiser(audio, denoiser_strength)
+ audio = audio * MAX_WAV_VALUE
+ audio = audio.squeeze()
+ audio = audio.cpu().numpy()
+ audio = audio.astype('int16')
+ audio_path = os.path.join(
+ output_dir, "{}_synthesis.wav".format(file_name))
+ write(audio_path, sampling_rate, audio)
+ print(audio_path)
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-f', "--filelist_path", required=True)
+ parser.add_argument('-w', '--squeezewave_path',
+ help='Path to squeezewave decoder checkpoint with model')
+ parser.add_argument('-o', "--output_dir", required=True)
+ parser.add_argument("-s", "--sigma", default=1.0, type=float)
+ parser.add_argument("--sampling_rate", default=22050, type=int)
+ parser.add_argument("--is_fp16", action="store_true")
+ parser.add_argument("-d", "--denoiser_strength", default=0.0, type=float,
+ help='Removes model bias. Start with 0.1 and adjust')
+
+ args = parser.parse_args()
+
+ main(args.filelist_path, args.squeezewave_path, args.sigma, args.output_dir,
+ args.sampling_rate, args.is_fp16, args.denoiser_strength)
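+
+# Example invocation (illustrative; file names and paths are assumptions):
+#   python inference.py -f mel_files.txt -w squeezewave_checkpoint \
+#       -o synth_out/ -s 1.0 -d 0.1
+# where mel_files.txt lists the .pt mel-spectrogram tensors written by
+# mel2samp.py and squeezewave_checkpoint is a checkpoint saved by train.py.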
diff --git a/SqueezeWave/mel2samp.py b/SqueezeWave/mel2samp.py
new file mode 100644
index 0000000..0419202
--- /dev/null
+++ b/SqueezeWave/mel2samp.py
@@ -0,0 +1,150 @@
+# We retain the copyright notice by NVIDIA from the original code. However,
+# we reserve our rights on the modifications based on the original code.
+#
+# *****************************************************************************
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of the NVIDIA CORPORATION nor the
+# names of its contributors may be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *****************************************************************************
+import os
+import random
+import argparse
+import json
+import torch
+import torch.utils.data
+import sys
+from scipy.io.wavfile import read
+
+# We're using the audio processing from Tacotron 2 to make sure it matches
+from TacotronSTFT import TacotronSTFT
+
+MAX_WAV_VALUE = 32768.0
+
+def files_to_list(filename):
+ """
+ Takes a text file of filenames and makes a list of filenames
+ """
+ with open(filename, encoding='utf-8') as f:
+ files = f.readlines()
+
+ files = [f.rstrip() for f in files]
+ return files
+
+def load_wav_to_torch(full_path):
+ """
+ Loads wavdata into torch array
+ """
+ sampling_rate, data = read(full_path)
+ return torch.from_numpy(data).float(), sampling_rate
+
+
+class Mel2Samp(torch.utils.data.Dataset):
+ """
+ This is the main class that calculates the spectrogram and returns the
+ spectrogram, audio pair.
+ """
+ def __init__(self, n_audio_channel, training_files, segment_length,
+ filter_length, hop_length, win_length, sampling_rate, mel_fmin,
+ mel_fmax):
+ self.audio_files = files_to_list(training_files)
+ random.seed(1234)
+ random.shuffle(self.audio_files)
+ self.stft = TacotronSTFT(filter_length=filter_length,
+ hop_length=hop_length,
+ win_length=win_length,
+ sampling_rate=sampling_rate,
+ mel_fmin=mel_fmin, mel_fmax=mel_fmax,
+ n_group=n_audio_channel)
+ self.segment_length = segment_length
+ self.sampling_rate = sampling_rate
+
+ def get_mel(self, audio):
+ audio_norm = audio / MAX_WAV_VALUE
+ audio_norm = audio_norm.unsqueeze(0)
+ audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
+ melspec = self.stft.mel_spectrogram(audio_norm)
+ melspec = torch.squeeze(melspec, 0)
+ return melspec
+
+ def __getitem__(self, index):
+ # Read audio
+ filename = self.audio_files[index]
+ audio, sampling_rate = load_wav_to_torch(filename)
+ if sampling_rate != self.sampling_rate:
+ raise ValueError("{} SR doesn't match target {} SR".format(
+ sampling_rate, self.sampling_rate))
+
+ # Take segment
+ if audio.size(0) >= self.segment_length:
+ max_audio_start = audio.size(0) - self.segment_length
+ audio_start = random.randint(0, max_audio_start)
+ audio = audio[audio_start:audio_start+self.segment_length]
+ else:
+ audio = torch.nn.functional.pad(
+ audio, (0, self.segment_length - audio.size(0)),
+ 'constant').data
+
+ mel = self.get_mel(audio)
+ audio = audio / MAX_WAV_VALUE
+
+ return (mel, audio)
+
+ def __len__(self):
+ return len(self.audio_files)
+
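+# Illustrative usage sketch (not part of the original code). The hyperparameters
+# and the filelist name below are example values, not necessarily the shipped
+# configuration.
+def _example_mel2samp_usage():
+    dataset = Mel2Samp(n_audio_channel=128, training_files='train_files.txt',
+                       segment_length=16384, filter_length=1024, hop_length=256,
+                       win_length=1024, sampling_rate=22050,
+                       mel_fmin=0.0, mel_fmax=8000.0)
+    # mel: (n_mel_channels, frames), audio: (segment_length,) in [-1, 1]
+    mel, audio = dataset[0]
+    return mel, audio
+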
+# ===================================================================
+# Takes a filelist of clean audio and writes mel spectrograms to a directory
+# Useful for making test sets
+# ===================================================================
+if __name__ == "__main__":
+ # Get defaults so it can work with no Sacred
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-f', "--filelist_path", required=True)
+ parser.add_argument('-c', '--config', type=str,
+ help='JSON file for configuration')
+ parser.add_argument('-o', '--output_dir', type=str,
+ help='Output directory')
+ args = parser.parse_args()
+
+ with open(args.config) as f:
+ data = f.read()
+ config = json.loads(data)
+ data_config = config["data_config"]
+ squeezewave_config = config["squeezewave_config"]
+ mel2samp = Mel2Samp(squeezewave_config['n_audio_channel'], **data_config)
+
+ filepaths = files_to_list(args.filelist_path)
+
+ # Make directory if it doesn't exist
+ if not os.path.isdir(args.output_dir):
+ os.makedirs(args.output_dir)
+ os.chmod(args.output_dir, 0o775)
+
+ for filepath in filepaths:
+ audio, sr = load_wav_to_torch(filepath)
+ melspectrogram = mel2samp.get_mel(audio)
+ filename = os.path.basename(filepath)
+ new_filepath = args.output_dir + '/' + filename + '.pt'
+ print(new_filepath)
+ torch.save(melspectrogram, new_filepath)
diff --git a/SqueezeWave/requirements.txt b/SqueezeWave/requirements.txt
new file mode 100644
index 0000000..cf54599
--- /dev/null
+++ b/SqueezeWave/requirements.txt
@@ -0,0 +1,8 @@
+torch==1.0
+matplotlib==2.1.0
+numpy==1.13.3
+inflect==0.2.5
+librosa==0.6.0
+scipy==1.0.0
+tensorboardX==1.1
+Unidecode==1.0.22
diff --git a/SqueezeWave/stft.py b/SqueezeWave/stft.py
new file mode 100644
index 0000000..98b56e5
--- /dev/null
+++ b/SqueezeWave/stft.py
@@ -0,0 +1,147 @@
+"""
+We retain the copyright notice from the original author. However, we reserve
+our rights on the modifications based on the original code.
+
+BSD 3-Clause License
+
+Copyright (c) 2017, Prem Seetharaman
+All rights reserved.
+
+* Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice, this
+ list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""
+
+import torch
+import numpy as np
+import torch.nn.functional as F
+from torch.autograd import Variable
+from scipy.signal import get_window
+from librosa.util import pad_center, tiny
+from audio_processing import window_sumsquare
+
+
+class STFT(torch.nn.Module):
+ """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
+ def __init__(self, filter_length=800, hop_length=200, win_length=800,
+ window='hann', n_group=256):
+ super(STFT, self).__init__()
+ self.filter_length = filter_length
+ self.hop_length = hop_length
+ self.win_length = win_length
+ self.window = window
+ self.forward_transform = None
+ self.n_group = n_group
+ scale = self.filter_length / self.hop_length
+ fourier_basis = np.fft.fft(np.eye(self.filter_length))
+
+ cutoff = int((self.filter_length / 2 + 1))
+ fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]),
+ np.imag(fourier_basis[:cutoff, :])])
+
+ forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
+ inverse_basis = torch.FloatTensor(
+ np.linalg.pinv(scale * fourier_basis).T[:, None, :])
+
+ if window is not None:
+            assert(filter_length >= win_length)
+ # get window and zero center pad it to filter_length
+ fft_window = get_window(window, win_length, fftbins=True)
+ fft_window = pad_center(fft_window, filter_length)
+ fft_window = torch.from_numpy(fft_window).float()
+
+ # window the bases
+ forward_basis *= fft_window
+ inverse_basis *= fft_window
+
+ self.register_buffer('forward_basis', forward_basis.float())
+ self.register_buffer('inverse_basis', inverse_basis.float())
+
+ def transform(self, input_data):
+ num_batches = input_data.size(0)
+ num_samples = input_data.size(1)
+
+ self.num_samples = num_samples
+
+ # similar to librosa, reflect-pad the input
+ input_data = input_data.view(num_batches, 1, num_samples)
+ pad = ((64 - 1) * self.hop_length + self.filter_length - num_samples) // 2
+ if pad < 0:
+ pad = self.filter_length // 2
+ input_data = F.pad(
+ input_data.unsqueeze(1),
+ (int(pad), int(pad), 0, 0),
+ mode='reflect')
+ input_data = input_data.squeeze(1)
+
+ forward_transform = F.conv1d(
+ input_data,
+ Variable(self.forward_basis, requires_grad=False),
+ stride=self.hop_length,
+ padding=0)
+
+ cutoff = int((self.filter_length / 2) + 1)
+ real_part = forward_transform[:, :cutoff, :]
+ imag_part = forward_transform[:, cutoff:, :]
+
+ magnitude = torch.sqrt(real_part**2 + imag_part**2)
+ phase = torch.autograd.Variable(
+ torch.atan2(imag_part.data, real_part.data))
+
+ return magnitude, phase
+
+ def inverse(self, magnitude, phase):
+ recombine_magnitude_phase = torch.cat(
+ [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1)
+
+ inverse_transform = F.conv_transpose1d(
+ recombine_magnitude_phase,
+ Variable(self.inverse_basis, requires_grad=False),
+ stride=self.hop_length,
+ padding=0)
+
+ if self.window is not None:
+ window_sum = window_sumsquare(
+ self.window, magnitude.size(-1), hop_length=self.hop_length,
+ win_length=self.win_length, n_fft=self.filter_length,
+ dtype=np.float32)
+ # remove modulation effects
+ approx_nonzero_indices = torch.from_numpy(
+ np.where(window_sum > tiny(window_sum))[0])
+ window_sum = torch.autograd.Variable(
+ torch.from_numpy(window_sum), requires_grad=False)
+ inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]
+
+ # scale by hop ratio
+ inverse_transform *= float(self.filter_length) / self.hop_length
+
+ inverse_transform = inverse_transform[:, :, int(self.filter_length/2):]
+        inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2)]
+
+ return inverse_transform
+
+ def forward(self, input_data):
+ self.magnitude, self.phase = self.transform(input_data)
+ reconstruction = self.inverse(self.magnitude, self.phase)
+ return reconstruction
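+
+
+# Illustrative round-trip sketch (not part of the original code). Shapes are
+# example values; because of the padding and cropping above, the reconstruction
+# can come back slightly shorter than the input.
+def _example_stft_roundtrip():
+    stft = STFT(filter_length=1024, hop_length=256, win_length=1024)
+    audio = torch.randn(1, 16384)              # (batch, samples)
+    magnitude, phase = stft.transform(audio)   # each (batch, 513, frames)
+    reconstruction = stft.inverse(magnitude, phase)
+    return reconstruction                      # (batch, 1, ~samples)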
diff --git a/SqueezeWave/train.py b/SqueezeWave/train.py
new file mode 100644
index 0000000..7cbc2e1
--- /dev/null
+++ b/SqueezeWave/train.py
@@ -0,0 +1,203 @@
+# We retain the copyright notice by NVIDIA from the original code. However,
+# we reserve our rights on the modifications based on the original code.
+#
+# *****************************************************************************
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of the NVIDIA CORPORATION nor the
+# names of its contributors may be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *****************************************************************************
+import argparse
+import json
+import os
+import torch
+
+#=====START: ADDED FOR DISTRIBUTED======
+from distributed import init_distributed, apply_gradient_allreduce, reduce_tensor
+from torch.utils.data.distributed import DistributedSampler
+#=====END: ADDED FOR DISTRIBUTED======
+
+from torch.utils.data import DataLoader
+from glow import SqueezeWave, SqueezeWaveLoss
+from mel2samp import Mel2Samp
+
+def load_checkpoint(
+ checkpoint_path, model, optimizer, n_flows, n_early_every,
+ n_early_size, n_mel_channels, n_audio_channel, WN_config):
+
+ assert os.path.isfile(checkpoint_path)
+
+ checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
+ iteration = checkpoint_dict['iteration']
+ #iteration = 1
+ optimizer.load_state_dict(checkpoint_dict['optimizer'])
+ model_for_loading = checkpoint_dict['model']
+ state_dict = model_for_loading.state_dict()
+
+    model.load_state_dict(state_dict, strict=False)
+    print("Loaded checkpoint '{}' (iteration {})".format(checkpoint_path, iteration))
+
+ return model, optimizer, iteration
+
+def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
+ print("Saving model and optimizer state at iteration {} to {}".format(
+ iteration, filepath))
+ model_for_saving = SqueezeWave(**squeezewave_config).cuda()
+ model_for_saving.load_state_dict(model.state_dict())
+ torch.save({'model': model_for_saving,
+ 'iteration': iteration,
+ 'optimizer': optimizer.state_dict(),
+ 'learning_rate': learning_rate}, filepath)
+
+def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
+ sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
+ checkpoint_path, with_tensorboard):
+ torch.manual_seed(seed)
+ torch.cuda.manual_seed(seed)
+ #=====START: ADDED FOR DISTRIBUTED======
+ if num_gpus > 1:
+ init_distributed(rank, num_gpus, group_name, **dist_config)
+ #=====END: ADDED FOR DISTRIBUTED======
+
+ criterion = SqueezeWaveLoss(sigma)
+ model = SqueezeWave(**squeezewave_config).cuda()
+ print(model)
+ pytorch_total_params = sum(p.numel() for p in model.parameters())
+ pytorch_total_params_train = sum(p.numel() for p in model.parameters() if p.requires_grad)
+ print("param", pytorch_total_params)
+ print("param trainable", pytorch_total_params_train)
+
+ #=====START: ADDED FOR DISTRIBUTED======
+ if num_gpus > 1:
+ model = apply_gradient_allreduce(model)
+ #=====END: ADDED FOR DISTRIBUTED======
+
+ optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
+
+ if fp16_run:
+ from apex import amp
+ model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
+
+ # Load checkpoint if one exists
+ iteration = 0
+ if checkpoint_path != "":
+ model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
+ optimizer, **squeezewave_config)
+ iteration += 1 # next iteration is iteration + 1
+
+ n_audio_channel = squeezewave_config["n_audio_channel"]
+ trainset = Mel2Samp(n_audio_channel, **data_config)
+ # =====START: ADDED FOR DISTRIBUTED======
+ train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
+ # =====END: ADDED FOR DISTRIBUTED======
+ train_loader = DataLoader(trainset, num_workers=0, shuffle=False,
+ sampler=train_sampler,
+ batch_size=batch_size,
+ pin_memory=False,
+ drop_last=True)
+
+ # Get shared output_directory ready
+ if rank == 0:
+ if not os.path.isdir(output_directory):
+ os.makedirs(output_directory)
+ os.chmod(output_directory, 0o775)
+ print("output directory", output_directory)
+
+ if with_tensorboard and rank == 0:
+ from tensorboardX import SummaryWriter
+ logger = SummaryWriter(os.path.join(output_directory, 'logs'))
+
+ model.train()
+ epoch_offset = max(0, int(iteration / len(train_loader)))
+    # ================ MAIN TRAINING LOOP! ===================
+ for epoch in range(epoch_offset, epochs):
+ print("Epoch: {}".format(epoch))
+ for i, batch in enumerate(train_loader):
+ model.zero_grad()
+
+ mel, audio = batch
+ mel = torch.autograd.Variable(mel.cuda())
+ audio = torch.autograd.Variable(audio.cuda())
+ outputs = model((mel, audio))
+
+ loss = criterion(outputs)
+ if num_gpus > 1:
+ reduced_loss = reduce_tensor(loss.data, num_gpus).item()
+ else:
+ reduced_loss = loss.item()
+
+ if fp16_run:
+ with amp.scale_loss(loss, optimizer) as scaled_loss:
+ scaled_loss.backward()
+ else:
+ loss.backward()
+
+ optimizer.step()
+
+ print("{}:\t{:.9f}\t".format(iteration, reduced_loss))
+ if with_tensorboard and rank == 0:
+ logger.add_scalar('training_loss', reduced_loss, i + len(train_loader) * epoch)
+ if (iteration % iters_per_checkpoint == 0):
+ if rank == 0:
+ checkpoint_path = "{}/SqueezeWave_{}".format(
+ output_directory, iteration)
+ save_checkpoint(model, optimizer, learning_rate, iteration,
+ checkpoint_path)
+
+ iteration += 1
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-c', '--config', type=str,
+ help='JSON file for configuration')
+ parser.add_argument('-r', '--rank', type=int, default=0,
+ help='rank of process for distributed')
+ parser.add_argument('-g', '--group_name', type=str, default='',
+ help='name of group for distributed')
+ args = parser.parse_args()
+
+ # Parse configs. Globals nicer in this case
+ with open(args.config) as f:
+ data = f.read()
+ config = json.loads(data)
+ train_config = config["train_config"]
+ global data_config
+ data_config = config["data_config"]
+ global dist_config
+ dist_config = config["dist_config"]
+ global squeezewave_config
+ squeezewave_config = config["squeezewave_config"]
+
+ num_gpus = torch.cuda.device_count()
+ if num_gpus > 1:
+ if args.group_name == '':
+ print("WARNING: Multiple GPUs detected but no distributed group set")
+ print("Only running 1 GPU. Use distributed.py for multiple GPUs")
+ num_gpus = 1
+
+ if num_gpus == 1 and args.rank != 0:
+ raise Exception("Doing single GPU training on rank > 0")
+
+ torch.backends.cudnn.enabled = True
+ torch.backends.cudnn.benchmark = False
+ train(num_gpus, args.rank, args.group_name, **train_config)
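+
+# Example invocations (illustrative; the config path is an assumption):
+#   single GPU:     python train.py -c config.json
+#   multiple GPUs:  python distributed.py -c config.json -s logs/
+# distributed.py launches one train.py process per visible GPU and fills in the
+# per-process --rank and a shared --group_name automatically.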