{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tacotron 2 inference code\n",
    "Edit the variables **checkpoint_path** and **text** to match yours, then run the entire notebook to plot the mel outputs and alignments and to synthesize audio from the generated mel-spectrogram using WaveGlow or, as a simpler fallback, Griffin-Lim."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Import libraries and setup matplotlib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/dcg-adlr-rafaelvalle-source.cosmos597/repos/nvidia/tacotron2/plotting_utils.py:2: UserWarning: matplotlib.pyplot as already been imported, this call will have no effect.\n",
      "  matplotlib.use(\"Agg\")\n"
     ]
    }
   ],
   "source": [
    "import matplotlib\n",
    "matplotlib.use(\"Agg\")\n",
    "import matplotlib.pylab as plt\n",
    "%matplotlib inline\n",
    "import IPython.display as ipd\n",
    "\n",
    "import sys\n",
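    "# Make the local WaveGlow code importable\n",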
"sys.path.append('waveglow/')\n",
|
|
"import numpy as np\n",
|
|
"import torch\n",
|
|
"\n",
|
|
"from hparams import create_hparams\n",
|
|
"from model import Tacotron2\n",
|
|
"from layers import TacotronSTFT\n",
|
|
"from audio_processing import griffin_lim\n",
|
|
"from train import load_model\n",
|
|
"from text import text_to_sequence\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def plot_data(data, figsize=(16, 4)):\n",
|
|
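    "    # Show each array (mel spectrograms and the attention alignment) side by side\n",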
" fig, axes = plt.subplots(1, len(data), figsize=figsize)\n",
|
|
" for i in range(len(data)):\n",
|
|
" axes[i].imshow(data[i], aspect='auto', origin='bottom', \n",
|
|
" interpolation='none')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Setup hparams"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"hparams = create_hparams(\"distributed_run=False,mask_padding=False\")\n",
|
|
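    "# Audio/STFT parameters below must match those used to train the checkpoint\n",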
"hparams.sampling_rate = 22050\n",
|
|
"hparams.filter_length = 1024\n",
|
|
"hparams.hop_length = 256\n",
|
|
"hparams.win_length = 1024"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Load model from checkpoint"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"checkpoint_path = \"tacotron2_statedict\"\n",
|
|
"\n",
|
|
"model = load_model(hparams)\n",
|
|
"try:\n",
|
|
" model = model.module\n",
|
|
"except:\n",
|
|
" pass\n",
|
|
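    "# Strip the 'module.' prefix so checkpoints saved with DataParallel/distributed training load cleanly\n",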
"model.load_state_dict({k.replace('module.',''):v for k,v in torch.load(checkpoint_path)['state_dict'].items()})\n",
|
|
"_ = model.eval()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Load WaveGlow for mel2audio synthesis"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"waveglow_path = 'waveglow_old.pt'\n",
|
|
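    "# The checkpoint stores the full WaveGlow model object under the 'model' key\n",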
"waveglow = torch.load(waveglow_path)['model']"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Prepare text input"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"text = \"Waveglow is really awesome!\"\n",
|
|
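    "# Convert the text to a sequence of symbol IDs and add a batch dimension\n",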
"sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]\n",
|
|
"sequence = torch.autograd.Variable(\n",
|
|
" torch.from_numpy(sequence)).cuda().long()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Decode text input and plot results"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)\n",
|
|
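    "# Mel output before and after the postnet, plus the attention alignment (transposed)\n",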
"plot_data((mel_outputs.data.cpu().numpy()[0],\n",
|
|
" mel_outputs_postnet.data.cpu().numpy()[0],\n",
|
|
" alignments.data.cpu().numpy()[0].T))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Synthesize audio from spectrogram using WaveGlow"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"with torch.no_grad():\n",
|
|
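    "    # sigma scales the Gaussian noise WaveGlow samples at inference; lower values typically sound cleaner but less lively\n",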
" audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)\n",
|
|
"ipd.Audio(audio[0].data.cpu().numpy(), rate=hparams.sampling_rate)"
|
|
]
|
|
}
|
|
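  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### (Optional) Synthesize audio with Griffin-Lim\n",
    "A minimal vocoder-free sketch using the `TacotronSTFT` and `griffin_lim` helpers imported above: invert the mel-spectrogram to a linear-frequency magnitude spectrogram via the mel filterbank, then estimate phase with Griffin-Lim. The mel-to-linear scaling factor and the iteration count below are illustrative assumptions, and the result will sound noticeably worse than WaveGlow."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build an STFT with the same parameters used for the mel-spectrogram\n",
    "taco_stft = TacotronSTFT(\n",
    "    hparams.filter_length, hparams.hop_length, hparams.win_length,\n",
    "    sampling_rate=hparams.sampling_rate)\n",
    "# Undo the log compression, then project back to a linear spectrogram with the mel basis\n",
    "mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)\n",
    "mel_decompress = mel_decompress.transpose(1, 2).data.cpu()\n",
    "spec_from_mel_scaling = 1000  # assumed gain so the reconstruction is audible\n",
    "spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)\n",
    "spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)\n",
    "spec_from_mel = spec_from_mel * spec_from_mel_scaling\n",
    "\n",
    "# Estimate phase and recover a waveform with 60 Griffin-Lim iterations (assumed count)\n",
    "waveform = griffin_lim(\n",
    "    torch.autograd.Variable(spec_from_mel[:, :, :-1]), taco_stft.stft_fn, 60)\n",
    "ipd.Audio(waveform[0].data.cpu().numpy(), rate=hparams.sampling_rate)"
   ]
  }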
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}