import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
from collections import OrderedDict

from transformer.SubLayers import MultiHeadAttention, PositionwiseFeedForward
from text.symbols import symbols

class Linear(nn.Module):
    """Linear module with Xavier-initialized weights."""

    def __init__(self, in_dim, out_dim, bias=True, w_init='linear'):
        """
        :param in_dim: dimension of the input
        :param out_dim: dimension of the output
        :param bias: if True, a bias term is included
        :param w_init: nonlinearity name used to compute the Xavier gain
        """
        super(Linear, self).__init__()
        self.linear_layer = nn.Linear(in_dim, out_dim, bias=bias)

        nn.init.xavier_uniform_(
            self.linear_layer.weight,
            gain=nn.init.calculate_gain(w_init))

    def forward(self, x):
        return self.linear_layer(x)

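# Usage sketch (illustrative values, not from the original configuration):
# `w_init` is passed to nn.init.calculate_gain, so it should be a
# nonlinearity name such as 'linear', 'relu' or 'tanh', e.g.
#   fc = Linear(256, 256, w_init='relu')
#   out = fc(torch.randn(16, 100, 256))   # operates on the last dimension
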
class PreNet(nn.Module):
    """Pre-net applied before the main network: two linear layers,
    each followed by ReLU and dropout."""

    def __init__(self, input_size, hidden_size, output_size, p=0.5):
        """
        :param input_size: dimension of the input
        :param hidden_size: dimension of the hidden units
        :param output_size: dimension of the output
        :param p: dropout probability
        """
        super(PreNet, self).__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.layer = nn.Sequential(OrderedDict([
            ('fc1', Linear(self.input_size, self.hidden_size)),
            ('relu1', nn.ReLU()),
            ('dropout1', nn.Dropout(p)),
            ('fc2', Linear(self.hidden_size, self.output_size)),
            ('relu2', nn.ReLU()),
            ('dropout2', nn.Dropout(p)),
        ]))

    def forward(self, input_):
        out = self.layer(input_)
        return out

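# Usage sketch (illustrative sizes, not from the original configuration):
# the pre-net maps (batch, time, input_size) to (batch, time, output_size).
#   prenet = PreNet(input_size=80, hidden_size=256, output_size=256)
#   out = prenet(torch.randn(16, 100, 80))   # -> (16, 100, 256)
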
class Conv(nn.Module):
    """1-D convolution module with Xavier-initialized weights."""

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=1,
                 stride=1,
                 padding=0,
                 dilation=1,
                 bias=True,
                 w_init='linear'):
        """
        :param in_channels: number of input channels
        :param out_channels: number of output channels
        :param kernel_size: size of the kernel
        :param stride: size of the stride
        :param padding: size of the padding
        :param dilation: dilation rate
        :param bias: if True, a bias term is included
        :param w_init: nonlinearity name used to compute the Xavier gain
        """
        super(Conv, self).__init__()

        self.conv = nn.Conv1d(in_channels,
                              out_channels,
                              kernel_size=kernel_size,
                              stride=stride,
                              padding=padding,
                              dilation=dilation,
                              bias=bias)

        nn.init.xavier_uniform_(
            self.conv.weight, gain=nn.init.calculate_gain(w_init))

    def forward(self, x):
        x = self.conv(x)
        return x

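# Note that nn.Conv1d expects channel-first input, i.e. (batch, channels, time).
# Usage sketch (illustrative sizes, not from the original configuration):
#   conv = Conv(in_channels=80, out_channels=256, kernel_size=3, padding=1)
#   out = conv(torch.randn(16, 80, 100))   # -> (16, 256, 100)
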
class FFTBlock(torch.nn.Module):
    """FFT block: multi-head self-attention followed by a position-wise
    feed-forward network, with non-pad masking applied after each sub-layer."""

    def __init__(self,
                 d_model,
                 d_inner,
                 n_head,
                 d_k,
                 d_v,
                 dropout=0.1):
        super(FFTBlock, self).__init__()
        self.slf_attn = MultiHeadAttention(
            n_head, d_model, d_k, d_v, dropout=dropout)
        self.pos_ffn = PositionwiseFeedForward(
            d_model, d_inner, dropout=dropout)

    def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
        # Self-attention over the input sequence.
        enc_output, enc_slf_attn = self.slf_attn(
            enc_input, enc_input, enc_input, mask=slf_attn_mask)
        if non_pad_mask is not None:
            enc_output *= non_pad_mask

        # Position-wise feed-forward network.
        enc_output = self.pos_ffn(enc_output)
        if non_pad_mask is not None:
            enc_output *= non_pad_mask

        return enc_output, enc_slf_attn

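# Mask shapes here are an assumption based on the broadcasting above: with
# enc_input of shape (batch, time, d_model), slf_attn_mask is expected to
# broadcast against the (time, time) attention scores, and non_pad_mask
# against the (batch, time, d_model) sub-layer outputs, e.g. a
# (batch, time, 1) float mask that zeroes padded positions.
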
class ConvNorm(torch.nn.Module):
    """1-D convolution with Xavier-initialized weights. If no padding is
    given, it defaults to preserving the input length for odd kernel sizes
    at stride 1."""

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=1,
                 stride=1,
                 padding=None,
                 dilation=1,
                 bias=True,
                 w_init_gain='linear'):
        super(ConvNorm, self).__init__()

        if padding is None:
            # 'Same'-style padding requires an odd kernel size.
            assert kernel_size % 2 == 1
            padding = int(dilation * (kernel_size - 1) / 2)

        self.conv = torch.nn.Conv1d(in_channels,
                                    out_channels,
                                    kernel_size=kernel_size,
                                    stride=stride,
                                    padding=padding,
                                    dilation=dilation,
                                    bias=bias)

        torch.nn.init.xavier_uniform_(
            self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, signal):
        conv_signal = self.conv(signal)
        return conv_signal

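# Usage sketch (illustrative sizes, not from the original configuration):
# with the default padding rule above, an odd kernel size preserves the
# time dimension at stride 1.
#   conv = ConvNorm(80, 512, kernel_size=5)
#   out = conv(torch.randn(16, 80, 100))   # -> (16, 512, 100)
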
class PostNet(nn.Module):
    """PostNet: five 1-D convolutions with 512 channels and kernel size 5,
    each followed by batch normalization, with tanh on all but the last layer."""

    def __init__(self,
                 n_mel_channels=80,
                 postnet_embedding_dim=512,
                 postnet_kernel_size=5,
                 postnet_n_convolutions=5):
        super(PostNet, self).__init__()
        self.convolutions = nn.ModuleList()

        # First convolution: mel channels -> embedding dimension.
        self.convolutions.append(
            nn.Sequential(
                ConvNorm(n_mel_channels,
                         postnet_embedding_dim,
                         kernel_size=postnet_kernel_size,
                         stride=1,
                         padding=int((postnet_kernel_size - 1) / 2),
                         dilation=1,
                         w_init_gain='tanh'),
                nn.BatchNorm1d(postnet_embedding_dim))
        )

        # Intermediate convolutions: embedding dimension -> embedding dimension.
        for _ in range(1, postnet_n_convolutions - 1):
            self.convolutions.append(
                nn.Sequential(
                    ConvNorm(postnet_embedding_dim,
                             postnet_embedding_dim,
                             kernel_size=postnet_kernel_size,
                             stride=1,
                             padding=int((postnet_kernel_size - 1) / 2),
                             dilation=1,
                             w_init_gain='tanh'),
                    nn.BatchNorm1d(postnet_embedding_dim))
            )

        # Last convolution: embedding dimension -> mel channels, linear output.
        self.convolutions.append(
            nn.Sequential(
                ConvNorm(postnet_embedding_dim,
                         n_mel_channels,
                         kernel_size=postnet_kernel_size,
                         stride=1,
                         padding=int((postnet_kernel_size - 1) / 2),
                         dilation=1,
                         w_init_gain='linear'),
                nn.BatchNorm1d(n_mel_channels))
        )

    def forward(self, x):
        # (batch, time, n_mel_channels) -> (batch, n_mel_channels, time)
        x = x.contiguous().transpose(1, 2)

        for i in range(len(self.convolutions) - 1):
            x = F.dropout(torch.tanh(self.convolutions[i](x)),
                          0.5, training=self.training)
        x = F.dropout(self.convolutions[-1](x), 0.5, training=self.training)

        # (batch, n_mel_channels, time) -> (batch, time, n_mel_channels)
        x = x.contiguous().transpose(1, 2)
        return x
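

if __name__ == "__main__":
    # Quick shape sanity checks: a sketch with illustrative sizes, not values
    # taken from the original configuration. Run this file directly to execute.
    batch, time_steps, n_mels = 2, 50, 80

    prenet = PreNet(input_size=n_mels, hidden_size=256, output_size=256)
    print(prenet(torch.randn(batch, time_steps, n_mels)).shape)   # (2, 50, 256)

    conv = Conv(in_channels=n_mels, out_channels=128, kernel_size=3, padding=1)
    print(conv(torch.randn(batch, n_mels, time_steps)).shape)     # (2, 128, 50)

    postnet = PostNet(n_mel_channels=n_mels)
    print(postnet(torch.randn(batch, time_steps, n_mels)).shape)  # (2, 50, 80)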