import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
from collections import OrderedDict

from transformer.SubLayers import MultiHeadAttention, PositionwiseFeedForward
from text.symbols import symbols


class Linear(nn.Module):
    """
    Linear Module
    """

    def __init__(self, in_dim, out_dim, bias=True, w_init='linear'):
        """
        :param in_dim: dimension of input
        :param out_dim: dimension of output
        :param bias: boolean. if True, bias is included.
        :param w_init: str. weight inits with xavier initialization.
        """
        super(Linear, self).__init__()
        self.linear_layer = nn.Linear(in_dim, out_dim, bias=bias)

        nn.init.xavier_uniform_(
            self.linear_layer.weight,
            gain=nn.init.calculate_gain(w_init))

    def forward(self, x):
        return self.linear_layer(x)


class PreNet(nn.Module):
    """
    Pre Net before passing through the network
    """

    def __init__(self, input_size, hidden_size, output_size, p=0.5):
        """
        :param input_size: dimension of input
        :param hidden_size: dimension of hidden unit
        :param output_size: dimension of output
        """
        super(PreNet, self).__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.layer = nn.Sequential(OrderedDict([
            ('fc1', Linear(self.input_size, self.hidden_size)),
            ('relu1', nn.ReLU()),
            ('dropout1', nn.Dropout(p)),
            ('fc2', Linear(self.hidden_size, self.output_size)),
            ('relu2', nn.ReLU()),
            ('dropout2', nn.Dropout(p)),
        ]))

    def forward(self, input_):
        out = self.layer(input_)
        return out


class Conv(nn.Module):
    """
    Convolution Module
    """

    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
                 padding=0, dilation=1, bias=True, w_init='linear'):
        """
        :param in_channels: dimension of input
        :param out_channels: dimension of output
        :param kernel_size: size of kernel
        :param stride: size of stride
        :param padding: size of padding
        :param dilation: dilation rate
        :param bias: boolean. if True, bias is included.
        :param w_init: str. weight inits with xavier initialization.
        """
        super(Conv, self).__init__()

        self.conv = nn.Conv1d(in_channels, out_channels,
                              kernel_size=kernel_size,
                              stride=stride,
                              padding=padding,
                              dilation=dilation,
                              bias=bias)

        nn.init.xavier_uniform_(
            self.conv.weight, gain=nn.init.calculate_gain(w_init))

    def forward(self, x):
        x = self.conv(x)
        return x
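
# Illustrative sketch (not part of the original module): how the building blocks
# above are typically wired together. The tensor shapes and layer sizes below are
# assumptions chosen for demonstration, not values prescribed by this repository.
def _example_prenet_and_conv():
    # PreNet expects (batch, time, input_size); Conv wraps nn.Conv1d and therefore
    # expects (batch, channels, time), so the input is transposed around the call.
    batch, time, dim = 2, 50, 80                  # hypothetical shapes
    x = torch.randn(batch, time, dim)

    prenet = PreNet(input_size=dim, hidden_size=256, output_size=256, p=0.5)
    h = prenet(x)                                 # -> (batch, time, 256)

    conv = Conv(in_channels=256, out_channels=256, kernel_size=3, padding=1,
                w_init='relu')
    y = conv(h.transpose(1, 2)).transpose(1, 2)   # -> (batch, time, 256)
    return y.shape
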
""" super(Conv, self).__init__() self.conv = nn.Conv1d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, bias=bias) nn.init.xavier_uniform_( self.conv.weight, gain=nn.init.calculate_gain(w_init)) def forward(self, x): x = self.conv(x) return x class FFTBlock(torch.nn.Module): """FFT Block""" def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1): super(FFTBlock, self).__init__() self.slf_attn = MultiHeadAttention( n_head, d_model, d_k, d_v, dropout=dropout) self.pos_ffn = PositionwiseFeedForward( d_model, d_inner, dropout=dropout) def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None): enc_output, enc_slf_attn = self.slf_attn( enc_input, enc_input, enc_input, mask=slf_attn_mask) enc_output *= non_pad_mask enc_output = self.pos_ffn(enc_output) enc_output *= non_pad_mask return enc_output, enc_slf_attn class ConvNorm(torch.nn.Module): def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=None, dilation=1, bias=True, w_init_gain='linear'): super(ConvNorm, self).__init__() if padding is None: assert(kernel_size % 2 == 1) padding = int(dilation * (kernel_size - 1) / 2) self.conv = torch.nn.Conv1d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, bias=bias) torch.nn.init.xavier_uniform_( self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain)) def forward(self, signal): conv_signal = self.conv(signal) return conv_signal class PostNet(nn.Module): """ PostNet: Five 1-d convolution with 512 channels and kernel size 5 """ def __init__(self, n_mel_channels=80, postnet_embedding_dim=512, postnet_kernel_size=5, postnet_n_convolutions=5): super(PostNet, self).__init__() self.convolutions = nn.ModuleList() self.convolutions.append( nn.Sequential( ConvNorm(n_mel_channels, postnet_embedding_dim, kernel_size=postnet_kernel_size, stride=1, padding=int((postnet_kernel_size - 1) / 2), dilation=1, w_init_gain='tanh'), nn.BatchNorm1d(postnet_embedding_dim)) ) for i in range(1, postnet_n_convolutions - 1): self.convolutions.append( nn.Sequential( ConvNorm(postnet_embedding_dim, postnet_embedding_dim, kernel_size=postnet_kernel_size, stride=1, padding=int((postnet_kernel_size - 1) / 2), dilation=1, w_init_gain='tanh'), nn.BatchNorm1d(postnet_embedding_dim)) ) self.convolutions.append( nn.Sequential( ConvNorm(postnet_embedding_dim, n_mel_channels, kernel_size=postnet_kernel_size, stride=1, padding=int((postnet_kernel_size - 1) / 2), dilation=1, w_init_gain='linear'), nn.BatchNorm1d(n_mel_channels)) ) def forward(self, x): x = x.contiguous().transpose(1, 2) for i in range(len(self.convolutions) - 1): x = F.dropout(torch.tanh( self.convolutions[i](x)), 0.5, self.training) x = F.dropout(self.convolutions[-1](x), 0.5, self.training) x = x.contiguous().transpose(1, 2) return x