import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
from collections import OrderedDict

from transformer.SubLayers import MultiHeadAttention, PositionwiseFeedForward
from text.symbols import symbols


class Linear(nn.Module):
    """
    Linear module with Xavier-uniform weight initialization.
    """

    def __init__(self, in_dim, out_dim, bias=True, w_init='linear'):
        """
        :param in_dim: dimension of input
        :param out_dim: dimension of output
        :param bias: boolean. If True, a bias term is included.
        :param w_init: str. Nonlinearity name used to compute the gain
            for Xavier-uniform initialization.
        """
        super(Linear, self).__init__()
        self.linear_layer = nn.Linear(in_dim, out_dim, bias=bias)

        nn.init.xavier_uniform_(
            self.linear_layer.weight,
            gain=nn.init.calculate_gain(w_init))

    def forward(self, x):
        return self.linear_layer(x)


class PreNet(nn.Module):
    """
    Pre-net applied before the main network: two fully connected layers,
    each followed by ReLU and dropout.
    """

    def __init__(self, input_size, hidden_size, output_size, p=0.5):
        """
        :param input_size: dimension of input
        :param hidden_size: dimension of hidden unit
        :param output_size: dimension of output
        :param p: dropout probability
        """
        super(PreNet, self).__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.layer = nn.Sequential(OrderedDict([
            ('fc1', Linear(self.input_size, self.hidden_size)),
            ('relu1', nn.ReLU()),
            ('dropout1', nn.Dropout(p)),
            ('fc2', Linear(self.hidden_size, self.output_size)),
            ('relu2', nn.ReLU()),
            ('dropout2', nn.Dropout(p)),
        ]))

    def forward(self, input_):
        out = self.layer(input_)
        return out


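# Usage sketch (illustrative only, not part of the original module): exercises
# PreNet with dummy tensors. Shapes and hyperparameters below are assumed for
# demonstration; nn.Linear applies over the last dimension, so the pre-net
# accepts (batch, time, features) inputs directly.
def _prenet_usage_sketch():
    prenet = PreNet(input_size=80, hidden_size=256, output_size=256, p=0.5)
    frames = torch.randn(2, 7, 80)      # (batch, time, features)
    out = prenet(frames)                # -> (2, 7, 256)
    return out.shape

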
class Conv(nn.Module):
    """
    1-D convolution module with Xavier-uniform weight initialization.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=1,
                 stride=1,
                 padding=0,
                 dilation=1,
                 bias=True,
                 w_init='linear'):
        """
        :param in_channels: dimension of input
        :param out_channels: dimension of output
        :param kernel_size: size of kernel
        :param stride: size of stride
        :param padding: size of padding
        :param dilation: dilation rate
        :param bias: boolean. If True, a bias term is included.
        :param w_init: str. Nonlinearity name used to compute the gain
            for Xavier-uniform initialization.
        """
        super(Conv, self).__init__()

        self.conv = nn.Conv1d(in_channels,
                              out_channels,
                              kernel_size=kernel_size,
                              stride=stride,
                              padding=padding,
                              dilation=dilation,
                              bias=bias)

        nn.init.xavier_uniform_(
            self.conv.weight, gain=nn.init.calculate_gain(w_init))

    def forward(self, x):
        x = self.conv(x)
        return x


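# Usage sketch (illustrative only): nn.Conv1d expects (batch, channels, time)
# input, so callers working with (batch, time, channels) tensors must
# transpose first. Values below are assumed for demonstration.
def _conv_usage_sketch():
    conv = Conv(in_channels=256, out_channels=256,
                kernel_size=3, padding=1, w_init='relu')
    x = torch.randn(2, 256, 7)          # (batch, channels, time)
    y = conv(x)                         # padding=1 with kernel 3 keeps time=7
    return y.shape                      # -> torch.Size([2, 256, 7])

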
class FFTBlock(torch.nn.Module):
    """FFT (Feed-Forward Transformer) block: multi-head self-attention
    followed by a position-wise feed-forward network, with padded positions
    zeroed out after each sub-layer."""

    def __init__(self,
                 d_model,
                 d_inner,
                 n_head,
                 d_k,
                 d_v,
                 dropout=0.1):
        super(FFTBlock, self).__init__()
        self.slf_attn = MultiHeadAttention(
            n_head, d_model, d_k, d_v, dropout=dropout)
        self.pos_ffn = PositionwiseFeedForward(
            d_model, d_inner, dropout=dropout)

    def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
        enc_output, enc_slf_attn = self.slf_attn(
            enc_input, enc_input, enc_input, mask=slf_attn_mask)
        # Zero out padded positions; skip when no mask is given so the
        # default non_pad_mask=None does not raise a TypeError.
        if non_pad_mask is not None:
            enc_output *= non_pad_mask

        enc_output = self.pos_ffn(enc_output)
        if non_pad_mask is not None:
            enc_output *= non_pad_mask

        return enc_output, enc_slf_attn


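# Usage sketch (illustrative only): builds the padding masks by hand for two
# variable-length sequences. The mask conventions are assumptions about the
# external MultiHeadAttention implementation: slf_attn_mask marks key
# positions to ignore (True = padding) and non_pad_mask is 1.0 for real
# frames, 0.0 for padding. Hyperparameters are arbitrary.
def _fft_block_usage_sketch():
    d_model, seq_len = 256, 6
    block = FFTBlock(d_model=d_model, d_inner=1024, n_head=2, d_k=64, d_v=64)
    x = torch.randn(2, seq_len, d_model)
    lengths = torch.tensor([6, 4])
    positions = torch.arange(seq_len).unsqueeze(0)               # (1, T)
    non_pad_mask = (positions < lengths.unsqueeze(1)).float().unsqueeze(-1)
    slf_attn_mask = (positions >= lengths.unsqueeze(1)).unsqueeze(1)
    slf_attn_mask = slf_attn_mask.expand(-1, seq_len, -1)        # (B, T, T)
    out, attn = block(x, non_pad_mask=non_pad_mask,
                      slf_attn_mask=slf_attn_mask)
    return out.shape                                             # (2, 6, 256)

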
class ConvNorm(torch.nn.Module):
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=1,
                 stride=1,
                 padding=None,
                 dilation=1,
                 bias=True,
                 w_init_gain='linear'):
        super(ConvNorm, self).__init__()

        if padding is None:
            # Derive "same" padding; requires an odd kernel size.
            assert kernel_size % 2 == 1
            padding = int(dilation * (kernel_size - 1) / 2)

        self.conv = torch.nn.Conv1d(in_channels,
                                    out_channels,
                                    kernel_size=kernel_size,
                                    stride=stride,
                                    padding=padding,
                                    dilation=dilation,
                                    bias=bias)

        torch.nn.init.xavier_uniform_(
            self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, signal):
        conv_signal = self.conv(signal)
        return conv_signal


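# Usage sketch (illustrative only): with padding=None and an odd kernel size,
# ConvNorm derives dilation * (kernel_size - 1) / 2 as padding, so the time
# dimension is preserved. Shapes below are assumed for demonstration.
def _conv_norm_usage_sketch():
    conv = ConvNorm(80, 512, kernel_size=5)   # padding resolves to 2
    x = torch.randn(2, 80, 100)               # (batch, mel channels, frames)
    y = conv(x)
    return y.shape                            # -> torch.Size([2, 512, 100])

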
class PostNet(nn.Module):
    """
    PostNet: five 1-D convolutions with 512 channels and kernel size 5.
    """

    def __init__(self,
                 n_mel_channels=80,
                 postnet_embedding_dim=512,
                 postnet_kernel_size=5,
                 postnet_n_convolutions=5):

        super(PostNet, self).__init__()
        self.convolutions = nn.ModuleList()

        # First convolution: mel channels -> embedding dimension.
        self.convolutions.append(
            nn.Sequential(
                ConvNorm(n_mel_channels,
                         postnet_embedding_dim,
                         kernel_size=postnet_kernel_size,
                         stride=1,
                         padding=int((postnet_kernel_size - 1) / 2),
                         dilation=1,
                         w_init_gain='tanh'),
                nn.BatchNorm1d(postnet_embedding_dim))
        )

        # Intermediate convolutions keep the embedding dimension.
        for i in range(1, postnet_n_convolutions - 1):
            self.convolutions.append(
                nn.Sequential(
                    ConvNorm(postnet_embedding_dim,
                             postnet_embedding_dim,
                             kernel_size=postnet_kernel_size,
                             stride=1,
                             padding=int((postnet_kernel_size - 1) / 2),
                             dilation=1,
                             w_init_gain='tanh'),
                    nn.BatchNorm1d(postnet_embedding_dim))
            )

        # Final convolution projects back to mel channels (no tanh afterwards).
        self.convolutions.append(
            nn.Sequential(
                ConvNorm(postnet_embedding_dim,
                         n_mel_channels,
                         kernel_size=postnet_kernel_size,
                         stride=1,
                         padding=int((postnet_kernel_size - 1) / 2),
                         dilation=1,
                         w_init_gain='linear'),
                nn.BatchNorm1d(n_mel_channels))
        )

    def forward(self, x):
        # (batch, time, mel channels) -> (batch, mel channels, time) for Conv1d.
        x = x.contiguous().transpose(1, 2)

        for i in range(len(self.convolutions) - 1):
            x = F.dropout(torch.tanh(
                self.convolutions[i](x)), 0.5, self.training)
        x = F.dropout(self.convolutions[-1](x), 0.5, self.training)

        # Back to (batch, time, mel channels).
        x = x.contiguous().transpose(1, 2)
        return x
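

# Usage sketch (illustrative only): the postnet output is typically added back
# to the coarse mel prediction as a residual refinement; batch size and frame
# count below are assumed for demonstration.
def _postnet_usage_sketch():
    postnet = PostNet(n_mel_channels=80)
    mel = torch.randn(2, 100, 80)             # (batch, frames, mel channels)
    mel_refined = mel + postnet(mel)
    return mel_refined.shape                  # -> torch.Size([2, 100, 80])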