|
|
- import torch
- from librosa.filters import mel as librosa_mel_fn
- from audio_processing import dynamic_range_compression
- from audio_processing import dynamic_range_decompression
- from stft import STFT
-
-
- class LinearNorm(torch.nn.Module):
- def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
- super(LinearNorm, self).__init__()
- self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
-
- torch.nn.init.xavier_uniform(
- self.linear_layer.weight,
- gain=torch.nn.init.calculate_gain(w_init_gain))
-
- def forward(self, x):
- return self.linear_layer(x)
-
-
- class ConvNorm(torch.nn.Module):
- def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
- padding=None, dilation=1, bias=True, w_init_gain='linear'):
- super(ConvNorm, self).__init__()
- if padding is None:
- assert(kernel_size % 2 == 1)
- padding = int(dilation * (kernel_size - 1) / 2)
-
- self.conv = torch.nn.Conv1d(in_channels, out_channels,
- kernel_size=kernel_size, stride=stride,
- padding=padding, dilation=dilation,
- bias=bias)
-
- torch.nn.init.xavier_uniform(
- self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
-
- def forward(self, signal):
- conv_signal = self.conv(signal)
- return conv_signal
-
-
- class TacotronSTFT(torch.nn.Module):
- def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
- n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
- mel_fmax=None, n_group=256):
- super(TacotronSTFT, self).__init__()
- self.n_mel_channels = n_mel_channels
- self.sampling_rate = sampling_rate
- self.stft_fn = STFT(filter_length, hop_length, win_length, n_group=n_group)
- mel_basis = librosa_mel_fn(
- sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
- mel_basis = torch.from_numpy(mel_basis).float()
- self.register_buffer('mel_basis', mel_basis)
-
- def spectral_normalize(self, magnitudes):
- output = dynamic_range_compression(magnitudes)
- return output
-
- def spectral_de_normalize(self, magnitudes):
- output = dynamic_range_decompression(magnitudes)
- return output
-
- def mel_spectrogram(self, y):
- """Computes mel-spectrograms from a batch of waves
- PARAMS
- ------
- y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
-
- RETURNS
- -------
- mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
- """
- assert(torch.min(y.data) >= -1)
- assert(torch.max(y.data) <= 1)
-
- magnitudes, phases = self.stft_fn.transform(y)
- magnitudes = magnitudes.data
- mel_output = torch.matmul(self.mel_basis, magnitudes)
- mel_output = self.spectral_normalize(mel_output)
- return mel_output
|