dan
/
fastspeech_squeezewave

"""
We retain the copyright notice from the original author. However, we reserve our rights on the modifications based on the original code.
BSD 3-Clause License
Copyright (c) 2017, Prem Seetharaman
All rights reserved.
* Redistribution and use in source and binary forms, with or without  modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,  this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this  list of conditions and the following disclaimer in the  documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its  contributors may be used to endorse or promote products derived from this  software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE."""

import torchimport numpy as npimport torch.nn.functional as Ffrom torch.autograd import Variablefrom scipy.signal import get_windowfrom librosa.util import pad_center, tinyfrom audio_processing import window_sumsquare

class STFT(torch.nn.Module):
    """Short-time Fourier transform built from 1-D (transposed) convolutions.

    Adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft

    The analysis and synthesis filter banks are precomputed in ``__init__``
    and stored as buffers (``forward_basis`` / ``inverse_basis``), so they
    follow the module across ``.to(device)`` calls and are not trained.

    Args:
        filter_length: FFT size; also the convolution kernel length.
        hop_length: Stride between consecutive frames, in samples.
        win_length: Length of the analysis window. Must not exceed
            ``filter_length``; a shorter window is zero-padded (centered)
            up to ``filter_length``.
        window: Window specification forwarded to
            ``scipy.signal.get_window``, or ``None`` to leave the Fourier
            bases unwindowed.
        n_group: Stored on the instance; not otherwise used by this class.

    Raises:
        ValueError: If ``window`` is given and ``win_length`` is larger than
            ``filter_length``.
    """

    def __init__(self, filter_length=800, hop_length=200, win_length=800,
                 window='hann', n_group=256):
        super(STFT, self).__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length
        self.window = window
        self.forward_transform = None
        self.n_group = n_group

        scale = self.filter_length / self.hop_length
        fourier_basis = np.fft.fft(np.eye(self.filter_length))

        # Keep only the non-redundant half of the spectrum and stack the
        # real rows on top of the imaginary rows: shape (2 * cutoff, N).
        cutoff = self.filter_length // 2 + 1
        fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]),
                                   np.imag(fourier_basis[:cutoff, :])])

        forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
        inverse_basis = torch.FloatTensor(
            np.linalg.pinv(scale * fourier_basis).T[:, None, :])

        if window is not None:
            # The window is zero-padded up to filter_length below, so it can
            # be shorter than the FFT size but never longer. (The previous
            # check was inverted and rejected the valid shorter-window case.)
            if win_length > filter_length:
                raise ValueError(
                    "win_length ({}) must not exceed filter_length ({})"
                    .format(win_length, filter_length))

            # get window and zero center pad it to filter_length
            fft_window = get_window(window, win_length, fftbins=True)
            # `size` is passed by keyword: it is keyword-only in recent
            # librosa releases and accepted by keyword in older ones.
            fft_window = pad_center(fft_window, size=filter_length)
            fft_window = torch.from_numpy(fft_window).float()

            # window the bases
            forward_basis *= fft_window
            inverse_basis *= fft_window

        self.register_buffer('forward_basis', forward_basis.float())
        self.register_buffer('inverse_basis', inverse_basis.float())

    def transform(self, input_data):
        """Compute the magnitude and phase spectrograms of a batch of signals.

        Args:
            input_data: Tensor indexed as (batch, samples).

        Returns:
            Tuple ``(magnitude, phase)``, each indexed as
            (batch, filter_length // 2 + 1, frames). ``phase`` is computed
            from detached values and carries no gradient.

        Side effects:
            Stores the input length in ``self.num_samples``.
        """
        num_batches = input_data.size(0)
        num_samples = input_data.size(1)
        self.num_samples = num_samples

        # similar to librosa, reflect-pad the input
        input_data = input_data.view(num_batches, 1, num_samples)

        # Short inputs are padded so that the padded length is about
        # 63 * hop_length + filter_length, i.e. roughly 64 frames.
        # NOTE(review): 64 is hard-coded; presumably the frame count the
        # downstream model expects -- confirm against the caller.
        pad = ((64 - 1) * self.hop_length + self.filter_length
               - num_samples) // 2
        if pad < 0:
            # Input already longer than the 64-frame target: fall back to
            # the usual half-filter centering pad.
            pad = self.filter_length // 2
        input_data = F.pad(input_data, (int(pad), int(pad)), mode='reflect')

        forward_transform = F.conv1d(
            input_data,
            self.forward_basis,
            stride=self.hop_length,
            padding=0)

        # First `cutoff` channels are real parts, the rest imaginary parts.
        cutoff = self.filter_length // 2 + 1
        real_part = forward_transform[:, :cutoff, :]
        imag_part = forward_transform[:, cutoff:, :]

        magnitude = torch.sqrt(real_part ** 2 + imag_part ** 2)
        phase = torch.atan2(imag_part.detach(), real_part.detach())

        return magnitude, phase

    def inverse(self, magnitude, phase):
        """Reconstruct time-domain signals from magnitude and phase.

        Args:
            magnitude: Tensor indexed as (batch, bins, frames).
            phase: Tensor with the same shape as ``magnitude``.

        Returns:
            Tensor indexed as (batch, 1, samples), with
            ``filter_length // 2`` samples trimmed from each end.
        """
        recombine_magnitude_phase = torch.cat(
            [magnitude * torch.cos(phase), magnitude * torch.sin(phase)],
            dim=1)

        inverse_transform = F.conv_transpose1d(
            recombine_magnitude_phase,
            self.inverse_basis,
            stride=self.hop_length,
            padding=0)

        if self.window is not None:
            # window_sumsquare comes from the project's audio_processing
            # module; presumably the overlap-added squared-window envelope
            # (as in librosa) -- it is consumed here as a 1-D NumPy array.
            window_sum = window_sumsquare(
                self.window, magnitude.size(-1), hop_length=self.hop_length,
                win_length=self.win_length, n_fft=self.filter_length,
                dtype=np.float32)

            # remove modulation effects
            # Both helper tensors are moved to the signal's device; leaving
            # them on CPU breaks the division when the module runs on GPU.
            device = inverse_transform.device
            approx_nonzero_indices = torch.from_numpy(
                np.where(window_sum > tiny(window_sum))[0]).to(device)
            window_sum = torch.from_numpy(window_sum).to(device)
            inverse_transform[:, :, approx_nonzero_indices] /= \
                window_sum[approx_nonzero_indices]

            # scale by hop ratio
            inverse_transform *= float(self.filter_length) / self.hop_length

        # Drop the half-filter margin on both ends.
        half_filter = self.filter_length // 2
        inverse_transform = inverse_transform[:, :, half_filter:]
        inverse_transform = inverse_transform[:, :, :-half_filter]

        return inverse_transform

    def forward(self, input_data):
        """Run ``transform`` followed by ``inverse`` and return the result.

        The intermediate spectrograms are kept on ``self.magnitude`` and
        ``self.phase``.
        """
        self.magnitude, self.phase = self.transform(input_data)
        reconstruction = self.inverse(self.magnitude, self.phase)
        return reconstruction