{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "SqueezeWave computational complexity.ipynb",
      "provenance": []
    },
    "kernelspec": {
      "name": "python2",
      "display_name": "Python 2"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "metadata": {
        "id": "s8VYGy15fwqN",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import numpy as np"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "MDp5WalGf5Ji",
        "colab_type": "text"
      },
      "source": [
        "**WaveGlow**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "wrBBjKSYf89M",
        "colab_type": "code",
        "outputId": "4d77bc19-7a81-4f0b-bcad-65c42c4b2e9c",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 136
        }
      },
      "source": [
        "L = 2048 # audio length\n",
        "n_audio_channel_init = 8 # initial audio channel \n",
        "C_mel = 80 * 8 # After upsampling and unfolding \n",
        "kernal_size = 3\n",
        "C_wn = 256 # input channel size of in_layer\n",
        "C_wn_middle = C_wn * 2 # output channel size of in_layer and cond_layer\n",
        "n_flows = 12\n",
        "n_layers = 8\n",
        "n_early_output = 2\n",
        "n_early_output_interval = 4\n",
        "duration = 0.725\n",
        "\n",
        "n_audio_channels = []\n",
        "n_audio = n_audio_channel_init\n",
        "for i in range(n_flows):\n",
        "  if i % n_early_output_interval == 0 and i > 0:\n",
        "    n_audio -= n_early_output\n",
        "  n_audio_channels.append(n_audio) # audio channel after early output\n",
        "\n",
        "# in_layers\n",
        "WN_in_layers = L * kernal_size * C_wn * C_wn_middle * n_layers * n_flows\n",
        "print('MACs of in_layers', WN_in_layers / duration / 1e9)\n",
        "# cond layers\n",
        "WN_cond_layers = L * C_mel * C_wn_middle * n_layers * n_flows \n",
        "print('MACs of cond_layers', WN_cond_layers / duration / 1e9)\n",
        "# res skip layers\n",
        "WN_res_layers = (L * C_wn * C_wn_middle * (n_layers - 1) + L * C_wn * C_wn) * n_flows\n",
        "print('MACs of res_skip_layers', WN_res_layers / duration / 1e9)\n",
        "# invertible convs\n",
        "inv1x1 = np.sum([n**2 * L for n in n_audio_channels])\n",
        "print('MACs of invertible conv layers', inv1x1 / duration / 1e9)\n",
        "# start\n",
        "starts = np.sum([n / 2 * C_wn * L for n in n_audio_channels])\n",
        "print('MACs of start conv layers', starts / duration / 1e9)\n",
        "# end\n",
        "ends = np.sum([C_wn * n * L for n in n_audio_channels])\n",
        "print('MACs of end conv layers', ends / duration / 1e9)\n",
        "# total\n",
        "WG_total = WN_in_layers + WN_cond_layers + WN_res_layers + inv1x1 + starts + ends\n",
        "print('Total number of MACs is', WG_total / duration / 1e9)"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "('MACs of in_layers', 106.63367079724138)\n",
            "('MACs of cond_layers', 88.86139233103448)\n",
            "('MACs of res_skip_layers', 33.32302212413793)\n",
            "('MACs of invertible conv layers', 0.00131072)\n",
            "('MACs of start conv layers', 0.02603361103448276)\n",
            "('MACs of end conv layers', 0.05206722206896552)\n",
            "('Total number of MACs is', 228.89749680551725)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "QRQheCWjgC9D",
        "colab_type": "text"
      },
      "source": [
        "SqueezeWave L=64, C=128"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "zSlwPlvUgJue",
        "colab_type": "code",
        "outputId": "18e282ea-a071-4117-ba08-6e6abdc36c68",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 153
        }
      },
      "source": [
        "L = 64 # audio length\n",
        "n_audio_channel_init = 256 # initial audio channel \n",
        "L_mel = 64 # mel-spectrogram length\n",
        "C_mel =80 # mel-spectrogram channel \n",
        "kernal_size = 3\n",
        "C_wn = 128 # input channel size of in_layer\n",
        "C_wn_middle = C_wn * 2 # output channel size of in_layer and cond_layer\n",
        "n_flows = 12\n",
        "n_layers = 8\n",
        "n_early_output = 16\n",
        "n_early_output_interval = 2\n",
        "duration = 0.725\n",
        "\n",
        "n_audio_channels = []\n",
        "n_audio = n_audio_channel_init\n",
        "for i in range(n_flows):\n",
        "  if i % n_early_output_interval == 0 and i > 0:\n",
        "    n_audio -= n_early_output\n",
        "  n_audio_channels.append(n_audio) # audio channel after early output\n",
        "\n",
        "# in_layers\n",
        "WN_in_layers = L * kernal_size * C_wn * n_layers * n_flows # depthwise\n",
        "WN_in_layers += L * C_wn * C_wn_middle * n_layers * n_flows # pointwise\n",
        "print('MACs of in_layers', WN_in_layers / duration / 1e9)\n",
        "# cond_layers\n",
        "WN_cond_layers = L_mel * C_mel * C_wn_middle * n_layers * n_flows\n",
        "print('MACs of cond_layers', WN_cond_layers / duration / 1e9)\n",
        "# res_skip_layers\n",
        "WN_res_layers = L * C_wn * C_wn * n_layers * n_flows\n",
        "print('MACs of res_skip_layers', WN_res_layers / duration / 1e9)\n",
        "# invertible convs\n",
        "inv1x1 = np.sum([n**2 * L for n in n_audio_channels])\n",
        "print('MACs of invertible conv layers', inv1x1 / duration / 1e9)\n",
        "# start\n",
        "starts = np.sum([n / 2 * C_wn * L for n in n_audio_channels])\n",
        "print('MACs of start conv layers', starts / duration / 1e9)\n",
        "#end\n",
        "ends = np.sum([C_wn * n * L for n in n_audio_channels])\n",
        "print('MACs of end conv layers', ends / duration / 1e9)\n",
        "# total\n",
        "total = WN_in_layers + WN_cond_layers + WN_res_layers + inv1x1 + starts + ends\n",
        "print('Total number of MACs is', total / duration / 1e9)\n",
        "print('Reduction compared with WaveGlow', WG_total / total)"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "('MACs of in_layers', 0.2809460524137931)\n",
            "('MACs of cond_layers', 0.17355740689655172)\n",
            "('MACs of res_skip_layers', 0.1388459255172414)\n",
            "('MACs of invertible conv layers', 0.0502141351724138)\n",
            "('MACs of start conv layers', 0.014643906206896554)\n",
            "('MACs of end conv layers', 0.029287812413793107)\n",
            "('Total number of MACs is', 0.6874952386206896)\n",
            "('Reduction compared with WaveGlow', 332)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "M6K8zJ6cugYj",
        "colab_type": "text"
      },
      "source": [
        "**SqueezeWave L=64, C=256**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ju5Xa4oAhScO",
        "colab_type": "code",
        "outputId": "c91361be-ff73-4113-a584-6dda74c3690e",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 153
        }
      },
      "source": [
        "L = 64 # audio length\n",
        "n_audio_channel_init = 256 # initial audio channel \n",
        "L_mel = 64 # mel-spectrogram length\n",
        "C_mel =80 # mel-spectrogram channel \n",
        "kernal_size = 3\n",
        "C_wn = 256 # input channel size of in_layer\n",
        "C_wn_middle = C_wn * 2 # output channel size of in_layer and cond_layer\n",
        "n_flows = 12\n",
        "n_layers = 8\n",
        "n_early_output = 16\n",
        "n_early_output_interval = 2\n",
        "duration = 0.725\n",
        "\n",
        "n_audio_channels = []\n",
        "n_audio = n_audio_channel_init\n",
        "for i in range(n_flows):\n",
        "  if i % n_early_output_interval == 0 and i > 0:\n",
        "    n_audio -= n_early_output\n",
        "  n_audio_channels.append(n_audio) # audio channel after early output\n",
        "\n",
        "# in_layers\n",
        "WN_in_layers = L * kernal_size * C_wn * n_layers * n_flows # depthwise\n",
        "WN_in_layers += L * C_wn * C_wn_middle * n_layers * n_flows # pointwise\n",
        "print('MACs of in_layers', WN_in_layers / duration / 1e9)\n",
        "# cond_layers\n",
        "WN_cond_layers = L_mel * C_mel * C_wn_middle * n_layers * n_flows\n",
        "print('MACs of cond_layers', WN_cond_layers / duration / 1e9)\n",
        "# res_skip_layers\n",
        "WN_res_layers = L * C_wn * C_wn * n_layers * n_flows\n",
        "print('MACs of res_skip_layers', WN_res_layers / duration / 1e9)\n",
        "# invertible convs\n",
        "inv1x1 = np.sum([n**2 * L for n in n_audio_channels])\n",
        "print('MACs of invertible conv layers', inv1x1 / duration / 1e9)\n",
        "# start\n",
        "starts = np.sum([n / 2 * C_wn * L for n in n_audio_channels])\n",
        "print('MACs of start conv layers', starts / duration / 1e9)\n",
        "#end\n",
        "ends = np.sum([C_wn * n * L for n in n_audio_channels])\n",
        "print('MACs of end conv layers', ends / duration / 1e9)\n",
        "# total\n",
        "total = WN_in_layers + WN_cond_layers + WN_res_layers + inv1x1 + starts + ends\n",
        "print('Total number of MACs is', total / duration / 1e9)\n",
        "print('Reduction compared with WaveGlow', WG_total / total)"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "('MACs of in_layers', 1.1172758068965518)\n",
            "('MACs of cond_layers', 0.34711481379310344)\n",
            "('MACs of res_skip_layers', 0.5553837020689656)\n",
            "('MACs of invertible conv layers', 0.0502141351724138)\n",
            "('MACs of start conv layers', 0.029287812413793107)\n",
            "('MACs of end conv layers', 0.058575624827586215)\n",
            "('Total number of MACs is', 2.157851895172414)\n",
            "('Reduction compared with WaveGlow', 106)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "aIgnX6Yi4BFu",
        "colab_type": "text"
      },
      "source": [
        "**SqueezeWave L=128, C=128**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "W-3Q5jW84F_t",
        "colab_type": "code",
        "outputId": "436038c3-f3f8-4989-eeec-eb59c154b183",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 153
        }
      },
      "source": [
        "L = 128 # audio length\n",
        "n_audio_channel_init = 128 # initial audio channel \n",
        "L_mel = 64 # mel-spectrogram length\n",
        "C_mel =80 # mel-spectrogram channel \n",
        "kernal_size = 3\n",
        "C_wn = 128 # input channel size of in_layer\n",
        "C_wn_middle = C_wn * 2 # output channel size of in_layer and cond_layer\n",
        "n_flows = 12\n",
        "n_layers = 8\n",
        "n_early_output = 16\n",
        "n_early_output_interval = 2\n",
        "duration = 0.725\n",
        "\n",
        "n_audio_channels = []\n",
        "n_audio = n_audio_channel_init\n",
        "for i in range(n_flows):\n",
        "  if i % n_early_output_interval == 0 and i > 0:\n",
        "    n_audio -= n_early_output\n",
        "  n_audio_channels.append(n_audio) # audio channel after early output\n",
        "\n",
        "# in_layers\n",
        "WN_in_layers = L * kernal_size * C_wn * n_layers * n_flows # depthwise\n",
        "WN_in_layers += L * C_wn * C_wn_middle * n_layers * n_flows # pointwise\n",
        "print('MACs of in_layers', WN_in_layers / duration / 1e9)\n",
        "# cond_layers\n",
        "WN_cond_layers = L_mel * C_mel * C_wn_middle * n_layers * n_flows\n",
        "print('MACs of cond_layers', WN_cond_layers / duration / 1e9)\n",
        "# res_skip_layers\n",
        "WN_res_layers = L * C_wn * C_wn * n_layers * n_flows\n",
        "print('MACs of res_skip_layers', WN_res_layers / duration / 1e9)\n",
        "# invertible convs\n",
        "inv1x1 = np.sum([n**2 * L for n in n_audio_channels])\n",
        "print('MACs of invertible conv layers', inv1x1 / duration / 1e9)\n",
        "# start\n",
        "starts = np.sum([n / 2 * C_wn * L for n in n_audio_channels])\n",
        "print('MACs of start conv layers', starts / duration / 1e9)\n",
        "#end\n",
        "ends = np.sum([C_wn * n * L for n in n_audio_channels])\n",
        "print('MACs of end conv layers', ends / duration / 1e9)\n",
        "# total\n",
        "total = WN_in_layers + WN_cond_layers + WN_res_layers + inv1x1 + starts + ends\n",
        "print('Total number of MACs is', total / duration / 1e9)\n",
        "print('Reduction compared with WaveGlow', WG_total / total)"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "('MACs of in_layers', 0.5618921048275862)\n",
            "('MACs of cond_layers', 0.17355740689655172)\n",
            "('MACs of res_skip_layers', 0.2776918510344828)\n",
            "('MACs of invertible conv layers', 0.017988502068965517)\n",
            "('MACs of start conv layers', 0.011932071724137933)\n",
            "('MACs of end conv layers', 0.023864143448275865)\n",
            "('Total number of MACs is', 1.06692608)\n",
            "('Reduction compared with WaveGlow', 214)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "1kWvIBWU4Vwm",
        "colab_type": "text"
      },
      "source": [
        "**SqueezeWave L=128, C=256**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "6YM2bkC14WWc",
        "colab_type": "code",
        "outputId": "b1fd3d03-0135-400e-cfbc-28746c8d0cf0",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 153
        }
      },
      "source": [
        "L = 128 # audio length\n",
        "n_audio_channel_init = 128 # initial audio channel \n",
        "L_mel = 64 # mel-spectrogram length\n",
        "C_mel =80 # mel-spectrogram channel \n",
        "kernal_size = 3\n",
        "C_wn = 256 # input channel size of in_layer\n",
        "C_wn_middle = C_wn * 2 # output channel size of in_layer and cond_layer\n",
        "n_flows = 12\n",
        "n_layers = 8\n",
        "n_early_output = 16\n",
        "n_early_output_interval = 2\n",
        "duration = 0.725\n",
        "\n",
        "n_audio_channels = []\n",
        "n_audio = n_audio_channel_init\n",
        "for i in range(n_flows):\n",
        "  if i % n_early_output_interval == 0 and i > 0:\n",
        "    n_audio -= n_early_output\n",
        "  n_audio_channels.append(n_audio) # audio channel after early output\n",
        "\n",
        "# in_layers\n",
        "WN_in_layers = L * kernal_size * C_wn * n_layers * n_flows # depthwise\n",
        "WN_in_layers += L * C_wn * C_wn_middle * n_layers * n_flows # pointwise\n",
        "print('MACs of in_layers', WN_in_layers / duration / 1e9)\n",
        "# cond_layers\n",
        "WN_cond_layers = L_mel * C_mel * C_wn_middle * n_layers * n_flows\n",
        "print('MACs of cond_layers', WN_cond_layers / duration / 1e9)\n",
        "# res_skip_layers\n",
        "WN_res_layers = L * C_wn * C_wn * n_layers * n_flows\n",
        "print('MACs of res_skip_layers', WN_res_layers / duration / 1e9)\n",
        "# invertible convs\n",
        "inv1x1 = np.sum([n**2 * L for n in n_audio_channels])\n",
        "print('MACs of invertible conv layers', inv1x1 / duration / 1e9)\n",
        "# start\n",
        "starts = np.sum([n / 2 * C_wn * L for n in n_audio_channels])\n",
        "print('MACs of start conv layers', starts / duration / 1e9)\n",
        "#end\n",
        "ends = np.sum([C_wn * n * L for n in n_audio_channels])\n",
        "print('MACs of end conv layers', ends / duration / 1e9)\n",
        "# total\n",
        "total = WN_in_layers + WN_cond_layers + WN_res_layers + inv1x1 + starts + ends\n",
        "print('Total number of MACs is', total / duration / 1e9)\n",
        "print('Reduction compared with WaveGlow', WG_total / total)"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "('MACs of in_layers', 2.2345516137931036)\n",
            "('MACs of cond_layers', 0.34711481379310344)\n",
            "('MACs of res_skip_layers', 1.1107674041379312)\n",
            "('MACs of invertible conv layers', 0.017988502068965517)\n",
            "('MACs of start conv layers', 0.023864143448275865)\n",
            "('MACs of end conv layers', 0.04772828689655173)\n",
            "('Total number of MACs is', 3.7820147641379314)\n",
            "('Reduction compared with WaveGlow', 60)\n"
          ],
          "name": "stdout"
        }
      ]
    }
  ]
}