Fork of https://github.com/alokprasad/fastspeech_squeezewave that also fixes denoising in SqueezeWave
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

444 lines
17 KiB

  1. {
  2. "nbformat": 4,
  3. "nbformat_minor": 0,
  4. "metadata": {
  5. "colab": {
  6. "name": "SqueezeWave computational complexity.ipynb",
  7. "provenance": []
  8. },
  9. "kernelspec": {
  10. "name": "python2",
  11. "display_name": "Python 2"
  12. }
  13. },
  14. "cells": [
  15. {
  16. "cell_type": "code",
  17. "metadata": {
  18. "id": "s8VYGy15fwqN",
  19. "colab_type": "code",
  20. "colab": {}
  21. },
  22. "source": [
  23. "import numpy as np"
  24. ],
  25. "execution_count": 0,
  26. "outputs": []
  27. },
  28. {
  29. "cell_type": "markdown",
  30. "metadata": {
  31. "id": "MDp5WalGf5Ji",
  32. "colab_type": "text"
  33. },
  34. "source": [
  35. "**WaveGlow**"
  36. ]
  37. },
  38. {
  39. "cell_type": "code",
  40. "metadata": {
  41. "id": "wrBBjKSYf89M",
  42. "colab_type": "code",
  43. "outputId": "4d77bc19-7a81-4f0b-bcad-65c42c4b2e9c",
  44. "colab": {
  45. "base_uri": "https://localhost:8080/",
  46. "height": 136
  47. }
  48. },
  49. "source": [
  50. "L = 2048 # audio length\n",
  51. "n_audio_channel_init = 8 # initial audio channel \n",
  52. "C_mel = 80 * 8 # After upsampling and unfolding \n",
  53. "kernal_size = 3\n",
  54. "C_wn = 256 # input channel size of in_layer\n",
  55. "C_wn_middle = C_wn * 2 # output channel size of in_layer and cond_layer\n",
  56. "n_flows = 12\n",
  57. "n_layers = 8\n",
  58. "n_early_output = 2\n",
  59. "n_early_output_interval = 4\n",
  60. "duration = 0.725\n",
  61. "\n",
  62. "n_audio_channels = []\n",
  63. "n_audio = n_audio_channel_init\n",
  64. "for i in range(n_flows):\n",
  65. " if i % n_early_output_interval == 0 and i > 0:\n",
  66. " n_audio -= n_early_output\n",
  67. " n_audio_channels.append(n_audio) # audio channel after early output\n",
  68. "\n",
  69. "# in_layers\n",
  70. "WN_in_layers = L * kernal_size * C_wn * C_wn_middle * n_layers * n_flows\n",
  71. "print('MACs of in_layers', WN_in_layers / duration / 1e9)\n",
  72. "# cond layers\n",
  73. "WN_cond_layers = L * C_mel * C_wn_middle * n_layers * n_flows \n",
  74. "print('MACs of cond_layers', WN_cond_layers / duration / 1e9)\n",
  75. "# res skip layers\n",
  76. "WN_res_layers = (L * C_wn * C_wn_middle * (n_layers - 1) + L * C_wn * C_wn) * n_flows\n",
  77. "print('MACs of res_skip_layers', WN_res_layers / duration / 1e9)\n",
  78. "# invertible convs\n",
  79. "inv1x1 = np.sum([n**2 * L for n in n_audio_channels])\n",
  80. "print('MACs of invertible conv layers', inv1x1 / duration / 1e9)\n",
  81. "# start\n",
  82. "starts = np.sum([n / 2 * C_wn * L for n in n_audio_channels])\n",
  83. "print('MACs of start conv layers', starts / duration / 1e9)\n",
  84. "# end\n",
  85. "ends = np.sum([C_wn * n * L for n in n_audio_channels])\n",
  86. "print('MACs of end conv layers', ends / duration / 1e9)\n",
  87. "# total\n",
  88. "WG_total = WN_in_layers + WN_cond_layers + WN_res_layers + inv1x1 + starts + ends\n",
  89. "print('Total number of MACs is', WG_total / duration / 1e9)"
  90. ],
  91. "execution_count": 0,
  92. "outputs": [
  93. {
  94. "output_type": "stream",
  95. "text": [
  96. "('MACs of in_layers', 106.63367079724138)\n",
  97. "('MACs of cond_layers', 88.86139233103448)\n",
  98. "('MACs of res_skip_layers', 33.32302212413793)\n",
  99. "('MACs of invertible conv layers', 0.00131072)\n",
  100. "('MACs of start conv layers', 0.02603361103448276)\n",
  101. "('MACs of end conv layers', 0.05206722206896552)\n",
  102. "('Total number of MACs is', 228.89749680551725)\n"
  103. ],
  104. "name": "stdout"
  105. }
  106. ]
  107. },
  108. {
  109. "cell_type": "markdown",
  110. "metadata": {
  111. "id": "QRQheCWjgC9D",
  112. "colab_type": "text"
  113. },
  114. "source": [
  115. "**SqueezeWave L=64, C=128**"
  116. ]
  117. },
  118. {
  119. "cell_type": "code",
  120. "metadata": {
  121. "id": "zSlwPlvUgJue",
  122. "colab_type": "code",
  123. "outputId": "18e282ea-a071-4117-ba08-6e6abdc36c68",
  124. "colab": {
  125. "base_uri": "https://localhost:8080/",
  126. "height": 153
  127. }
  128. },
  129. "source": [
  130. "L = 64 # audio length\n",
  131. "n_audio_channel_init = 256 # initial audio channel \n",
  132. "L_mel = 64 # mel-spectrogram length\n",
  133. "C_mel =80 # mel-spectrogram channel \n",
  134. "kernal_size = 3\n",
  135. "C_wn = 128 # input channel size of in_layer\n",
  136. "C_wn_middle = C_wn * 2 # output channel size of in_layer and cond_layer\n",
  137. "n_flows = 12\n",
  138. "n_layers = 8\n",
  139. "n_early_output = 16\n",
  140. "n_early_output_interval = 2\n",
  141. "duration = 0.725\n",
  142. "\n",
  143. "n_audio_channels = []\n",
  144. "n_audio = n_audio_channel_init\n",
  145. "for i in range(n_flows):\n",
  146. " if i % n_early_output_interval == 0 and i > 0:\n",
  147. " n_audio -= n_early_output\n",
  148. " n_audio_channels.append(n_audio) # audio channel after early output\n",
  149. "\n",
  150. "# in_layers\n",
  151. "WN_in_layers = L * kernal_size * C_wn * n_layers * n_flows # depthwise\n",
  152. "WN_in_layers += L * C_wn * C_wn_middle * n_layers * n_flows # pointwise\n",
  153. "print('MACs of in_layers', WN_in_layers / duration / 1e9)\n",
  154. "# cond_layers\n",
  155. "WN_cond_layers = L_mel * C_mel * C_wn_middle * n_layers * n_flows\n",
  156. "print('MACs of cond_layers', WN_cond_layers / duration / 1e9)\n",
  157. "# res_skip_layers\n",
  158. "WN_res_layers = L * C_wn * C_wn * n_layers * n_flows\n",
  159. "print('MACs of res_skip_layers', WN_res_layers / duration / 1e9)\n",
  160. "# invertible convs\n",
  161. "inv1x1 = np.sum([n**2 * L for n in n_audio_channels])\n",
  162. "print('MACs of invertible conv layers', inv1x1 / duration / 1e9)\n",
  163. "# start\n",
  164. "starts = np.sum([n / 2 * C_wn * L for n in n_audio_channels])\n",
  165. "print('MACs of start conv layers', starts / duration / 1e9)\n",
  166. "#end\n",
  167. "ends = np.sum([C_wn * n * L for n in n_audio_channels])\n",
  168. "print('MACs of end conv layers', ends / duration / 1e9)\n",
  169. "# total\n",
  170. "total = WN_in_layers + WN_cond_layers + WN_res_layers + inv1x1 + starts + ends\n",
  171. "print('Total number of MACs is', total / duration / 1e9)\n",
  172. "print('Reduction compared with WaveGlow', WG_total / total)"
  173. ],
  174. "execution_count": 0,
  175. "outputs": [
  176. {
  177. "output_type": "stream",
  178. "text": [
  179. "('MACs of in_layers', 0.2809460524137931)\n",
  180. "('MACs of cond_layers', 0.17355740689655172)\n",
  181. "('MACs of res_skip_layers', 0.1388459255172414)\n",
  182. "('MACs of invertible conv layers', 0.0502141351724138)\n",
  183. "('MACs of start conv layers', 0.014643906206896554)\n",
  184. "('MACs of end conv layers', 0.029287812413793107)\n",
  185. "('Total number of MACs is', 0.6874952386206896)\n",
  186. "('Reduction compared with WaveGlow', 332)\n"
  187. ],
  188. "name": "stdout"
  189. }
  190. ]
  191. },
  192. {
  193. "cell_type": "markdown",
  194. "metadata": {
  195. "id": "M6K8zJ6cugYj",
  196. "colab_type": "text"
  197. },
  198. "source": [
  199. "**SqueezeWave L=64, C=256**"
  200. ]
  201. },
  202. {
  203. "cell_type": "code",
  204. "metadata": {
  205. "id": "ju5Xa4oAhScO",
  206. "colab_type": "code",
  207. "outputId": "c91361be-ff73-4113-a584-6dda74c3690e",
  208. "colab": {
  209. "base_uri": "https://localhost:8080/",
  210. "height": 153
  211. }
  212. },
  213. "source": [
  214. "L = 64 # audio length\n",
  215. "n_audio_channel_init = 256 # initial audio channel \n",
  216. "L_mel = 64 # mel-spectrogram length\n",
  217. "C_mel =80 # mel-spectrogram channel \n",
  218. "kernal_size = 3\n",
  219. "C_wn = 256 # input channel size of in_layer\n",
  220. "C_wn_middle = C_wn * 2 # output channel size of in_layer and cond_layer\n",
  221. "n_flows = 12\n",
  222. "n_layers = 8\n",
  223. "n_early_output = 16\n",
  224. "n_early_output_interval = 2\n",
  225. "duration = 0.725\n",
  226. "\n",
  227. "n_audio_channels = []\n",
  228. "n_audio = n_audio_channel_init\n",
  229. "for i in range(n_flows):\n",
  230. " if i % n_early_output_interval == 0 and i > 0:\n",
  231. " n_audio -= n_early_output\n",
  232. " n_audio_channels.append(n_audio) # audio channel after early output\n",
  233. "\n",
  234. "# in_layers\n",
  235. "WN_in_layers = L * kernal_size * C_wn * n_layers * n_flows # depthwise\n",
  236. "WN_in_layers += L * C_wn * C_wn_middle * n_layers * n_flows # pointwise\n",
  237. "print('MACs of in_layers', WN_in_layers / duration / 1e9)\n",
  238. "# cond_layers\n",
  239. "WN_cond_layers = L_mel * C_mel * C_wn_middle * n_layers * n_flows\n",
  240. "print('MACs of cond_layers', WN_cond_layers / duration / 1e9)\n",
  241. "# res_skip_layers\n",
  242. "WN_res_layers = L * C_wn * C_wn * n_layers * n_flows\n",
  243. "print('MACs of res_skip_layers', WN_res_layers / duration / 1e9)\n",
  244. "# invertible convs\n",
  245. "inv1x1 = np.sum([n**2 * L for n in n_audio_channels])\n",
  246. "print('MACs of invertible conv layers', inv1x1 / duration / 1e9)\n",
  247. "# start\n",
  248. "starts = np.sum([n / 2 * C_wn * L for n in n_audio_channels])\n",
  249. "print('MACs of start conv layers', starts / duration / 1e9)\n",
  250. "#end\n",
  251. "ends = np.sum([C_wn * n * L for n in n_audio_channels])\n",
  252. "print('MACs of end conv layers', ends / duration / 1e9)\n",
  253. "# total\n",
  254. "total = WN_in_layers + WN_cond_layers + WN_res_layers + inv1x1 + starts + ends\n",
  255. "print('Total number of MACs is', total / duration / 1e9)\n",
  256. "print('Reduction compared with WaveGlow', WG_total / total)"
  257. ],
  258. "execution_count": 0,
  259. "outputs": [
  260. {
  261. "output_type": "stream",
  262. "text": [
  263. "('MACs of in_layers', 1.1172758068965518)\n",
  264. "('MACs of cond_layers', 0.34711481379310344)\n",
  265. "('MACs of res_skip_layers', 0.5553837020689656)\n",
  266. "('MACs of invertible conv layers', 0.0502141351724138)\n",
  267. "('MACs of start conv layers', 0.029287812413793107)\n",
  268. "('MACs of end conv layers', 0.058575624827586215)\n",
  269. "('Total number of MACs is', 2.157851895172414)\n",
  270. "('Reduction compared with WaveGlow', 106)\n"
  271. ],
  272. "name": "stdout"
  273. }
  274. ]
  275. },
  276. {
  277. "cell_type": "markdown",
  278. "metadata": {
  279. "id": "aIgnX6Yi4BFu",
  280. "colab_type": "text"
  281. },
  282. "source": [
  283. "**SqueezeWave L=128, C=128**"
  284. ]
  285. },
  286. {
  287. "cell_type": "code",
  288. "metadata": {
  289. "id": "W-3Q5jW84F_t",
  290. "colab_type": "code",
  291. "outputId": "436038c3-f3f8-4989-eeec-eb59c154b183",
  292. "colab": {
  293. "base_uri": "https://localhost:8080/",
  294. "height": 153
  295. }
  296. },
  297. "source": [
  298. "L = 128 # audio length\n",
  299. "n_audio_channel_init = 128 # initial audio channel \n",
  300. "L_mel = 64 # mel-spectrogram length\n",
  301. "C_mel =80 # mel-spectrogram channel \n",
  302. "kernal_size = 3\n",
  303. "C_wn = 128 # input channel size of in_layer\n",
  304. "C_wn_middle = C_wn * 2 # output channel size of in_layer and cond_layer\n",
  305. "n_flows = 12\n",
  306. "n_layers = 8\n",
  307. "n_early_output = 16\n",
  308. "n_early_output_interval = 2\n",
  309. "duration = 0.725\n",
  310. "\n",
  311. "n_audio_channels = []\n",
  312. "n_audio = n_audio_channel_init\n",
  313. "for i in range(n_flows):\n",
  314. " if i % n_early_output_interval == 0 and i > 0:\n",
  315. " n_audio -= n_early_output\n",
  316. " n_audio_channels.append(n_audio) # audio channel after early output\n",
  317. "\n",
  318. "# in_layers\n",
  319. "WN_in_layers = L * kernal_size * C_wn * n_layers * n_flows # depthwise\n",
  320. "WN_in_layers += L * C_wn * C_wn_middle * n_layers * n_flows # pointwise\n",
  321. "print('MACs of in_layers', WN_in_layers / duration / 1e9)\n",
  322. "# cond_layers\n",
  323. "WN_cond_layers = L_mel * C_mel * C_wn_middle * n_layers * n_flows\n",
  324. "print('MACs of cond_layers', WN_cond_layers / duration / 1e9)\n",
  325. "# res_skip_layers\n",
  326. "WN_res_layers = L * C_wn * C_wn * n_layers * n_flows\n",
  327. "print('MACs of res_skip_layers', WN_res_layers / duration / 1e9)\n",
  328. "# invertible convs\n",
  329. "inv1x1 = np.sum([n**2 * L for n in n_audio_channels])\n",
  330. "print('MACs of invertible conv layers', inv1x1 / duration / 1e9)\n",
  331. "# start\n",
  332. "starts = np.sum([n / 2 * C_wn * L for n in n_audio_channels])\n",
  333. "print('MACs of start conv layers', starts / duration / 1e9)\n",
  334. "#end\n",
  335. "ends = np.sum([C_wn * n * L for n in n_audio_channels])\n",
  336. "print('MACs of end conv layers', ends / duration / 1e9)\n",
  337. "# total\n",
  338. "total = WN_in_layers + WN_cond_layers + WN_res_layers + inv1x1 + starts + ends\n",
  339. "print('Total number of MACs is', total / duration / 1e9)\n",
  340. "print('Reduction compared with WaveGlow', WG_total / total)"
  341. ],
  342. "execution_count": 0,
  343. "outputs": [
  344. {
  345. "output_type": "stream",
  346. "text": [
  347. "('MACs of in_layers', 0.5618921048275862)\n",
  348. "('MACs of cond_layers', 0.17355740689655172)\n",
  349. "('MACs of res_skip_layers', 0.2776918510344828)\n",
  350. "('MACs of invertible conv layers', 0.017988502068965517)\n",
  351. "('MACs of start conv layers', 0.011932071724137933)\n",
  352. "('MACs of end conv layers', 0.023864143448275865)\n",
  353. "('Total number of MACs is', 1.06692608)\n",
  354. "('Reduction compared with WaveGlow', 214)\n"
  355. ],
  356. "name": "stdout"
  357. }
  358. ]
  359. },
  360. {
  361. "cell_type": "markdown",
  362. "metadata": {
  363. "id": "1kWvIBWU4Vwm",
  364. "colab_type": "text"
  365. },
  366. "source": [
  367. "**SqueezeWave L=128, C=256**"
  368. ]
  369. },
  370. {
  371. "cell_type": "code",
  372. "metadata": {
  373. "id": "6YM2bkC14WWc",
  374. "colab_type": "code",
  375. "outputId": "b1fd3d03-0135-400e-cfbc-28746c8d0cf0",
  376. "colab": {
  377. "base_uri": "https://localhost:8080/",
  378. "height": 153
  379. }
  380. },
  381. "source": [
  382. "L = 128 # audio length\n",
  383. "n_audio_channel_init = 128 # initial audio channel \n",
  384. "L_mel = 64 # mel-spectrogram length\n",
  385. "C_mel =80 # mel-spectrogram channel \n",
  386. "kernal_size = 3\n",
  387. "C_wn = 256 # input channel size of in_layer\n",
  388. "C_wn_middle = C_wn * 2 # output channel size of in_layer and cond_layer\n",
  389. "n_flows = 12\n",
  390. "n_layers = 8\n",
  391. "n_early_output = 16\n",
  392. "n_early_output_interval = 2\n",
  393. "duration = 0.725\n",
  394. "\n",
  395. "n_audio_channels = []\n",
  396. "n_audio = n_audio_channel_init\n",
  397. "for i in range(n_flows):\n",
  398. " if i % n_early_output_interval == 0 and i > 0:\n",
  399. " n_audio -= n_early_output\n",
  400. " n_audio_channels.append(n_audio) # audio channel after early output\n",
  401. "\n",
  402. "# in_layers\n",
  403. "WN_in_layers = L * kernal_size * C_wn * n_layers * n_flows # depthwise\n",
  404. "WN_in_layers += L * C_wn * C_wn_middle * n_layers * n_flows # pointwise\n",
  405. "print('MACs of in_layers', WN_in_layers / duration / 1e9)\n",
  406. "# cond_layers\n",
  407. "WN_cond_layers = L_mel * C_mel * C_wn_middle * n_layers * n_flows\n",
  408. "print('MACs of cond_layers', WN_cond_layers / duration / 1e9)\n",
  409. "# res_skip_layers\n",
  410. "WN_res_layers = L * C_wn * C_wn * n_layers * n_flows\n",
  411. "print('MACs of res_skip_layers', WN_res_layers / duration / 1e9)\n",
  412. "# invertible convs\n",
  413. "inv1x1 = np.sum([n**2 * L for n in n_audio_channels])\n",
  414. "print('MACs of invertible conv layers', inv1x1 / duration / 1e9)\n",
  415. "# start\n",
  416. "starts = np.sum([n / 2 * C_wn * L for n in n_audio_channels])\n",
  417. "print('MACs of start conv layers', starts / duration / 1e9)\n",
  418. "#end\n",
  419. "ends = np.sum([C_wn * n * L for n in n_audio_channels])\n",
  420. "print('MACs of end conv layers', ends / duration / 1e9)\n",
  421. "# total\n",
  422. "total = WN_in_layers + WN_cond_layers + WN_res_layers + inv1x1 + starts + ends\n",
  423. "print('Total number of MACs is', total / duration / 1e9)\n",
  424. "print('Reduction compared with WaveGlow', WG_total / total)"
  425. ],
  426. "execution_count": 0,
  427. "outputs": [
  428. {
  429. "output_type": "stream",
  430. "text": [
  431. "('MACs of in_layers', 2.2345516137931036)\n",
  432. "('MACs of cond_layers', 0.34711481379310344)\n",
  433. "('MACs of res_skip_layers', 1.1107674041379312)\n",
  434. "('MACs of invertible conv layers', 0.017988502068965517)\n",
  435. "('MACs of start conv layers', 0.023864143448275865)\n",
  436. "('MACs of end conv layers', 0.04772828689655173)\n",
  437. "('Total number of MACs is', 3.7820147641379314)\n",
  438. "('Reduction compared with WaveGlow', 60)\n"
  439. ],
  440. "name": "stdout"
  441. }
  442. ]
  443. }
  444. ]
  445. }