Fork of https://github.com/alokprasad/fastspeech_squeezewave that also fixes denoising in SqueezeWave.

# We retain the copyright notice by NVIDIA from the original code. However,
# we reserve our rights on the modifications based on the original code.
#
# *****************************************************************************
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of the NVIDIA CORPORATION nor the
#       names of its contributors may be used to endorse or promote products
#       derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************
import os
import sys
import time
import subprocess
import argparse

import torch
import torch.distributed as dist
from torch.autograd import Variable


def reduce_tensor(tensor, num_gpus):
    # Sum the tensor across all processes, then average by the number of GPUs
    # (used e.g. to report a loss value that is consistent across ranks).
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.reduce_op.SUM)
    # rt /= (num_gpus*2)
    rt /= num_gpus
    return rt

def init_distributed(rank, num_gpus, group_name, dist_backend, dist_url):
    assert torch.cuda.is_available(), "Distributed mode requires CUDA."
    print("Initializing Distributed")

    # Set cuda device so everything is done on the right GPU.
    torch.cuda.set_device(rank % torch.cuda.device_count())

    # os.environ['MASTER_ADDR'] = '172.31.44.232'
    # os.environ['MASTER_PORT'] = '58217'

    # Initialize distributed communication
    dist.init_process_group(dist_backend, init_method=dist_url,
                            world_size=num_gpus, rank=rank,
                            group_name=group_name)

def _flatten_dense_tensors(tensors):
    """Flatten dense tensors into a contiguous 1D buffer. Assume tensors are of
    same dense type.

    Since inputs are dense, the resulting tensor will be a concatenated 1D
    buffer. Element-wise operation on this buffer will be equivalent to
    operating individually.

    Arguments:
        tensors (Iterable[Tensor]): dense tensors to flatten.

    Returns:
        A contiguous 1D buffer containing input tensors.
    """
    if len(tensors) == 1:
        return tensors[0].contiguous().view(-1)
    flat = torch.cat([t.contiguous().view(-1) for t in tensors], dim=0)
    return flat

def _unflatten_dense_tensors(flat, tensors):
    """View a flat buffer using the sizes of tensors. Assume that tensors are of
    same dense type, and that flat is given by _flatten_dense_tensors.

    Arguments:
        flat (Tensor): flattened dense tensors to unflatten.
        tensors (Iterable[Tensor]): dense tensors whose sizes will be used to
            unflatten flat.

    Returns:
        Unflattened dense tensors with sizes same as tensors and values from
        flat.
    """
    outputs = []
    offset = 0
    for tensor in tensors:
        numel = tensor.numel()
        outputs.append(flat.narrow(0, offset, numel).view_as(tensor))
        offset += numel
    return tuple(outputs)

def apply_gradient_allreduce(module):
    """
    Modifies an existing model to all-reduce its gradients across processes,
    without changing its class (so there is no wrapper and no ".module"
    attribute to unwrap afterwards).
    """
    if not hasattr(dist, '_backend'):
        module.warn_on_half = True
    else:
        module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

    # Broadcast the initial parameters/buffers from rank 0 so that all
    # processes start from identical weights.
    for p in module.state_dict().values():
        if not torch.is_tensor(p):
            continue
        dist.broadcast(p, 0)

    def allreduce_params():
        if module.needs_reduction:
            module.needs_reduction = False
            # Bucket gradients by tensor type so each bucket can be flattened
            # into a single contiguous buffer and all-reduced in one call.
            buckets = {}
            for param in module.parameters():
                if param.requires_grad and param.grad is not None:
                    tp = type(param.data)
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
            if module.warn_on_half:
                if torch.cuda.HalfTensor in buckets:
                    print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                          " It is recommended to use the NCCL backend in this case. This currently requires " +
                          "PyTorch built from top of tree master.")
                    module.warn_on_half = False

            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                coalesced = _flatten_dense_tensors(grads)
                dist.all_reduce(coalesced)
                coalesced /= dist.get_world_size()
                for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                    buf.copy_(synced)

    # Queue the all-reduce once per backward pass via a gradient hook.
    for param in list(module.parameters()):
        def allreduce_hook(*unused):
            Variable._execution_engine.queue_callback(allreduce_params)
        if param.requires_grad:
            param.register_hook(allreduce_hook)
            dir(param)

    def set_needs_reduction(self, input, output):
        self.needs_reduction = True

    module.register_forward_hook(set_needs_reduction)
    return module
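
# Note: apply_gradient_allreduce() above is the legacy, hand-rolled counterpart of
# torch.nn.parallel.DistributedDataParallel: weights are broadcast from rank 0 once,
# a backward hook queues allreduce_params(), and gradients are bucketed by tensor
# type, flattened, summed across processes and divided by the world size before the
# optimizer step. On current PyTorch one would normally use DistributedDataParallel
# instead; this version is kept for compatibility with the original
# WaveGlow/SqueezeWave training code.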

def main(config, stdout_dir, args_str):
    args_list = ['-u']
    args_list.append('train.py')
    args_list += args_str.split(' ') if len(args_str) > 0 else []

    args_list.append('--config={}'.format(config))

    num_gpus = torch.cuda.device_count()
    args_list.append('--num_gpus={}'.format(num_gpus))
    # Placeholder rank; it is overwritten per process below via args_list[-2].
    args_list.append('--rank={}'.format(0))
    args_list.append("--group_name=group_{}".format(time.strftime("%Y_%m_%d-%H%M%S")))

    if not os.path.isdir(stdout_dir):
        os.makedirs(stdout_dir)
        os.chmod(stdout_dir, 0o775)

    workers = []

    for i in range(num_gpus):
        args_list[-2] = '--rank={}'.format(i)
        # Rank 0 writes to the console; other ranks log to stdout_dir/GPU_i.log.
        stdout = None if i == 0 else open(
            os.path.join(stdout_dir, "GPU_{}.log".format(i)), "w")
        print(args_list)
        p = subprocess.Popen([str(sys.executable)] + args_list, stdout=stdout)
        workers.append(p)

    for p in workers:
        p.wait()

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str, required=True,
                        help='JSON file for configuration')
    parser.add_argument('-s', '--stdout_dir', type=str, default=".",
                        help='directory to save stdout logs')
    parser.add_argument(
        '-a', '--args_str', type=str, default='',
        help='double quoted string with space separated key value pairs')
    args = parser.parse_args()

    main(args.config, args.stdout_dir, args.args_str)
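
For orientation, below is a minimal sketch of the per-process side that this launcher spawns. It assumes the file above is saved as distributed.py so it can be imported; the run() wrapper, the torch.nn.Linear stand-in model, the random input tensors and the 'nccl' / 'tcp://localhost:54321' backend settings are illustrative placeholders, not taken from the repo's actual train.py.

# sketch_train.py -- hypothetical per-process training loop (NOT part of this repo).
# Only init_distributed, apply_gradient_allreduce and reduce_tensor come from the
# file above; everything else is a stand-in.
import torch
from distributed import init_distributed, apply_gradient_allreduce, reduce_tensor


def run(rank=0, num_gpus=1, group_name='group_demo',
        dist_backend='nccl', dist_url='tcp://localhost:54321'):
    distributed = num_gpus > 1
    if distributed:
        # Joins the process group; the launcher passes a distinct --rank to each copy.
        init_distributed(rank, num_gpus, group_name, dist_backend, dist_url)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = torch.nn.Linear(80, 80).to(device)   # stand-in for the vocoder network
    if distributed:
        # Broadcasts weights from rank 0 and installs the gradient all-reduce hooks.
        model = apply_gradient_allreduce(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    for step in range(10):
        x = torch.randn(16, 80, device=device)   # fake mel-spectrogram frames
        optimizer.zero_grad()
        loss = model(x).pow(2).mean()
        loss.backward()        # the all-reduce hook fires here when distributed
        optimizer.step()
        # Average the reported loss over all ranks so every process logs the same value.
        logged = reduce_tensor(loss.data, num_gpus).item() if distributed else loss.item()
        if rank == 0:
            print('step {}: loss {:.4f}'.format(step, logged))


if __name__ == '__main__':
    run()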