From cd851585cb6f96128fac5b590e3ab4aa1ab59b95 Mon Sep 17 00:00:00 2001
From: Rafael Valle
Date: Tue, 15 May 2018 09:50:08 -0700
Subject: [PATCH 1/3] loss_scaler.py: patching loss scaler for compatibility
 with current pytorch

---
 loss_scaler.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/loss_scaler.py b/loss_scaler.py
index c7dfa13..0662a60 100644
--- a/loss_scaler.py
+++ b/loss_scaler.py
@@ -51,11 +51,10 @@ class DynamicLossScaler:
 
     # `x` is a torch.Tensor
     def _has_inf_or_nan(x):
-        inf_count = torch.sum(x.abs() == float('inf'))
-        if inf_count > 0:
+        cpu_sum = float(x.float().sum())
+        if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
             return True
-        nan_count = torch.sum(x != x)
-        return nan_count > 0
+        return False
 
     # `overflow` is boolean indicating whether we overflowed in gradient
     def update_scale(self, overflow):

From 10710230173ee87048133f089a9f1355d22d1547 Mon Sep 17 00:00:00 2001
From: Rafael Valle
Date: Tue, 15 May 2018 09:50:56 -0700
Subject: [PATCH 2/3] train.py: replacing score_mask_value, formerly inf, with
 a concrete value for compatibility with pytorch

---
 train.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/train.py b/train.py
index ee01b07..b13f01f 100644
--- a/train.py
+++ b/train.py
@@ -2,6 +2,7 @@ import os
 import time
 import argparse
 import math
+from numpy import finfo
 
 import torch
 from distributed import DistributedDataParallel
@@ -77,7 +78,9 @@ def prepare_directories_and_logger(output_directory, log_directory, rank):
 
 def load_model(hparams):
     model = Tacotron2(hparams).cuda()
-    model = batchnorm_to_float(model.half()) if hparams.fp16_run else model
+    if hparams.fp16_run:
+        model = batchnorm_to_float(model.half())
+        model.decoder.attention_layer.score_mask_value = float(finfo('float16').min)
 
     if hparams.distributed_run:
         model = DistributedDataParallel(model)

From 27b1767cb25dad5d919ce1311b4bfdcefa64e13c Mon Sep 17 00:00:00 2001
From: Rafael Valle
Date: Tue, 15 May 2018 09:53:33 -0700
Subject: [PATCH 3/3] train.py: fixing typo

---
 train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train.py b/train.py
index b13f01f..786d1dd 100644
--- a/train.py
+++ b/train.py
@@ -279,7 +279,7 @@ if __name__ == '__main__':
     torch.backends.cudnn.benchmark = hparams.cudnn_benchmark
 
     print("FP16 Run:", hparams.fp16_run)
-    print("Dynamic Loss Scaling", hparams.dynamic_loss_scaling)
+    print("Dynamic Loss Scaling:", hparams.dynamic_loss_scaling)
     print("Distributed Run:", hparams.distributed_run)
     print("cuDNN Enabled:", hparams.cudnn_enabled)
     print("cuDNN Benchmark:", hparams.cudnn_benchmark)
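
A minimal standalone sketch of the check introduced in PATCH 1/3 (the function
name and test tensors below are illustrative, not from the repository): an inf
or nan anywhere in a tensor propagates through .sum(), and nan is the only
float that compares unequal to itself, so a single reduction plus three scalar
comparisons replaces the two elementwise boolean reductions whose behavior
differed across pytorch releases.

    import torch

    def has_inf_or_nan(x):
        # One device-to-host reduction: any inf or nan element poisons
        # the sum, so no elementwise boolean reduction is needed.
        cpu_sum = float(x.float().sum())
        # nan is the only value for which cpu_sum != cpu_sum holds.
        return (cpu_sum == float('inf') or cpu_sum == -float('inf')
                or cpu_sum != cpu_sum)

    print(has_inf_or_nan(torch.randn(4)))                     # False
    print(has_inf_or_nan(torch.tensor([1.0, float('nan')])))  # True
    print(has_inf_or_nan(torch.tensor([1.0, float('inf')])))  # True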
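
And a sketch of one way the difference behind PATCH 2/3 shows up, run in
float32 on CPU for illustration (the shapes and mask are made up): masking
attention scores with -inf turns an all-masked softmax row into nan (every
exp is 0, then 0/0), whereas the finite float16 minimum, -65504.0, stays
representable in half precision and still pushes masked positions to ~0 weight.

    import torch
    from numpy import finfo

    scores = torch.randn(2, 4)
    mask = torch.tensor([[False, False, True, True],   # row 0: partially masked
                         [True,  True,  True, True]])  # row 1: fully masked

    # Masking with -inf: row 1 comes out all nan (its exp-sum is 0, giving 0/0).
    print(torch.softmax(scores.masked_fill(mask, float('-inf')), dim=1))

    # Masking with the finite float16 minimum, as in the patch: masked positions
    # in row 0 get ~0 weight, and row 1 stays nan-free (it becomes uniform).
    score_mask_value = float(finfo('float16').min)  # -65504.0
    print(torch.softmax(scores.masked_fill(mask, score_mask_value), dim=1))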