diff --git a/loss_scaler.py b/loss_scaler.py index c7dfa13..0662a60 100644 --- a/loss_scaler.py +++ b/loss_scaler.py @@ -51,11 +51,10 @@ class DynamicLossScaler: # `x` is a torch.Tensor def _has_inf_or_nan(x): - inf_count = torch.sum(x.abs() == float('inf')) - if inf_count > 0: + cpu_sum = float(x.float().sum()) + if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: return True - nan_count = torch.sum(x != x) - return nan_count > 0 + return False # `overflow` is boolean indicating whether we overflowed in gradient def update_scale(self, overflow): diff --git a/train.py b/train.py index ee01b07..786d1dd 100644 --- a/train.py +++ b/train.py @@ -2,6 +2,7 @@ import os import time import argparse import math +from numpy import finfo import torch from distributed import DistributedDataParallel @@ -77,7 +78,9 @@ def prepare_directories_and_logger(output_directory, log_directory, rank): def load_model(hparams): model = Tacotron2(hparams).cuda() - model = batchnorm_to_float(model.half()) if hparams.fp16_run else model + if hparams.fp16_run: + model = batchnorm_to_float(model.half()) + model.decoder.attention_layer.score_mask_value = float(finfo('float16').min) if hparams.distributed_run: model = DistributedDataParallel(model) @@ -276,7 +279,7 @@ if __name__ == '__main__': torch.backends.cudnn.benchmark = hparams.cudnn_benchmark print("FP16 Run:", hparams.fp16_run) - print("Dynamic Loss Scaling", hparams.dynamic_loss_scaling) + print("Dynamic Loss Scaling:", hparams.dynamic_loss_scaling) print("Distributed Run:", hparams.distributed_run) print("cuDNN Enabled:", hparams.cudnn_enabled) print("cuDNN Benchmark:", hparams.cudnn_benchmark)