diff --git a/distributed.py b/distributed.py
index 1dd5910..9076a73 100644
--- a/distributed.py
+++ b/distributed.py
@@ -1,6 +1,7 @@
 import torch
 import torch.distributed as dist
 from torch.nn.modules import Module
+from torch.autograd import Variable
 
 def _flatten_dense_tensors(tensors):
     """Flatten dense tensors into a contiguous 1D buffer. Assume tensors are of
@@ -161,12 +162,12 @@ def apply_gradient_allreduce(module):
 
         for param in list(module.parameters()):
             def allreduce_hook(*unused):
-                param._execution_engine.queue_callback(allreduce_params)
+                Variable._execution_engine.queue_callback(allreduce_params)
             if param.requires_grad:
                 param.register_hook(allreduce_hook)
-   
+
         def set_needs_reduction(self, input, output):
             self.needs_reduction = True
-        
+
         module.register_forward_hook(set_needs_reduction)
         return module