PyTorch + Windows: is parallelization across multiple GPUs possible?
I have 2 GPUs, but I cannot run the simple official PyTorch data-parallelism demo. I am using Python 3.6.1, PyTorch 0.2.1+a4fc05a, and 64-bit Windows 7.
Here is a copy of the demo:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

# Parameters and DataLoaders
input_size = 5
output_size = 2
batch_size = 30
data_size = 100

class RandomDataset(Dataset):

    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len

rand_loader = DataLoader(dataset=RandomDataset(input_size, 100),
                         batch_size=batch_size, shuffle=True)

class Model(nn.Module):
    # Our model

    def __init__(self, input_size, output_size):
        super(Model, self).__init__()
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, input):
        output = self.fc(input)
        print(" In Model: input size", input.size(),
              "output size", output.size())
        return output

model = Model(input_size, output_size)
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model, device_ids=[0, 1])  # IF I COMMENT THIS LINE, EVERYTHING WORKS BUT WITHOUT PARALLELIZATION

if torch.cuda.is_available():
    model.cuda()

for data in rand_loader:
    if torch.cuda.is_available():
        input_var = Variable(data.cuda())
    else:
        input_var = Variable(data)

    output = model(input_var)
    print("Outside: input size", input_var.size(),
          "output_size", output.size())
And here are the output and the error:
Let's use 2 GPUs!
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-1-a1634922f845> in <module>()
54 input_var = Variable(data)
55
---> 56 output = model(input_var)
57 print("Outside: input size", input_var.size(),
58 "output_size", output.size())
C:\Development\Anaconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
222 for hook in self._forward_pre_hooks.values():
223 hook(self, input)
--> 224 result = self.forward(*input, **kwargs)
225 for hook in self._forward_hooks.values():
226 hook_result = hook(self, input, result)
C:\Development\Anaconda3\lib\site-packages\torch\nn\parallel\data_parallel.py in forward(self, *inputs, **kwargs)
57 if len(self.device_ids) == 1:
58 return self.module(*inputs[0], **kwargs[0])
---> 59 replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
60 outputs = self.parallel_apply(replicas, inputs, kwargs)
61 return self.gather(outputs, self.output_device)
C:\Development\Anaconda3\lib\site-packages\torch\nn\parallel\data_parallel.py in replicate(self, module, device_ids)
62
63 def replicate(self, module, device_ids):
---> 64 return replicate(module, device_ids)
65
66 def scatter(self, inputs, kwargs, device_ids):
C:\Development\Anaconda3\lib\site-packages\torch\nn\parallel\replicate.py in replicate(network, devices)
10 params = list(network.parameters())
11 param_indices = {param: idx for idx, param in enumerate(params)}
---> 12 param_copies = Broadcast(devices)(*params)
13 if len(params) > 0:
14 param_copies = [param_copies[i:i + len(params)]
C:\Development\Anaconda3\lib\site-packages\torch\nn\parallel\_functions.py in forward(self, *inputs)
17 self.num_inputs = len(inputs)
18 self.input_device = inputs[0].get_device()
---> 19 outputs = comm.broadcast_coalesced(inputs, self.target_gpus)
20 return tuple([t for tensors in outputs for t in tensors])
21
C:\Development\Anaconda3\lib\site-packages\torch\cuda\comm.py in broadcast_coalesced(tensors, devices, buffer_size)
52 outputs[0].extend(tensors)
53 for chunk in _take_tensors(tensors, buffer_size):
---> 54 results = broadcast(_flatten_tensors(chunk), devices)
55 # use the broadcasted tensors for the remaining devices
56 for dst, res in zip(outputs[1:], results[1:]):
C:\Development\Anaconda3\lib\site-packages\torch\cuda\comm.py in broadcast(tensor, devices)
17 corresponding to indices from ``devices``.
18 """
---> 19 if nccl.is_available([tensor]) and len(set(devices)) == len(devices):
20 tensors = [tensor]
21 for device in devices[1:]:
C:\Development\Anaconda3\lib\site-packages\torch\cuda\nccl.py in is_available(tensors)
33 devices.add(device)
34
---> 35 if _libnccl() is None:
36 warnings.warn('NCCL library not found. Check your LD_LIBRARY_PATH')
37 return False
C:\Development\Anaconda3\lib\site-packages\torch\cuda\nccl.py in _libnccl()
13 global lib
14 if lib is None:
---> 15 lib = ctypes.pydll.LoadLibrary(None)
16 if hasattr(lib, 'ncclCommDestroy'):
17 lib.ncclCommDestroy.restype = None
C:\Development\Anaconda3\lib\ctypes\__init__.py in LoadLibrary(self, name)
424
425 def LoadLibrary(self, name):
--> 426 return self._dlltype(name)
427
428 cdll = LibraryLoader(CDLL)
C:\Development\Anaconda3\lib\ctypes\__init__.py in __init__(self, name, mode, handle, use_errno, use_last_error)
346
347 if handle is None:
--> 348 self._handle = _dlopen(self._name, mode)
349 else:
350 self._handle = handle
TypeError: bad argument type for built-in operation
If I comment out the line
model = nn.DataParallel(model, device_ids=[0,1])
everything works, but without parallelization.
The exception is raised by the call
ctypes.pydll.LoadLibrary(None)
inside torch/cuda/nccl.py.
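If I read the traceback correctly, _libnccl() tries to dlopen the current process by passing None, which works on Linux but not on Windows. The same error seems to be reproducible outside PyTorch (this is my own minimal test, not part of the demo):

import ctypes

# On Linux, passing None asks dlopen for a handle to the main program;
# on my Windows 7 / Python 3.6 setup this raises
# "TypeError: bad argument type for built-in operation",
# which looks like exactly the error in the traceback above.
ctypes.pydll.LoadLibrary(None)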
Is there any solution or workaround for this? Any help would be appreciated.
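In case it helps narrow things down: one workaround I am considering (untested, and it assumes that torch.cuda.comm.broadcast has a usable non-NCCL fallback when nccl.is_available returns False) is to monkey-patch the NCCL availability check before wrapping the model, so that _libnccl() is never reached:

import torch
import torch.cuda.nccl
import torch.nn as nn

# UNTESTED ASSUMPTION: report NCCL as unavailable so that
# torch.cuda.comm.broadcast takes its non-NCCL code path and
# ctypes.pydll.LoadLibrary(None) is never called.
torch.cuda.nccl.is_available = lambda tensors: False

# Model, input_size and output_size are the ones defined in the demo above.
model = Model(input_size, output_size)
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model, device_ids=[0, 1])
if torch.cuda.is_available():
    model.cuda()

I have not been able to verify whether the fallback broadcast path actually works on Windows, so a confirmed solution would still be very welcome.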