PyTorch+Windows: возможно ли распараллеливание на нескольких графических процессорах?

У меня есть 2 графических процессора, но я не могу запустить простую официальную демонстрацию распараллеливания из документации PyTorch. Я использую Python 3.6.1, PyTorch 0.2.1+a4fc05a, Windows 7 (64 бит).

Вот копия демо:

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

# Parameters and DataLoaders
# Width of each input sample and of the model's output.
input_size, output_size = 5, 2

# Samples per mini-batch and total number of samples.
batch_size, data_size = 30, 100

class RandomDataset(Dataset):
    """Dataset of random samples generated once at construction time.

    Args:
        size: number of features in each sample.
        length: number of samples held by the dataset.
    """

    def __init__(self, size, length):
        self.len = length
        # One row per sample; created up front so repeated indexing
        # returns the same values.
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        """Return the sample stored at position ``index``."""
        return self.data[index]

    def __len__(self):
        """Return the number of samples in the dataset."""
        return self.len

# Shuffled mini-batch loader over the random dataset; the sample count is
# taken from data_size instead of repeating the literal.
rand_loader = DataLoader(dataset=RandomDataset(input_size, data_size),
                         batch_size=batch_size, shuffle=True)

class Model(nn.Module):
    """Single fully connected layer that reports the sizes it sees.

    Args:
        input_size: number of input features of the linear layer.
        output_size: number of output features of the linear layer.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, input):
        """Apply the linear layer, print input/output sizes, return the result."""
        result = self.fc(input)
        in_shape, out_shape = input.size(), result.size()
        print("  In Model: input size", in_shape, "output size", out_shape)
        return result

model = Model(input_size, output_size)
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model, device_ids=[0,1])  # IF I COMMENT THIS LINE, EVERYTHING WORKS BUT WITHOUT PARALLELIZATION

if torch.cuda.is_available():
    # Move the model's parameters to the GPU when one is available.
    model.cuda()

for data in rand_loader:
    # Put the batch on the GPU when CUDA is available, otherwise keep it
    # on the CPU.
    if torch.cuda.is_available():
        input_var = Variable(data.cuda())
    else:
        input_var = Variable(data)

    output = model(input_var)
    print("Outside: input size", input_var.size(),
          "output_size", output.size())

И вот вывод и ошибка:

Let's use 2 GPUs!
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-1-a1634922f845> in <module>()
     54         input_var = Variable(data)
---> 56     output = model(input_var)
     57     print("Outside: input size", input_var.size(),
     58           "output_size", output.size())

C:\Development\Anaconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
    222         for hook in self._forward_pre_hooks.values():
    223             hook(self, input)
--> 224         result = self.forward(*input, **kwargs)
    225         for hook in self._forward_hooks.values():
    226             hook_result = hook(self, input, result)

C:\Development\Anaconda3\lib\site-packages\torch\nn\parallel\data_parallel.py in forward(self, *inputs, **kwargs)
     57         if len(self.device_ids) == 1:
     58             return self.module(*inputs[0], **kwargs[0])
---> 59         replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
     60         outputs = self.parallel_apply(replicas, inputs, kwargs)
     61         return self.gather(outputs, self.output_device)

C:\Development\Anaconda3\lib\site-packages\torch\nn\parallel\data_parallel.py in replicate(self, module, device_ids)
     63     def replicate(self, module, device_ids):
---> 64         return replicate(module, device_ids)
     66     def scatter(self, inputs, kwargs, device_ids):

C:\Development\Anaconda3\lib\site-packages\torch\nn\parallel\replicate.py in replicate(network, devices)
     10     params = list(network.parameters())
     11     param_indices = {param: idx for idx, param in enumerate(params)}
---> 12     param_copies = Broadcast(devices)(*params)
     13     if len(params) > 0:
     14         param_copies = [param_copies[i:i + len(params)]

C:\Development\Anaconda3\lib\site-packages\torch\nn\parallel\_functions.py in forward(self, *inputs)
     17         self.num_inputs = len(inputs)
     18         self.input_device = inputs[0].get_device()
---> 19         outputs = comm.broadcast_coalesced(inputs, self.target_gpus)
     20         return tuple([t for tensors in outputs for t in tensors])

C:\Development\Anaconda3\lib\site-packages\torch\cuda\comm.py in broadcast_coalesced(tensors, devices, buffer_size)
     52     outputs[0].extend(tensors)
     53     for chunk in _take_tensors(tensors, buffer_size):
---> 54         results = broadcast(_flatten_tensors(chunk), devices)
     55         # use the broadcasted tensors for the remaining devices
     56         for dst, res in zip(outputs[1:], results[1:]):

C:\Development\Anaconda3\lib\site-packages\torch\cuda\comm.py in broadcast(tensor, devices)
     17         corresponding to indices from ``devices``.
     18     """
---> 19     if nccl.is_available([tensor]) and len(set(devices)) == len(devices):
     20         tensors = [tensor]
     21         for device in devices[1:]:

C:\Development\Anaconda3\lib\site-packages\torch\cuda\nccl.py in is_available(tensors)
     33         devices.add(device)
---> 35     if _libnccl() is None:
     36         warnings.warn('NCCL library not found. Check your LD_LIBRARY_PATH')
     37         return False

C:\Development\Anaconda3\lib\site-packages\torch\cuda\nccl.py in _libnccl()
     13     global lib
     14     if lib is None:
---> 15         lib = ctypes.pydll.LoadLibrary(None)
     16         if hasattr(lib, 'ncclCommDestroy'):
     17             lib.ncclCommDestroy.restype = None

C:\Development\Anaconda3\lib\ctypes\__init__.py in LoadLibrary(self, name)
    425     def LoadLibrary(self, name):
--> 426         return self._dlltype(name)
    428 cdll = LibraryLoader(CDLL)

C:\Development\Anaconda3\lib\ctypes\__init__.py in __init__(self, name, mode, handle, use_errno, use_last_error)
    347         if handle is None:
--> 348             self._handle = _dlopen(self._name, mode)
    349         else:
    350             self._handle = handle

TypeError: bad argument type for built-in operation

Если я закомментирую строку

model = nn.DataParallel(model, device_ids=[0,1])

все работает, но без распараллеливания.

Исключение возникает из-за вызова

lib = ctypes.pydll.LoadLibrary(None)

Есть ли какое-нибудь решение или обходной путь? Буду благодарен за любую помощь.

0 ответов

Другие вопросы по тегам