Adafactor из библиотеки transformers от Hugging Face работает только с Трансформерами — разве он не должен работать с ResNet и MAML (библиотека higher)?
Воспроизвести
Я использую алгоритм метаобучения MAML (через библиотеку higher) с моделью ResNet. Я вижу, что это вызывает проблемы в моём сценарии (сообщение об ошибке вставлено ниже). Разве Adafactor не должен работать с ResNet и другими моделями?
Шаги по воспроизведению поведения:
- запустите этот код: https://github.com/brando90/higher/blob/master/examples/maml-omniglot.py (у него уже есть adafactor)
- если это сработает, раскомментируйте строку resnet12 и пингуйте меня, пожалуйста
Ожидаемое поведение
Я ожидаю, что тренировка пройдет гладко, но вместо этого я получаю:
--------------------- META-TRAIN ------------------------
Starting training!
Traceback (most recent call last):
File "/home/miranda9/automl-meta-learning/automl-proj-src/experiments/meta_learning/main_metalearning.py", line 441, in <module>
main_resume_from_checkpoint(args)
File "/home/miranda9/automl-meta-learning/automl-proj-src/experiments/meta_learning/main_metalearning.py", line 403, in main_resume_from_checkpoint
run_training(args)
File "/home/miranda9/automl-meta-learning/automl-proj-src/experiments/meta_learning/main_metalearning.py", line 413, in run_training
meta_train_fixed_iterations(args)
File "/home/miranda9/automl-meta-learning/automl-proj-src/meta_learning/training/meta_training.py", line 233, in meta_train_fixed_iterations
args.outer_opt.step()
File "/home/miranda9/miniconda3/envs/metalearning_gpu/lib/python3.9/site-packages/torch/optim/optimizer.py", line 88, in wrapper
return func(*args, **kwargs)
File "/home/miranda9/miniconda3/envs/metalearning_gpu/lib/python3.9/site-packages/transformers/optimization.py", line 577, in step
update = self._approx_sq_grad(exp_avg_sq_row, exp_avg_sq_col)
File "/home/miranda9/miniconda3/envs/metalearning_gpu/lib/python3.9/site-packages/transformers/optimization.py", line 508, in _approx_sq_grad
return torch.mm(r_factor.unsqueeze(-1), c_factor.unsqueeze(0))
RuntimeError: mat1 must be a matrix, got 4-D tensor
полный вывод ошибок:
('PID', '25721')
('always_use_deterministic_algorithms', False)
('args_hardcoded_in_script', False)
('base_model_mode', 'resnet12_rsf')
('best_val_loss', inf)
('condor_jobid', -1)
('copy_initial_weights', False)
('current_logs_path', '/home/miranda9/data/logs/logs_Nov05_15-44-03_jobid_668')
('current_time', 'Nov30_08-42-53')
('data_path', 'miniimagenet')
('debug', False)
('debug_test', False)
('device', device(type='cuda'))
('epoch_num', -1)
('eval_iters', 2)
('experiment_name', 'debug')
('fo', False)
('force_log', True)
('githash', '9af491c')
('githash_long', '9af491ccd13fa88f4d07287f54305488ba4967fc')
('githash_short', '9af491c')
('gpu_name', 'NVIDIA GeForce GTX TITAN X')
('grad_clip_mode', None)
('grad_clip_rate', None)
('hostname', 'vision-02.cs.illinois.edu')
('inner_debug_eval', False)
('inner_debug_train', False)
('inner_lr', 0.1)
('it', 0)
('jobid', 10340)
('k_eval', 15)
('k_shots', 5)
('log_root', PosixPath('/home/miranda9/data/logs/logs_Nov30_08-42-53_jobid_10340'))
('log_to_wandb', True)
('log_train_freq', 200)
('log_val_freq', 200)
('logger', <uutils.logger.Logger object at 0x2b832f5eff70>)
('logging', True)
('mail_user', 'brando.science@gmail.com')
('master_port', '37126')
('meta_batch_size_eval', 2)
('meta_batch_size_train', 2)
('meta_learner', 'maml_fixed_inner_lr')
('metrics_as_dist', False)
('my_stdout_filepath', '/home/miranda9/data/logs/logs_Nov05_15-44-03_jobid_668/my_stdout.log')
('n_classes', 5)
('nb_inner_train_steps', 4)
('nccl', 2708)
('num_epochs', -1)
('num_its', 3)
('num_workers', 4)
('outer_debug', False)
('outer_lr', 0.001)
('path_to_checkpoint', PosixPath('/home/miranda9/data_folder_fall2020_spring2021/logs/nov_all_mini_imagenet_expts/logs_Nov05_15-44-03_jobid_668'))
('pin_memory', False)
('pw_path', '/home/miranda9/pw_app.config.json')
('rank', -1)
('run_name', 'debug (Adafactor) : args.jobid=10340')
('save_ckpt', True)
('seed', None)
('serial', False)
('show_layerwise_sims', False)
('sim_compute_parallel', False)
('slurm_array_task_id', -1)
('slurm_jobid', 10340)
('split', 'train')
('tb', True)
('track_higher_grads', True)
('train_iters', 500000)
('trainin_with_epochs', False)
('training_mode', 'iterations')
('wandb_entity', 'brando')
('wandb_group', 'experiment_debug')
('wandb_project', 'sl_vs_ml_iclr_workshop_paper')
------- Main Resume from Checkpoint --------
args.base_model=ResNet(
(layer1): Sequential(
(0): BasicBlock(
(conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): LeakyReLU(negative_slope=0.1)
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(downsample): Sequential(
(0): Conv2d(3, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(DropBlock): DropBlock()
)
)
(layer2): Sequential(
(0): BasicBlock(
(conv1): Conv2d(64, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): LeakyReLU(negative_slope=0.1)
(conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn3): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(downsample): Sequential(
(0): Conv2d(64, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(DropBlock): DropBlock()
)
)
(layer3): Sequential(
(0): BasicBlock(
(conv1): Conv2d(160, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): LeakyReLU(negative_slope=0.1)
(conv2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn3): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(downsample): Sequential(
(0): Conv2d(160, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(DropBlock): DropBlock()
)
)
(layer4): Sequential(
(0): BasicBlock(
(conv1): Conv2d(320, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): LeakyReLU(negative_slope=0.1)
(conv2): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn3): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(downsample): Sequential(
(0): Conv2d(320, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(DropBlock): DropBlock()
)
)
(avgpool): AdaptiveAvgPool2d(output_size=1)
(dropout): Dropout(p=0.0, inplace=False)
(classifier): Linear(in_features=640, out_features=5, bias=True)
)
args.outer_opt=Adafactor (
Parameter Group 0
beta1: None
clip_threshold: 1.0
decay_rate: -0.8
eps: (1e-30, 0.001)
lr: None
relative_step: True
scale_parameter: True
warmup_init: True
weight_decay: 0.0
)
args.meta_learner=MAMLMetaLearner(
(base_model): ResNet(
(layer1): Sequential(
(0): BasicBlock(
(conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): LeakyReLU(negative_slope=0.1)
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(downsample): Sequential(
(0): Conv2d(3, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(DropBlock): DropBlock()
)
)
(layer2): Sequential(
(0): BasicBlock(
(conv1): Conv2d(64, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): LeakyReLU(negative_slope=0.1)
(conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn3): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(downsample): Sequential(
(0): Conv2d(64, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(DropBlock): DropBlock()
)
)
(layer3): Sequential(
(0): BasicBlock(
(conv1): Conv2d(160, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): LeakyReLU(negative_slope=0.1)
(conv2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn3): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(downsample): Sequential(
(0): Conv2d(160, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(DropBlock): DropBlock()
)
)
(layer4): Sequential(
(0): BasicBlock(
(conv1): Conv2d(320, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): LeakyReLU(negative_slope=0.1)
(conv2): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn3): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(downsample): Sequential(
(0): Conv2d(320, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(DropBlock): DropBlock()
)
)
(avgpool): AdaptiveAvgPool2d(output_size=1)
(dropout): Dropout(p=0.0, inplace=False)
(classifier): Linear(in_features=640, out_features=5, bias=True)
)
)
args.scheduler=None
--------------------- META-TRAIN ------------------------
Starting training!
Traceback (most recent call last):
File "/home/miranda9/automl-meta-learning/automl-proj-src/experiments/meta_learning/main_metalearning.py", line 441, in <module>
main_resume_from_checkpoint(args)
File "/home/miranda9/automl-meta-learning/automl-proj-src/experiments/meta_learning/main_metalearning.py", line 403, in main_resume_from_checkpoint
run_training(args)
File "/home/miranda9/automl-meta-learning/automl-proj-src/experiments/meta_learning/main_metalearning.py", line 413, in run_training
meta_train_fixed_iterations(args)
File "/home/miranda9/automl-meta-learning/automl-proj-src/meta_learning/training/meta_training.py", line 233, in meta_train_fixed_iterations
args.outer_opt.step()
File "/home/miranda9/miniconda3/envs/metalearning_gpu/lib/python3.9/site-packages/torch/optim/optimizer.py", line 88, in wrapper
return func(*args, **kwargs)
File "/home/miranda9/miniconda3/envs/metalearning_gpu/lib/python3.9/site-packages/transformers/optimization.py", line 577, in step
update = self._approx_sq_grad(exp_avg_sq_row, exp_avg_sq_col)
File "/home/miranda9/miniconda3/envs/metalearning_gpu/lib/python3.9/site-packages/transformers/optimization.py", line 508, in _approx_sq_grad
return torch.mm(r_factor.unsqueeze(-1), c_factor.unsqueeze(0))
RuntimeError: mat1 must be a matrix, got 4-D tensor
связанные с:
- https://github.com/huggingface/transformers/issues/14574
- https://github.com/facebookresearch/higher/issues/124
- Adafactor из библиотеки transformers от Hugging Face работает только с Трансформерами — разве он не должен работать с ResNet и MAML (библиотека higher)?
- https://www.reddit.com/r/pytorch/comments/r5p2pk/adafactor_from_transformers_hugging_face_only/