2017-08-24 157 views
1

我對代碼做了一些修改,使其不再使用 DataParallel 和 DistributedDataParallel,結果 PyTorch 報出 CUDA 運行時錯誤。代碼如下:

import argparse 
import os 
import shutil 
import time 

import torch 
import torch.nn as nn 
import torch.nn.parallel 
import torch.backends.cudnn as cudnn 
import torch.distributed as dist 
import torch.optim 
import torch.utils.data 
import torch.utils.data.distributed 
import torchvision.transforms as transforms 
import torchvision.datasets as datasets 
import torchvision.models as models 

# Alphabetical list of the model constructors exposed by torchvision.models:
# every lowercase, non-dunder, callable attribute of that module.
model_names = sorted(
    attr
    for attr, obj in models.__dict__.items()
    if attr.islower() and not attr.startswith("__") and callable(obj)
)

# Command-line interface of the training script.
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')

# -- dataset and model selection
parser.add_argument(
    'data', metavar='DIR',
    help='path to dataset')
parser.add_argument(
    '--arch', '-a', metavar='ARCH', default='resnet18', choices=model_names,
    help='model architecture: ' +
         ' | '.join(model_names) +
         ' (default: resnet18)')
parser.add_argument(
    '-j', '--workers', metavar='N', type=int, default=4,
    help='number of data loading workers (default: 4)')

# -- training schedule and optimiser hyper-parameters
parser.add_argument(
    '--epochs', metavar='N', type=int, default=90,
    help='number of total epochs to run')
parser.add_argument(
    '--start-epoch', metavar='N', type=int, default=0,
    help='manual epoch number (useful on restarts)')
parser.add_argument(
    '-b', '--batch-size', metavar='N', type=int, default=256,
    help='mini-batch size (default: 256)')
parser.add_argument(
    '--lr', '--learning-rate', metavar='LR', type=float, default=0.1,
    help='initial learning rate')
parser.add_argument(
    '--momentum', metavar='M', type=float, default=0.9,
    help='momentum')
parser.add_argument(
    '--weight-decay', '--wd', metavar='W', type=float, default=1e-4,
    help='weight decay (default: 1e-4)')

# -- logging, checkpointing and run mode
parser.add_argument(
    '--print-freq', '-p', metavar='N', type=int, default=10,
    help='print frequency (default: 10)')
parser.add_argument(
    '--resume', metavar='PATH', type=str, default='',
    help='path to latest checkpoint (default: none)')
parser.add_argument(
    '-e', '--evaluate', dest='evaluate', action='store_true',
    help='evaluate model on validation set')
parser.add_argument(
    '--pretrained', dest='pretrained', action='store_true',
    help='use pre-trained model')

# -- distributed training
parser.add_argument(
    '--world-size', type=int, default=1,
    help='number of distributed processes')
parser.add_argument(
    '--dist-url', type=str, default='tcp://224.66.41.62:23456',
    help='url used to set up distributed training')
parser.add_argument(
    '--dist-backend', type=str, default='gloo',
    help='distributed backend')

# Highest value returned by validate() so far. main() updates it after every
# epoch and restores it from the checkpoint when --resume is given.
best_prec1 = 0 


def main():
    """Parse the command line, build model and data loaders, then train.

    Reads the module-level ``parser`` and publishes the parsed options as
    the global ``args`` (read by ``train``, ``validate`` and
    ``adjust_learning_rate``). The best top-1 validation precision is kept
    in the global ``best_prec1``.

    With ``--evaluate`` a single validation pass is run and the function
    returns. Otherwise every epoch trains, validates and writes a
    checkpoint through ``save_checkpoint``.
    """
    global args, best_prec1
    args = parser.parse_args()

    args.distributed = args.world_size > 1

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    # The DataParallel / DistributedDataParallel wrappers are deliberately
    # not used, so the bare model has to be moved to the GPU for *every*
    # architecture. Moving it only for alexnet/vgg (or distributed runs)
    # left e.g. resnet18 on the CPU while the criterion lives on the GPU.
    # NOTE(review): without DistributedDataParallel each distributed process
    # trains its own replica with no gradient synchronisation -- confirm
    # that this is intended.
    model.cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    # NOTE(review): later torchvision releases renamed RandomSizedCrop and
    # Scale (used below) to RandomResizedCrop and Resize; switch when the
    # installed torchvision provides them.
    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomSizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    # shuffle and sampler are mutually exclusive: shuffle only when no
    # distributed sampler is in charge of the ordering.
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Scale(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict(),
        }, is_best)


def train(train_loader, model, criterion, optimizer, epoch):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        train_loader: iterable yielding ``(input, target)`` mini-batches.
        model: network to optimise; it is switched to train mode here.
        criterion: loss callable applied to ``(output, target)``.
        optimizer: its ``zero_grad`` and ``step`` are called once per batch.
        epoch: current epoch index, used only in the progress output.

    Timing, loss and precision statistics are printed every
    ``args.print_freq`` batches.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # No (Distributed)DataParallel wrapper scatters the batch any more,
        # so the images must be moved to the GPU explicitly, like the
        # targets. The asynchronous-copy flag is passed positionally because
        # its old keyword name (`async`) is a reserved word from Python 3.7
        # on; the positional form works with old and new torch alike.
        images = images.cuda(None, True)
        target = target.cuda(None, True)
        input_var = torch.autograd.Variable(images)
        target_var = torch.autograd.Variable(target)

        # compute output
        output = model(input_var)
        loss = criterion(output, target_var)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
        losses.update(loss.data[0], images.size(0))
        top1.update(prec1[0], images.size(0))
        top5.update(prec5[0], images.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                      epoch, i, len(train_loader), batch_time=batch_time,
                      data_time=data_time, loss=losses, top1=top1, top5=top5))


def validate(val_loader, model, criterion):
    """Evaluate ``model`` on ``val_loader`` and return the mean top-1 precision.

    Args:
        val_loader: iterable yielding ``(input, target)`` mini-batches.
        model: network to evaluate; it is switched to eval mode here.
        criterion: loss callable applied to ``(output, target)``.

    Returns:
        ``top1.avg``, the running average of the per-batch top-1 precision
        weighted by batch size.

    Progress is printed every ``args.print_freq`` batches, followed by a
    final summary line.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    for i, (images, target) in enumerate(val_loader):
        # No (Distributed)DataParallel wrapper scatters the batch any more,
        # so the images must be moved to the GPU explicitly, like the
        # targets. The asynchronous-copy flag is passed positionally because
        # its old keyword name (`async`) is a reserved word from Python 3.7
        # on; the positional form works with old and new torch alike.
        images = images.cuda(None, True)
        target = target.cuda(None, True)
        input_var = torch.autograd.Variable(images, volatile=True)
        target_var = torch.autograd.Variable(target, volatile=True)

        # compute output
        output = model(input_var)
        loss = criterion(output, target_var)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
        losses.update(loss.data[0], images.size(0))
        top1.update(prec1[0], images.size(0))
        top5.update(prec5[0], images.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            print('Test: [{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                      i, len(val_loader), batch_time=batch_time, loss=losses,
                      top1=top1, top5=top5))

    print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}'
          .format(top1=top1, top5=top5))

    return top1.avg


def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Serialise ``state`` to ``filename``.

    When ``is_best`` is true the file just written is additionally copied
    to ``model_best.pth.tar`` in the current working directory.
    """
    torch.save(state, filename)
    if not is_best:
        return
    shutil.copyfile(filename, 'model_best.pth.tar')


class AverageMeter(object):
    """Keeps the most recent value and a running weighted mean.

    Attributes set by ``reset``/``update``: ``val`` (last value), ``sum``
    (weighted total), ``count`` (total weight) and ``avg`` (``sum/count``).
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        for field in ('val', 'avg', 'sum', 'count'):
            setattr(self, field, 0)

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def adjust_learning_rate(optimizer, epoch):
    """Step schedule: the initial LR is divided by 10 after every 30 epochs.

    The resulting rate is written into every parameter group of
    ``optimizer``; the initial rate is read from the global ``args.lr``.
    """
    completed_steps = epoch // 30
    new_lr = args.lr * (0.1 ** completed_steps)
    for group in optimizer.param_groups:
        group['lr'] = new_lr


def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k

    Args:
        output: 2-D tensor of per-class scores, one row per sample.
        target: tensor of ground-truth class indices, one per sample.
        topk: iterable of the k values to evaluate.

    Returns:
        A list with one single-element tensor per k, holding the percentage
        of samples whose target is among the k highest-scoring classes.
    """
    maxk = max(topk)
    batch_size = target.size(0)

    # After the transpose, row r of `pred` holds every sample's (r+1)-th
    # best class.
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        # `correct` derives from a transposed tensor and may be
        # non-contiguous; view() cannot flatten such a slice, so make it
        # contiguous first.
        correct_k = correct[:k].contiguous().view(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res


# Start training only when the file is executed as a script, not on import.
if __name__ == '__main__': 
    main() 

而且,當我用 alexnet 神經網絡結構在一組圖像上運行這段代碼時,它給出了一個奇怪的 CUDA 錯誤,主要內容如下:

=> creating model 'alexnet' 
THCudaCheck FAIL file=/pytorch/torch/lib/THC/THCGeneral.c line=70 error=30 : unknown error 
Traceback (most recent call last): 
    File "imagenet2.py", line 319, in <module> 
    main() 
    File "imagenet2.py", line 87, in main 
    model.cuda() 
    File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 147, in cuda 
    return self._apply(lambda t: t.cuda(device_id)) 
    File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 118, in _apply 
    module._apply(fn) 
    File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 118, in _apply 
    module._apply(fn) 
    File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 118, in _apply 
    module._apply(fn) 
    File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 124, in _apply 
    param.data = fn(param.data) 
    File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 147, in <lambda> 
    return self._apply(lambda t: t.cuda(device_id)) 
    File "/usr/local/lib/python2.7/dist-packages/torch/_utils.py", line 66, in _cuda 
    return new_type(self.size()).copy_(self, async) 
    File "/usr/local/lib/python2.7/dist-packages/torch/cuda/__init__.py", line 266, in _lazy_new 
    _lazy_init() 
    File "/usr/local/lib/python2.7/dist-packages/torch/cuda/__init__.py", line 85, in _lazy_init 
    torch._C._cuda_init() 
RuntimeError: cuda runtime error (30) : unknown error at /pytorch/torch/lib/THC/THCGeneral.c:70 

用於運行代碼的命令:python imagenet.py --world-size 1 --arch 'alexnet' <image_folder>

我哪裏出錯了?

PS:在AWS上運行g2.2xlarge Ubuntu實例。

CUDA 版本如下:

nvcc: NVIDIA (R) Cuda compiler driver 
Copyright (c) 2005-2016 NVIDIA Corporation 
Built on Tue_Jan_10_13:22:03_CST_2017 
Cuda compilation tools, release 8.0, V8.0.61 

回答

1
  1. CUDNN 給出的錯誤消息沒有幫助。要進行調試,請使用 net.cpu() 在 CPU 上測試您的網絡,或者直接刪除 net.cuda()。您必須對訓練、驗證和輸出變量進行相同的操作。

  2. 問題似乎是,您在預訓練的 AlexNet 上使用的圖像大小不同於 224x224。根據文檔,只要圖像尺寸至少爲 224x224,它就應該可以工作。

  3. 這可能是由於 PyTorch 的 AlexNet 實現中的硬編碼參數導致的張量形狀(reshape)問題。在 vision/torchvision/models/alexnet.py 的第 44 行,代碼是:

x = x.view(x.size(0), 256 * 6 * 6) 

將其更改爲

x = x.view(x.size(0), -1) 

這樣它就應該能夠處理不同大小的圖像。

  4. 我已將此修改提交(submitted)到 GitHub 存儲庫,但我想它尚未更新。