为什么我给tensor设置cuda后实际运行时间还是cpu
Why is the actual runtime still cpu after I set cuda for tensor
我想对自定义函数进行gpu加速,下面是原函数:
import numpy as np
def py_cpu_nms(dets, thresh):
"""Pure Python NMS baseline."""
x1 = dets[:, 0]
y1 = dets[:, 1]
x2 = dets[:, 2]
y2 = dets[:, 3]
scores = dets[:, 4]
areas = (x2 - x1 + 1) * (y2 - y1 + 1)
order = scores.argsort()[::-1]
keep = []
while order.size > 0:
i = order[0]
keep.append(i)
xx1 = np.maximum(x1[i], x1[order[1:]])
yy1 = np.maximum(y1[i], y1[order[1:]])
xx2 = np.minimum(x2[i], x2[order[1:]])
yy2 = np.minimum(y2[i], y2[order[1:]])
w = np.maximum(0.0, xx2 - xx1 + 1)
h = np.maximum(0.0, yy2 - yy1 + 1)
inter = w * h
ovr = inter / (areas[i] + areas[order[1:]] - inter)
inds = np.where(ovr <= thresh)[0]
order = order[inds + 1]
return keep
它会使用 CPU 来计算,但速度不够快所以我想直接用 PyTorch 加速它,我将其转换为 Torch 实现:
import numpy as np
import torch
def py_cpu_nms(dets, thresh):
"""Pure Python NMS baseline."""
dets = torch.from_numpy(dets)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
dets.cuda()
x1 = dets[:, 0]
y1 = dets[:, 1]
x2 = dets[:, 2]
y2 = dets[:, 3]
scores = dets[:, 4]
areas = ((x2 - x1 + 1) * (y2 - y1 + 1)).cuda()
order = torch.argsort(scores)
order = torch.flip(order, dims=[0])
keep = []
while order.size()[0] > 0:
i = order[0]
keep.append(i)
xx1 = torch.maximum(x1[i], x1[order[1:]]).cuda()
yy1 = torch.maximum(y1[i], y1[order[1:]]).cuda()
xx2 = torch.minimum(x2[i], x2[order[1:]]).cuda()
yy2 = torch.minimum(y2[i], y2[order[1:]]).cuda()
w = torch.maximum(torch.tensor(0.0), xx2 - xx1 + 1).cuda()
h = torch.maximum(torch.tensor(0.0), yy2 - yy1 + 1).cuda()
inter = (w * h).cuda()
ovr = inter / (areas[i] + areas[order[1:]] - inter)
inds = torch.where(ovr <= thresh)[0].cuda()
order = order[inds + 1].cuda()
return keep
但实际上,所有的计算仍然使用CPU,有谁知道为什么?
cuda()
没有in-place。将函数的第 3 行更改为 dets = dets.cuda()
.
我想对自定义函数进行gpu加速,下面是原函数:
import numpy as np
def py_cpu_nms(dets, thresh):
"""Pure Python NMS baseline."""
x1 = dets[:, 0]
y1 = dets[:, 1]
x2 = dets[:, 2]
y2 = dets[:, 3]
scores = dets[:, 4]
areas = (x2 - x1 + 1) * (y2 - y1 + 1)
order = scores.argsort()[::-1]
keep = []
while order.size > 0:
i = order[0]
keep.append(i)
xx1 = np.maximum(x1[i], x1[order[1:]])
yy1 = np.maximum(y1[i], y1[order[1:]])
xx2 = np.minimum(x2[i], x2[order[1:]])
yy2 = np.minimum(y2[i], y2[order[1:]])
w = np.maximum(0.0, xx2 - xx1 + 1)
h = np.maximum(0.0, yy2 - yy1 + 1)
inter = w * h
ovr = inter / (areas[i] + areas[order[1:]] - inter)
inds = np.where(ovr <= thresh)[0]
order = order[inds + 1]
return keep
它会使用 CPU 来计算,但速度不够快所以我想直接用 PyTorch 加速它,我将其转换为 Torch 实现:
import numpy as np
import torch
def py_cpu_nms(dets, thresh):
"""Pure Python NMS baseline."""
dets = torch.from_numpy(dets)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
dets.cuda()
x1 = dets[:, 0]
y1 = dets[:, 1]
x2 = dets[:, 2]
y2 = dets[:, 3]
scores = dets[:, 4]
areas = ((x2 - x1 + 1) * (y2 - y1 + 1)).cuda()
order = torch.argsort(scores)
order = torch.flip(order, dims=[0])
keep = []
while order.size()[0] > 0:
i = order[0]
keep.append(i)
xx1 = torch.maximum(x1[i], x1[order[1:]]).cuda()
yy1 = torch.maximum(y1[i], y1[order[1:]]).cuda()
xx2 = torch.minimum(x2[i], x2[order[1:]]).cuda()
yy2 = torch.minimum(y2[i], y2[order[1:]]).cuda()
w = torch.maximum(torch.tensor(0.0), xx2 - xx1 + 1).cuda()
h = torch.maximum(torch.tensor(0.0), yy2 - yy1 + 1).cuda()
inter = (w * h).cuda()
ovr = inter / (areas[i] + areas[order[1:]] - inter)
inds = torch.where(ovr <= thresh)[0].cuda()
order = order[inds + 1].cuda()
return keep
但实际上,所有的计算仍然使用CPU,有谁知道为什么?
cuda()
没有in-place。将函数的第 3 行更改为 dets = dets.cuda()
.