使用 cudaMemcpy2D 复制 1D 跨步数据的方法
Recipe to copy 1D strided data with cudaMemcpy2D
如果有两个连续的设备内存范围,则可以使用 cudaMemcpy
.
将内存从一个范围复制到另一个范围
double* source = ...
double* dest = ...
cudaMemcpy(dest, source, N, cudaMemcpyDeviceToDevice);
现在假设我想将源复制到目标中,但分别是每 2 或 3 个元素。
即dest[0] = source[0], dest[3] = source[2], dest[6] = source[4], ...
。
当然,一个简单的 cudaMemcpy
不能做到这一点。
直觉上,cudaMemcpy2D
应该可以完成这项工作,因为 "strided elements can be see as a column in a larger array"。
但是 cudaMemcpy2D
它有许多输入参数在这种情况下难以解释,例如 pitch
.
例如,我管理器使用 cudaMemcpy2D
重现两个步幅均为 1 的情况。
cudaMemcpy2D(dest, 1, source, 1, 1, n*sizeof(T), cudaMemcpyDeviceToHost);
但我无法弄清楚一般情况,dest_stride
和 source_stride
不同于 1.
有没有办法将跨步数据复制到 cudaMemcpy2D
跨步数据?
我必须按什么顺序放置有关布局的已知信息?即,根据两个步幅和 sizeof(T)
.
cudaMemcpy2D(dest, ??, source, ???, ????, ????, cudaMemcpyDeviceToHost);
是的,这是可以做到的。用代码比文字更容易说明:
#include <iostream>
int main()
{
const size_t swidth = 2;
const size_t sheight = 4;
size_t spitch = swidth * sizeof(int);
int source[swidth * sheight] = { 0, 1, 2, 3, 4, 5, 6, 7 };
const size_t dwidth = 3;
const size_t dheight = 4;
size_t dpitch = dwidth * sizeof(int);
int dest[dwidth * dheight] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
const size_t cwidth = 1 * sizeof(int);
const size_t cheight = 3;
int* source_d; cudaMalloc(&source_d, spitch * sheight);
cudaMemcpy(source_d, &source[0], spitch * sheight, cudaMemcpyHostToDevice);
cudaMemcpy2D(&dest[0], dpitch, source_d, spitch, cwidth, cheight, cudaMemcpyDeviceToHost);
for(int i=0; i < 12; i++) std::cout << i << " " << dest[i] << std::endl;
return 0;
}
这是做什么的:
$ nvcc -std=c++11 -arch=sm_52 -o strided_copy strided_copy.cu
$ cuda-memcheck ./strided_copy
========= CUDA-MEMCHECK
0 0
1 -1
2 -1
3 2
4 -1
5 -1
6 4
7 -1
8 -1
9 -1
10 -1
11 -1
========= ERROR SUMMARY: 0 errors
本质上,您是将宽度为 4 个字节(一个整数)、步幅为 8 个字节(两个整数)的复制到一个步幅为 12 个字节(三个整数)的目标中。我只复制了三个 rwos,以便清楚行参数的工作原理。调整复制元素的大小和步幅等。
这种跨步复制的通用函数大致如下所示:
void cudaMemcpyStrided(
void *dst, int dstStride,
void *src, int srcStride,
int numElements, int elementSize, int kind) {
int srcPitchInBytes = srcStride * elementSize;
int dstPitchInBytes = dstStride * elementSize;
int width = 1 * elementSize;
int height = numElements;
cudaMemcpy2D(
dst, dstPitchInBytes,
src, srcPitchInBytes,
width, height,
kind);
}
对于你的例子,它可以被称为
cudaMemcpyStrided(dest, 3, source, 2, 3, sizeof(double), cudaMemcpyDeviceToDevice);
"Roughly",因为我只是从我测试它的(Java/JCuda 基础)代码中即时翻译它:
import static jcuda.runtime.JCuda.cudaMemcpy2D;
import java.util.Arrays;
import java.util.Locale;
import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.runtime.cudaMemcpyKind;
public class JCudaStridedMemcopy {
public static void main(String[] args) {
int dstLength = 9;
int srcLength = 6;
int dstStride = 3;
int srcStride = 2;
int numElements = 3;
runExample(dstLength, dstStride, srcLength, srcStride, numElements);
dstLength = 9;
srcLength = 12;
dstStride = 3;
srcStride = 4;
numElements = 3;
runExample(dstLength, dstStride, srcLength, srcStride, numElements);
dstLength = 18;
srcLength = 12;
dstStride = 3;
srcStride = 2;
numElements = 6;
runExample(dstLength, dstStride, srcLength, srcStride, numElements);
}
private static void runExample(int dstLength, int dstStride, int srcLength, int srcStride, int numElements) {
double dst[] = new double[dstLength];
double src[] = new double[srcLength];
for (int i = 0; i < src.length; i++) {
src[i] = i;
}
cudaMemcpyStrided(dst, dstStride, src, srcStride, numElements);
System.out.println("Copy " + numElements + " elements");
System.out.println(" to array with length " + dstLength + ", with a stride of " + dstStride);
System.out.println(" from array with length " + srcLength + ", with a stride of " + srcStride);
System.out.println("");
System.out.println("Destination:");
System.out.println(toString2D(dst, dstStride));
System.out.println("Flat: " + Arrays.toString(dst));
System.out.println("");
System.out.println("Source:");
System.out.println(toString2D(src, srcStride));
System.out.println("Flat: " + Arrays.toString(src));
System.out.println("");
System.out.println("Done");
System.out.println("");
}
private static void cudaMemcpyStrided(double dst[], int dstStride, double src[], int srcStride, int numElements) {
long srcPitchInBytes = srcStride * Sizeof.DOUBLE;
long dstPitchInBytes = dstStride * Sizeof.DOUBLE;
long width = 1 * Sizeof.DOUBLE;
long height = numElements;
cudaMemcpy2D(
Pointer.to(dst), dstPitchInBytes,
Pointer.to(src), srcPitchInBytes,
width, height,
cudaMemcpyKind.cudaMemcpyHostToHost);
}
public static String toString2D(double[] a, long columns) {
String format = "%4.1f ";
;
StringBuilder sb = new StringBuilder();
for (int i = 0; i < a.length; i++) {
if (i > 0 && i % columns == 0) {
sb.append("\n");
}
sb.append(String.format(Locale.ENGLISH, format, a[i]));
}
return sb.toString();
}
}
根据 examples/test 个案例,了解函数的作用,输出如下:
Copy 3 elements
to array with length 9, with a stride of 3
from array with length 6, with a stride of 2
Destination:
0.0 0.0 0.0
2.0 0.0 0.0
4.0 0.0 0.0
Flat: [0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 4.0, 0.0, 0.0]
Source:
0.0 1.0
2.0 3.0
4.0 5.0
Flat: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
Done
Copy 3 elements
to array with length 9, with a stride of 3
from array with length 12, with a stride of 4
Destination:
0.0 0.0 0.0
4.0 0.0 0.0
8.0 0.0 0.0
Flat: [0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 8.0, 0.0, 0.0]
Source:
0.0 1.0 2.0 3.0
4.0 5.0 6.0 7.0
8.0 9.0 10.0 11.0
Flat: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0]
Done
Copy 6 elements
to array with length 18, with a stride of 3
from array with length 12, with a stride of 2
Destination:
0.0 0.0 0.0
2.0 0.0 0.0
4.0 0.0 0.0
6.0 0.0 0.0
8.0 0.0 0.0
10.0 0.0 0.0
Flat: [0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 4.0, 0.0, 0.0, 6.0, 0.0, 0.0, 8.0, 0.0, 0.0, 10.0, 0.0, 0.0]
Source:
0.0 1.0
2.0 3.0
4.0 5.0
6.0 7.0
8.0 9.0
10.0 11.0
Flat: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0]
Done
如果有两个连续的设备内存范围,则可以使用 cudaMemcpy
.
double* source = ...
double* dest = ...
cudaMemcpy(dest, source, N, cudaMemcpyDeviceToDevice);
现在假设我想将源复制到目标中,但分别是每 2 或 3 个元素。
即dest[0] = source[0], dest[3] = source[2], dest[6] = source[4], ...
。
当然,一个简单的 cudaMemcpy
不能做到这一点。
直觉上,cudaMemcpy2D
应该可以完成这项工作,因为 "strided elements can be see as a column in a larger array"。
但是 cudaMemcpy2D
它有许多输入参数在这种情况下难以解释,例如 pitch
.
例如,我管理器使用 cudaMemcpy2D
重现两个步幅均为 1 的情况。
cudaMemcpy2D(dest, 1, source, 1, 1, n*sizeof(T), cudaMemcpyDeviceToHost);
但我无法弄清楚一般情况,dest_stride
和 source_stride
不同于 1.
有没有办法将跨步数据复制到 cudaMemcpy2D
跨步数据?
我必须按什么顺序放置有关布局的已知信息?即,根据两个步幅和 sizeof(T)
.
cudaMemcpy2D(dest, ??, source, ???, ????, ????, cudaMemcpyDeviceToHost);
是的,这是可以做到的。用代码比文字更容易说明:
#include <iostream>
int main()
{
const size_t swidth = 2;
const size_t sheight = 4;
size_t spitch = swidth * sizeof(int);
int source[swidth * sheight] = { 0, 1, 2, 3, 4, 5, 6, 7 };
const size_t dwidth = 3;
const size_t dheight = 4;
size_t dpitch = dwidth * sizeof(int);
int dest[dwidth * dheight] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
const size_t cwidth = 1 * sizeof(int);
const size_t cheight = 3;
int* source_d; cudaMalloc(&source_d, spitch * sheight);
cudaMemcpy(source_d, &source[0], spitch * sheight, cudaMemcpyHostToDevice);
cudaMemcpy2D(&dest[0], dpitch, source_d, spitch, cwidth, cheight, cudaMemcpyDeviceToHost);
for(int i=0; i < 12; i++) std::cout << i << " " << dest[i] << std::endl;
return 0;
}
这是做什么的:
$ nvcc -std=c++11 -arch=sm_52 -o strided_copy strided_copy.cu
$ cuda-memcheck ./strided_copy
========= CUDA-MEMCHECK
0 0
1 -1
2 -1
3 2
4 -1
5 -1
6 4
7 -1
8 -1
9 -1
10 -1
11 -1
========= ERROR SUMMARY: 0 errors
本质上,您是将宽度为 4 个字节(一个整数)、步幅为 8 个字节(两个整数)的复制到一个步幅为 12 个字节(三个整数)的目标中。我只复制了三个 rwos,以便清楚行参数的工作原理。调整复制元素的大小和步幅等。
这种跨步复制的通用函数大致如下所示:
void cudaMemcpyStrided(
void *dst, int dstStride,
void *src, int srcStride,
int numElements, int elementSize, int kind) {
int srcPitchInBytes = srcStride * elementSize;
int dstPitchInBytes = dstStride * elementSize;
int width = 1 * elementSize;
int height = numElements;
cudaMemcpy2D(
dst, dstPitchInBytes,
src, srcPitchInBytes,
width, height,
kind);
}
对于你的例子,它可以被称为
cudaMemcpyStrided(dest, 3, source, 2, 3, sizeof(double), cudaMemcpyDeviceToDevice);
"Roughly",因为我只是从我测试它的(Java/JCuda 基础)代码中即时翻译它:
import static jcuda.runtime.JCuda.cudaMemcpy2D;
import java.util.Arrays;
import java.util.Locale;
import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.runtime.cudaMemcpyKind;
public class JCudaStridedMemcopy {
public static void main(String[] args) {
int dstLength = 9;
int srcLength = 6;
int dstStride = 3;
int srcStride = 2;
int numElements = 3;
runExample(dstLength, dstStride, srcLength, srcStride, numElements);
dstLength = 9;
srcLength = 12;
dstStride = 3;
srcStride = 4;
numElements = 3;
runExample(dstLength, dstStride, srcLength, srcStride, numElements);
dstLength = 18;
srcLength = 12;
dstStride = 3;
srcStride = 2;
numElements = 6;
runExample(dstLength, dstStride, srcLength, srcStride, numElements);
}
private static void runExample(int dstLength, int dstStride, int srcLength, int srcStride, int numElements) {
double dst[] = new double[dstLength];
double src[] = new double[srcLength];
for (int i = 0; i < src.length; i++) {
src[i] = i;
}
cudaMemcpyStrided(dst, dstStride, src, srcStride, numElements);
System.out.println("Copy " + numElements + " elements");
System.out.println(" to array with length " + dstLength + ", with a stride of " + dstStride);
System.out.println(" from array with length " + srcLength + ", with a stride of " + srcStride);
System.out.println("");
System.out.println("Destination:");
System.out.println(toString2D(dst, dstStride));
System.out.println("Flat: " + Arrays.toString(dst));
System.out.println("");
System.out.println("Source:");
System.out.println(toString2D(src, srcStride));
System.out.println("Flat: " + Arrays.toString(src));
System.out.println("");
System.out.println("Done");
System.out.println("");
}
private static void cudaMemcpyStrided(double dst[], int dstStride, double src[], int srcStride, int numElements) {
long srcPitchInBytes = srcStride * Sizeof.DOUBLE;
long dstPitchInBytes = dstStride * Sizeof.DOUBLE;
long width = 1 * Sizeof.DOUBLE;
long height = numElements;
cudaMemcpy2D(
Pointer.to(dst), dstPitchInBytes,
Pointer.to(src), srcPitchInBytes,
width, height,
cudaMemcpyKind.cudaMemcpyHostToHost);
}
public static String toString2D(double[] a, long columns) {
String format = "%4.1f ";
;
StringBuilder sb = new StringBuilder();
for (int i = 0; i < a.length; i++) {
if (i > 0 && i % columns == 0) {
sb.append("\n");
}
sb.append(String.format(Locale.ENGLISH, format, a[i]));
}
return sb.toString();
}
}
根据 examples/test 个案例,了解函数的作用,输出如下:
Copy 3 elements
to array with length 9, with a stride of 3
from array with length 6, with a stride of 2
Destination:
0.0 0.0 0.0
2.0 0.0 0.0
4.0 0.0 0.0
Flat: [0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 4.0, 0.0, 0.0]
Source:
0.0 1.0
2.0 3.0
4.0 5.0
Flat: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
Done
Copy 3 elements
to array with length 9, with a stride of 3
from array with length 12, with a stride of 4
Destination:
0.0 0.0 0.0
4.0 0.0 0.0
8.0 0.0 0.0
Flat: [0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 8.0, 0.0, 0.0]
Source:
0.0 1.0 2.0 3.0
4.0 5.0 6.0 7.0
8.0 9.0 10.0 11.0
Flat: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0]
Done
Copy 6 elements
to array with length 18, with a stride of 3
from array with length 12, with a stride of 2
Destination:
0.0 0.0 0.0
2.0 0.0 0.0
4.0 0.0 0.0
6.0 0.0 0.0
8.0 0.0 0.0
10.0 0.0 0.0
Flat: [0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 4.0, 0.0, 0.0, 6.0, 0.0, 0.0, 8.0, 0.0, 0.0, 10.0, 0.0, 0.0]
Source:
0.0 1.0
2.0 3.0
4.0 5.0
6.0 7.0
8.0 9.0
10.0 11.0
Flat: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0]
Done