PyCUDA 正确使用结构

PyCUDA using struct properly

我正在尝试在我的 Pycuda 代码中实现一个结构,但我遇到了越界错误。我尝试按照 this 教程进行操作,但无法使其适用于我的案例。

这个问题很可能是由于指针使用不当造成的,例如本教程显示必须分配指针 memsize 而不是数据 memsize。希望这里有人能给我一些见解...

示例代码:

#!/usr/bin/env python
#-*- coding:utf-8 -*-

import numpy as np
import pycuda.driver as cuda
import pycuda.tools as tools
import pycuda.autoinit

from mako.template import Template
from pycuda.compiler import SourceModule

src_template = Template(
"""
    struct Dist {
        %for s in xrange(ns):
        float *dist${s};
        %endfor
    };

    // return linear index based on x,y coordinate
    __device__ int get_index(int xcoord, int ycoord)
    {
        return ycoord + xcoord * ${ny};
    };

    __global__ void initialize(float *rho, float *ux, float *uy, Dist *ftmp)
    {
        int idx;
        float dens, velx, vely, vv, ev;

        for (int y = threadIdx.x + blockIdx.x * blockDim.x; 
                 y < ${ny}; 
                 y += blockDim.x * gridDim.x)   
        {   
            for (int x = threadIdx.y + blockIdx.y * blockDim.y; 
                 x < ${nx}; 
                 x += blockDim.y * gridDim.y) 
            {
                if ((x > 0) && (x < ${nx-1}) && (y > 0) && (y < ${ny-1}))
                {
                    idx = get_index(x,y);
                    dens = rho[idx]; velx = ux[idx]; vely = uy[idx];
                    vv = velx*velx + vely*vely;

                    %for s in xrange(ns):
                    // s = ${s}; \vec{e}[${s}] = [${ex[s]},${ey[s]}]
                    ev = ${float(ex[s])}f*velx + ${float(ey[s])}f*vely;
                    ftmp->dist${s}[idx] = ${w[s]}f*dens*(1.0f+3.0f*ev+4.5f*ev*ev-1.5f*vv);
                    %endfor
                }
            }
        }
    }
"""
)

class channelFlow:
    # initialize channelFlow
    def __init__(self, nx, ny):
        self.nx, self.ny = nx, ny

        max_threads_per_block = tools.DeviceData().max_threads
        self.blocksize = (ny if ny<32 else 32, nx if nx<32 else 32, 1)  # threads per block
        self.gridsize = (ny/self.blocksize[0], nx/self.blocksize[1], 1) # blocks per grid

        self.ns = 9
        self.w = np.array([4./9, 1./9, 1./9, 1./9, 1./9, 1./36, 1./36, 1./36, 1./36])
        self.ex = np.array([0, 1, -1, 0, 0, 1, -1, -1, 1])
        self.ey = np.array([0, 0, 0, 1, -1, 1, 1, -1, -1])

        self.ctx = { 'nx': self.nx, 'ny': self.ny, 'ns': self.ns,
                     'w': self.w, 'ex': self.ex, 'ey': self.ey
                     }

        dtype = np.float32
        self.ftmp = np.zeros([self.nx,self.ny,self.ns]).astype(dtype)
        self.rho  = np.zeros([self.nx,self.ny]).astype(dtype)
        self.ux   = np.zeros([self.nx,self.ny]).astype(dtype)
        self.uy   = np.zeros([self.nx,self.ny]).astype(dtype)

        self.ftmp_gpu = cuda.mem_alloc(self.ftmp.nbytes)
        self.rho_gpu  = cuda.mem_alloc(self.rho.nbytes)
        self.ux_gpu   = cuda.mem_alloc(self.ux.nbytes)
        self.uy_gpu   = cuda.mem_alloc(self.uy.nbytes)

    def run(self):
        src = src_template.render(**self.ctx)
        code = SourceModule(src)
        initialize  = code.get_function('initialize')

        self.rho[:,:] = 1.
        self.ux[:,:] = 0.
        self.uy[:,:] = 0.

        cuda.memcpy_htod(self.rho_gpu, self.rho)
        cuda.memcpy_htod(self.ux_gpu, self.ux)
        cuda.memcpy_htod(self.uy_gpu, self.uy)

        initialize(
            self.rho_gpu, self.ux_gpu, self.uy_gpu,
            self.ftmp_gpu, 
            block=self.blocksize, grid=self.gridsize
            )

if __name__ == "__main__":
    sim = channelFlow(64,64); sim.run()

我能够使用可用的 GPUStruct python 模块 here 并通过修复指针的不当使用,在 pycuda 中正确实现数组结构,例如:

ftmp->dist${s}[idx] = ${w[s]}f*dens*(1.0f+3.0f*ev+4.5f*ev*ev-1.5f*vv);

已更改为:

float *ftmp${s}_ptr = ftmp->dist${s};
ftmp${s}_ptr[idx] = ${w[s]}f*dens*(1.0f+3.0f*ev+4.5f*ev*ev-1.5f*vv);

显示 GPUStruct 实现细节的修订代码:

#!/usr/bin/env python
#-*- coding:utf-8 -*-

import numpy as np
import pycuda.driver as cuda
import pycuda.tools as tools
import pycuda.autoinit

from gpu_struct import GPUStruct
from mako.template import Template
from pycuda.compiler import SourceModule

src_template = Template(
"""
    struct Dist {
        %for s in xrange(ns):
        float *dist${s};
        %endfor
    };

    // return linear index based on x,y coordinate
    __device__ int get_index(int xcoord, int ycoord)
    {
        return ycoord + xcoord * ${ny};
    };

    __global__ void initialize(float *rho, float *ux, float *uy, Dist *ftmp)
    {
        int idx;
        float dens, velx, vely, vv, ev;

        for (int y = threadIdx.x + blockIdx.x * blockDim.x; 
                 y < ${ny}; 
                 y += blockDim.x * gridDim.x)   
        {   
            for (int x = threadIdx.y + blockIdx.y * blockDim.y; 
                 x < ${nx}; 
                 x += blockDim.y * gridDim.y) 
            {
                if ((x > 0) && (x < ${nx-1}) && (y > 0) && (y < ${ny-1}))
                {
                    idx = get_index(x,y);
                    dens = rho[idx]; velx = ux[idx]; vely = uy[idx];
                    vv = velx*velx + vely*vely;

                    %for s in xrange(ns):
                    // s = ${s}; \vec{e}[${s}] = [${ex[s]},${ey[s]}]
                    float *ftmp${s}_ptr1 = ftmp->dist${s};
                    ev = ${float(ex[s])}f*velx + ${float(ey[s])}f*vely;
                    ftmp${s}_ptr1[idx] = ${w[s]}f*dens*(1.0f+3.0f*ev+4.5f*ev*ev-1.5f*vv);                   
                    %endfor
                }
            }
        }
    }
"""
)

class channelFlow:
    # initialize channelFlow
    def __init__(self, nx, ny):
        self.nx, self.ny = nx, ny

        max_threads_per_block = tools.DeviceData().max_threads
        self.blocksize = (ny if ny<32 else 32, nx if nx<32 else 32, 1)  # threads per block
        self.gridsize = (ny/self.blocksize[0], nx/self.blocksize[1], 1) # blocks per grid

        self.ns = 9
        self.w = np.array([4./9, 1./9, 1./9, 1./9, 1./9, 1./36, 1./36, 1./36, 1./36])
        self.ex = np.array([0, 1, -1, 0, 0, 1, -1, -1, 1])
        self.ey = np.array([0, 0, 0, 1, -1, 1, 1, -1, -1])

        self.ctx = { 'nx': self.nx, 'ny': self.ny, 'ns': self.ns,
                     'w': self.w, 'ex': self.ex, 'ey': self.ey
                     }

        dtype = np.float32
        self.ftmp = np.zeros([self.nx,self.ny,self.ns]).astype(dtype)
        self.rho  = np.zeros([self.nx,self.ny]).astype(dtype)
        self.ux   = np.zeros([self.nx,self.ny]).astype(dtype)
        self.uy   = np.zeros([self.nx,self.ny]).astype(dtype)

        self.ftmp_gpu = GPUStruct([
            (np.float32,'*dist0', self.ftmp[:,:,0]),
            (np.float32,'*dist1', self.ftmp[:,:,1]),
            (np.float32,'*dist2', self.ftmp[:,:,2]),
            (np.float32,'*dist3', self.ftmp[:,:,3]),
            (np.float32,'*dist4', self.ftmp[:,:,4]),
            (np.float32,'*dist5', self.ftmp[:,:,5]),
            (np.float32,'*dist6', self.ftmp[:,:,6]),
            (np.float32,'*dist7', self.ftmp[:,:,7]),
            (np.float32,'*dist8', self.ftmp[:,:,8])
            ])
        self.rho_gpu  = cuda.mem_alloc(self.rho.nbytes)
        self.ux_gpu   = cuda.mem_alloc(self.ux.nbytes)
        self.uy_gpu   = cuda.mem_alloc(self.uy.nbytes)

    def run(self):
        src = src_template.render(**self.ctx)
        code = SourceModule(src)
        initialize  = code.get_function('initialize')

        self.rho[:,:] = 1.
        self.ux[:,:] = 0.
        self.uy[:,:] = 0.

        self.ftmp_gpu.copy_to_gpu()
        cuda.memcpy_htod(self.rho_gpu, self.rho)
        cuda.memcpy_htod(self.ux_gpu, self.ux)
        cuda.memcpy_htod(self.uy_gpu, self.uy)

        initialize(
            self.rho_gpu, self.ux_gpu, self.uy_gpu,
            self.ftmp_gpu.get_ptr(), 
            block=self.blocksize, grid=self.gridsize
            )

        self.dens = np.zeros_like(self.rho)
        cuda.memcpy_dtoh(self.dens, self.rho_gpu)
        print self.dens

if __name__ == "__main__":
    sim = channelFlow(64,64); sim.run()