Waiting & Waiting: CUDA Note[4]="cast & align";


//----------------------------------------------------------------------------- 
__global__ void RCastTest0(unsigned int* rgTar, unsigned int* rgSrc)
{
    rgTar[0] = rgSrc[0];

    rgTar[1] = *(unsigned int*)((unsigned char*)rgSrc + 2);
}

//-----------------------------------------------------------------------------
void RCastTest()
{
    unsigned int  rgTestSrcHost[2] = {0x11223344, 0xaabbccdd};
    unsigned int  rgTestTarHost[2];
    unsigned int* rgTestSrcDevice = 0;
    unsigned int* rgTestTarDevice = 0;
    unsigned int* rgTestDevice = 0;

    ::cudaMalloc(&rgTestDevice, 4 * sizeof(unsigned int));

    rgTestTarDevice = rgTestDevice;
    rgTestSrcDevice = rgTestDevice + 2;

    ::cudaMemcpy(
        rgTestSrcDevice,
        rgTestSrcHost,
        2 * sizeof(unsigned int),
        cudaMemcpyHostToDevice);

    //--cast in cuda
    RCastTest0<<<1, 1>>>(rgTestTarDevice, rgTestSrcDevice);

    ::cudaMemcpy(
        rgTestTarHost,
        rgTestTarDevice,
        2 * sizeof(unsigned int),
        cudaMemcpyDeviceToHost);

    //--cast in cpu
    unsigned int dCasted = *(unsigned int*)((unsigned char*)(rgTestSrcHost) + 2);

    ::printf("CUDA (align)    : 0x%08X\n", rgTestTarHost[0]);
    ::printf("CUDA (un-align) : 0x%08X\n", rgTestTarHost[1]);
    ::printf("CPU  (un-align) : 0x%08X\n", dCasted);

    ::cudaFree(rgTestDevice);
}

A simple test of casting in cuda. The device memory is aligned when being allocated (to 256 byte). Everything is fine if you forget the optimized trick in C. For example, when make gray scale image fome a r8g8b8 one. You can get 3 u8 and calc the luminance, or get one u32 then calc with bit operation. This is OK in CPU, and should get better performance since access global memory is pretty slow in cuda. But you can`t do it like in cpu due to the result of this testing. When reading from global memory, cuda align internal reading address with sizeof casting type. (4 for u32, 2 for u16, etc.)

So there is a trick to do gray scale. If I just calc luma with reading 3 u8, every pixel need 4 global memory accessing (3 read, 1 write). But if I calc 4 pixels in one thread, I can read 3 u32 (and the first one is 4 byte aligned), the average read-write time would be 1.75 per pixel !

p.s.
output :
CUDA (align) : 0x11223344
CUDA (un-align) : 0x11223344
CPU (un-align) : 0xCCDD1122

Waiting & Waiting

Oct 30, 2009

CUDA Note[4]="cast & align";

No comments:

Category

Book Mark

ironhead

History

Friends