My __global__ function written in CUDA only runs the last block























Solved: sorry, it was my fault; I should have used atomicAdd(times, 1); instead of (*times)++; in the kernel function.



I call the kernel function like this:



dim3 Dg(blockSize, blockSize, blockSize);
dim3 Db(8, 8, 8);
voxelize<<<Dg, Db>>>();
cudaDeviceSynchronize();


But I found that my program only solved part of the problem, so I added printf() to my __global__ function voxelize() as in the following code:



__global__ void voxelize(){
    printf("the thread blockIdx.x %d, blockIdx.y %d blockIdx.z %d\n", blockIdx.x, blockIdx.y, blockIdx.z);
    unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int zIndex = blockDim.z * blockIdx.z + threadIdx.z;
    unsigned int i = zIndex * blockDim.x * blockDim.y + yIndex * blockDim.x + xIndex;
}


The output showed that only the last part of each dimension ran (that is, blockIdx.x is always 5, and only some of the blockIdx.z values range from 0 to 5). But I don't understand why; is there anything wrong with the way I call this kernel function?
My machine has a GTX 1050 Ti Max-Q and CUDA 10.





Afterwards, I passed a pointer to the kernel to count how many times it actually runs.



int blockSize = ceil(pow(triangles.size() /* 69664 */ / 512.0, 1.0 / 3));
dim3 Dg(blockSize, blockSize, blockSize);
dim3 Db(8, 8, 8);
int* times = new int(0);
int* gpu_times;
cudaMalloc((void **)&gpu_times, sizeof(int));
cudaMemcpy(gpu_times, times, sizeof(int), cudaMemcpyHostToDevice);
voxelize<<<Dg, Db>>>(gpu_times);
cudaDeviceSynchronize();
cudaMemcpy(times, gpu_times, sizeof(int), cudaMemcpyDeviceToHost);
std::cout << *times << std::endl;


The kernel is modified as follows:



__global__ void voxelize(int* times){
    (*times)++;
    printf("the thread blockIdx.x %d, blockIdx.y %d blockIdx.z %d\n", blockIdx.x, blockIdx.y, blockIdx.z);
    unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int zIndex = blockDim.z * blockIdx.z + threadIdx.z;
    unsigned int i = zIndex * blockDim.x * blockDim.y + yIndex * blockDim.x + xIndex;
}


The output (a screenshot in the original post) shows the counter only reaches 141, but it should be far more than 69664.
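
(For reference: with triangles.size() equal to 69664 as above, blockSize = ceil(pow(69664 / 512.0, 1.0 / 3)) = ceil(5.14...) = 6, so the launch is a 6x6x6 grid of 8x8x8 blocks, i.e. 216 * 512 = 110592 threads. A correctly incremented counter should therefore end up at 110592, not 141.)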





Sorry, it's my fault; I should use atomicAdd(times, 1); instead of (*times)++;.



But why does printf() only output part of the indices, as I described above?










c++ c cuda

asked Nov 9 at 11:05 by Forsworn, edited Nov 9 at 14:01




















  • What do you mean by "The output showed only the last part of each dimension runned"? – Matthieu Brucher, Nov 9 at 11:09

  • Sorry, I didn't describe it clearly; I have modified the question. – Forsworn, Nov 9 at 11:30















1 Answer

















Accepted answer (answered Nov 9 at 11:29 by Robin Thoni, edited Nov 17 at 11:50)










For your printf problem



You need to call cudaDeviceSynchronize() (error checking omitted for clarity), and you also need cudaDeviceSetLimit(...) if you produce a lot of printf output (which is the case here):



#include <stdio.h>

__global__ void voxelize(){
    printf("the thread blockIdx.x %d, blockIdx.y %d blockIdx.z %d\n", blockIdx.x, blockIdx.y, blockIdx.z);
    unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int zIndex = blockDim.z * blockIdx.z + threadIdx.z;
    unsigned int i = zIndex * blockDim.x * blockDim.y + yIndex * blockDim.x + xIndex;
}

int main()
{
    // Increase device printf buffer to 50 MiB
    cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 50*1024*1024);
    dim3 Dg(5, 5, 5);
    dim3 Db(8, 8, 8);
    voxelize<<<Dg, Db>>>();
    cudaDeviceSynchronize();

    return 0;
}


This will print something like:



the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
[...]


You can then check it like this:



# This will keep one line per block and count them, so 5*5*5 == 125
$ ./a.out | sort | uniq | wc -l
125

# This will output one line per thread and count them, so 5*5*5 * 8*8*8 == 64000
$ ./a.out | wc -l
64000
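
Error checking was omitted above for clarity; if the kernel silently fails to launch or aborts, you get no printf output at all. A minimal sketch of how you could check for launch and execution errors with the standard runtime API (illustrative, not part of the original code):

// Check both the launch itself and the kernel execution
voxelize<<<Dg, Db>>>();
cudaError_t launchErr = cudaGetLastError();        // errors from the launch configuration
cudaError_t syncErr   = cudaDeviceSynchronize();   // errors raised while the kernel ran
if (launchErr != cudaSuccess)
    printf("launch error: %s\n", cudaGetErrorString(launchErr));
if (syncErr != cudaSuccess)
    printf("execution error: %s\n", cudaGetErrorString(syncErr));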


For your count problem



You can't do that: (*times)++;. Many threads increment the same location at once, so you have a race condition and most increments are lost. You need to use atomic functions.
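
A minimal sketch of the fix, keeping the same kernel signature and just replacing the plain increment with an atomic one:

__global__ void voxelize(int* times){
    // every thread adds 1 without racing against the others
    atomicAdd(times, 1);
    // ... rest of the kernel unchanged ...
}

After cudaDeviceSynchronize() and the copy back to the host, *times should then equal the total number of launched threads (gridDim.x * gridDim.y * gridDim.z * blockDim.x * blockDim.y * blockDim.z).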





























  • I called it in my real code; the output I wanted is: the thread blockIdx.x 0, blockIdx.y 3 blockIdx.z 4 / the thread blockIdx.x 1, blockIdx.y 3 blockIdx.z 4 / the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4 / the thread blockIdx.x 3, blockIdx.y 3 blockIdx.z 4 / the thread blockIdx.x 4, blockIdx.y 3 blockIdx.z 4 – Forsworn, Nov 9 at 11:34

  • Maybe I misunderstand the CUDA thread model? I think the kernel function will run #threads times, and the index of each thread can be calculated from blockIdx, blockDim, threadIdx, etc. – Forsworn, Nov 9 at 11:39

  • You can refer to this gist to compute the thread index: gist.github.com/waltner/6888263df7bceaad9ffc8c1408f68e3c (see the index sketch after these comments). – Robin Thoni, Nov 9 at 11:45

  • Thank you, Robin. Now I know it's my homework algorithm rather than my machine that is wrong... crying :(. But I'm still wondering why it only outputs a part of the indices, as I described above? – Forsworn, Nov 9 at 11:54

  • I don't really know, have you checked for CUDA errors? – Robin Thoni, Nov 9 at 13:31
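
Following up on the gist linked in the comments: a sketch (not from the original answer) of how a unique global linear index is usually computed for a 3D grid of 3D blocks, which is what the i in voxelize appears to be aiming for. Note that the linearization has to use the extent of the whole grid (gridDim * blockDim), not just blockDim:

// Illustrative helper (hypothetical name), not from the original post
__device__ unsigned int globalLinearIndex()
{
    // global coordinates of this thread across the whole launch
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int z = blockIdx.z * blockDim.z + threadIdx.z;
    // total extent of the launch in threads along x and y
    unsigned int width  = gridDim.x * blockDim.x;
    unsigned int height = gridDim.y * blockDim.y;
    // row-major linearization over the whole launch, not just one block
    return (z * height + y) * width + x;
}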










