My __global__ function written in CUDA only runs the last block
up vote
-2
down vote
favorite
Solved: Sorry, it was my fault. I should use atomicAdd(times, 1);
instead of (*times)++
in the kernel function.
I call the kernel function like this
// 3D launch configuration: a blockSize x blockSize x blockSize grid of
// 8 x 8 x 8-thread blocks.
dim3 Dg(blockSize, blockSize, blockSize);
dim3 Db(8, 8, 8);

voxelize<<<Dg, Db>>>();

// Block the host until the kernel has finished.
cudaDeviceSynchronize();
But I found that my program only solved part of the problem, so I added a printf()
call to my __global__ function voxelize(),
as in the following code:
// Debug kernel: every thread prints the index of the block it belongs to.
// Launched with a 3D grid of 3D blocks; takes no arguments and writes no memory.
__global__ void voxelize(){
    // "\n" (not a literal 'n') terminates the record so each thread's output is its own line.
    printf("the thread blockIdx.x %d, blockIdx.y %d blockIdx.z %d\n", blockIdx.x, blockIdx.y, blockIdx.z);

    // Grid-wide coordinates of this thread along each axis.
    unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int zIndex = blockDim.z * blockIdx.z + threadIdx.z;

    // Flatten to a unique linear index. The row and slice pitch must be the full
    // grid extent (gridDim * blockDim); using blockDim alone makes threads from
    // different blocks map to the same value.
    unsigned int width  = gridDim.x * blockDim.x;
    unsigned int height = gridDim.y * blockDim.y;
    unsigned int i = (zIndex * height + yIndex) * width + xIndex;
    (void)i;  // not used further in this reduced example
}
The output showed that only the last part of each dimension ran (that is, blockIdx.x is always 5, and only some of the blockIdx.z values vary from 0 to 5). I don't understand why. Is there anything wrong with how I call this kernel function?
My computer is with the GTX1050Ti MaxQ and cuda 10.
Afterwards, I passed a pointer to the kernel to count how many times it runs.
// One thread per triangle: 8 x 8 x 8 = 512 threads per block, and a cubic grid
// whose edge is the cube root of the required block count, rounded up.
int blockSize = static_cast<int>(ceil(pow(triangles.size() /* 69664 */ / 512.0, 1.0 / 3)));
dim3 Dg(blockSize, blockSize, blockSize);
dim3 Db(8, 8, 8);

// Host counter and its device copy; the kernel increments the device copy.
int* times = new int(0);
int* gpu_times = nullptr;

// Each step runs only if every previous CUDA call succeeded, so the first
// failure is the one that gets reported.
cudaError_t voxelizeStatus = cudaMalloc((void **)&gpu_times, sizeof(int));
if (voxelizeStatus == cudaSuccess)
    voxelizeStatus = cudaMemcpy(gpu_times, times, sizeof(int), cudaMemcpyHostToDevice);
if (voxelizeStatus == cudaSuccess) {
    voxelize<<<Dg, Db>>>(gpu_times);
    voxelizeStatus = cudaGetLastError();        // launch-configuration errors
}
if (voxelizeStatus == cudaSuccess)
    voxelizeStatus = cudaDeviceSynchronize();   // errors raised while the kernel ran
if (voxelizeStatus == cudaSuccess)
    voxelizeStatus = cudaMemcpy(times, gpu_times, sizeof(int), cudaMemcpyDeviceToHost);

if (voxelizeStatus == cudaSuccess)
    std::cout << *times << std::endl;
else
    std::cerr << "CUDA error: " << cudaGetErrorString(voxelizeStatus) << std::endl;

// The device counter is no longer needed once its value has been copied back.
// `times` is left allocated because its value may still be read afterwards.
cudaFree(gpu_times);
the kernel is modified as
// Debug kernel: counts how many threads ran and prints each thread's block index.
// times: device pointer to a single int counter that every thread increments.
__global__ void voxelize(int* times){
    // Atomic read-modify-write. A plain (*times)++ lets concurrent threads
    // overwrite each other's increments, so the final count comes out far too low.
    atomicAdd(times, 1);

    // "\n" (not a literal 'n') terminates the record so each thread's output is its own line.
    printf("the thread blockIdx.x %d, blockIdx.y %d blockIdx.z %d\n", blockIdx.x, blockIdx.y, blockIdx.z);

    // Grid-wide coordinates of this thread along each axis.
    unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int zIndex = blockDim.z * blockIdx.z + threadIdx.z;

    // Flatten to a unique linear index. The row and slice pitch must be the full
    // grid extent (gridDim * blockDim); using blockDim alone makes threads from
    // different blocks map to the same value.
    unsigned int width  = gridDim.x * blockDim.x;
    unsigned int height = gridDim.y * blockDim.y;
    unsigned int i = (zIndex * height + yIndex) * width + xIndex;
    (void)i;  // not used further in this reduced example
}
the output is
The output shows that it ran 141 times, but in fact the count should be far more than 69664.
Sorry, it was my fault. I should use atomicAdd(times, 1);
instead of (*times)++
.
But why does printf()
output only some of the indices, as I described above?
c++ c cuda
add a comment |
up vote
-2
down vote
favorite
Solved: Sorry, it's my fault, I should use atomicAdd(times,1);
instead of *times++
in the kernel function.
I call the kernel function like this
// 3D launch configuration: a blockSize x blockSize x blockSize grid of
// 8 x 8 x 8-thread blocks.
dim3 Dg(blockSize, blockSize, blockSize);
dim3 Db(8, 8, 8);

voxelize<<<Dg, Db>>>();

// Block the host until the kernel has finished.
cudaDeviceSynchronize();
But I found that my program only solve the part of the problem, so I use printf()
in my global function voxelize ()
like the following code
// Debug kernel: every thread prints the index of the block it belongs to.
// Launched with a 3D grid of 3D blocks; takes no arguments and writes no memory.
__global__ void voxelize(){
    // "\n" (not a literal 'n') terminates the record so each thread's output is its own line.
    printf("the thread blockIdx.x %d, blockIdx.y %d blockIdx.z %d\n", blockIdx.x, blockIdx.y, blockIdx.z);

    // Grid-wide coordinates of this thread along each axis.
    unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int zIndex = blockDim.z * blockIdx.z + threadIdx.z;

    // Flatten to a unique linear index. The row and slice pitch must be the full
    // grid extent (gridDim * blockDim); using blockDim alone makes threads from
    // different blocks map to the same value.
    unsigned int width  = gridDim.x * blockDim.x;
    unsigned int height = gridDim.y * blockDim.y;
    unsigned int i = (zIndex * height + yIndex) * width + xIndex;
    (void)i;  // not used further in this reduced example
}
The output showed only the last part of each dimension runned( that is, the blockIdx.x is always 5, only some of the blockIndex.z are changing from 0 to 5).But I don't understand why, is there anything wrong when I call this kernel function?
My computer is with the GTX1050Ti MaxQ and cuda 10.
After, I passed a pointer to the kernel to monitor the running times.
// One thread per triangle: 8 x 8 x 8 = 512 threads per block, and a cubic grid
// whose edge is the cube root of the required block count, rounded up.
int blockSize = static_cast<int>(ceil(pow(triangles.size() /* 69664 */ / 512.0, 1.0 / 3)));
dim3 Dg(blockSize, blockSize, blockSize);
dim3 Db(8, 8, 8);

// Host counter and its device copy; the kernel increments the device copy.
int* times = new int(0);
int* gpu_times = nullptr;

// Each step runs only if every previous CUDA call succeeded, so the first
// failure is the one that gets reported.
cudaError_t voxelizeStatus = cudaMalloc((void **)&gpu_times, sizeof(int));
if (voxelizeStatus == cudaSuccess)
    voxelizeStatus = cudaMemcpy(gpu_times, times, sizeof(int), cudaMemcpyHostToDevice);
if (voxelizeStatus == cudaSuccess) {
    voxelize<<<Dg, Db>>>(gpu_times);
    voxelizeStatus = cudaGetLastError();        // launch-configuration errors
}
if (voxelizeStatus == cudaSuccess)
    voxelizeStatus = cudaDeviceSynchronize();   // errors raised while the kernel ran
if (voxelizeStatus == cudaSuccess)
    voxelizeStatus = cudaMemcpy(times, gpu_times, sizeof(int), cudaMemcpyDeviceToHost);

if (voxelizeStatus == cudaSuccess)
    std::cout << *times << std::endl;
else
    std::cerr << "CUDA error: " << cudaGetErrorString(voxelizeStatus) << std::endl;

// The device counter is no longer needed once its value has been copied back.
// `times` is left allocated because its value may still be read afterwards.
cudaFree(gpu_times);
the kernel is modified as
// Debug kernel: counts how many threads ran and prints each thread's block index.
// times: device pointer to a single int counter that every thread increments.
__global__ void voxelize(int* times){
    // Atomic read-modify-write. A plain (*times)++ lets concurrent threads
    // overwrite each other's increments, so the final count comes out far too low.
    atomicAdd(times, 1);

    // "\n" (not a literal 'n') terminates the record so each thread's output is its own line.
    printf("the thread blockIdx.x %d, blockIdx.y %d blockIdx.z %d\n", blockIdx.x, blockIdx.y, blockIdx.z);

    // Grid-wide coordinates of this thread along each axis.
    unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int zIndex = blockDim.z * blockIdx.z + threadIdx.z;

    // Flatten to a unique linear index. The row and slice pitch must be the full
    // grid extent (gridDim * blockDim); using blockDim alone makes threads from
    // different blocks map to the same value.
    unsigned int width  = gridDim.x * blockDim.x;
    unsigned int height = gridDim.y * blockDim.y;
    unsigned int i = (zIndex * height + yIndex) * width + xIndex;
    (void)i;  // not used further in this reduced example
}
the output is
the output shows it runs 141 times, but in fact, the output should be far more than 69664
sorry, it's my fault, I should use atomicAdd(times,1);
instead of *times++
.
But why does printf()
only output a part of the index as I described before?
c++ c cuda
1
What do you mean by "The output showed only the last part of each dimension runned"?
– Matthieu Brucher
Nov 9 at 11:09
Sorry, I didn't describe it clearly; I have edited the question.
– Forsworn
Nov 9 at 11:30
add a comment |
up vote
-2
down vote
favorite
up vote
-2
down vote
favorite
Solved: Sorry, it's my fault, I should use atomicAdd(times,1);
instead of *times++
in the kernel function.
I call the kernel function like this
// 3D launch configuration: a blockSize x blockSize x blockSize grid of
// 8 x 8 x 8-thread blocks.
dim3 Dg(blockSize, blockSize, blockSize);
dim3 Db(8, 8, 8);

voxelize<<<Dg, Db>>>();

// Block the host until the kernel has finished.
cudaDeviceSynchronize();
But I found that my program only solve the part of the problem, so I use printf()
in my global function voxelize ()
like the following code
// Debug kernel: every thread prints the index of the block it belongs to.
// Launched with a 3D grid of 3D blocks; takes no arguments and writes no memory.
__global__ void voxelize(){
    // "\n" (not a literal 'n') terminates the record so each thread's output is its own line.
    printf("the thread blockIdx.x %d, blockIdx.y %d blockIdx.z %d\n", blockIdx.x, blockIdx.y, blockIdx.z);

    // Grid-wide coordinates of this thread along each axis.
    unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int zIndex = blockDim.z * blockIdx.z + threadIdx.z;

    // Flatten to a unique linear index. The row and slice pitch must be the full
    // grid extent (gridDim * blockDim); using blockDim alone makes threads from
    // different blocks map to the same value.
    unsigned int width  = gridDim.x * blockDim.x;
    unsigned int height = gridDim.y * blockDim.y;
    unsigned int i = (zIndex * height + yIndex) * width + xIndex;
    (void)i;  // not used further in this reduced example
}
The output showed only the last part of each dimension runned( that is, the blockIdx.x is always 5, only some of the blockIndex.z are changing from 0 to 5).But I don't understand why, is there anything wrong when I call this kernel function?
My computer is with the GTX1050Ti MaxQ and cuda 10.
After, I passed a pointer to the kernel to monitor the running times.
// One thread per triangle: 8 x 8 x 8 = 512 threads per block, and a cubic grid
// whose edge is the cube root of the required block count, rounded up.
int blockSize = static_cast<int>(ceil(pow(triangles.size() /* 69664 */ / 512.0, 1.0 / 3)));
dim3 Dg(blockSize, blockSize, blockSize);
dim3 Db(8, 8, 8);

// Host counter and its device copy; the kernel increments the device copy.
int* times = new int(0);
int* gpu_times = nullptr;

// Each step runs only if every previous CUDA call succeeded, so the first
// failure is the one that gets reported.
cudaError_t voxelizeStatus = cudaMalloc((void **)&gpu_times, sizeof(int));
if (voxelizeStatus == cudaSuccess)
    voxelizeStatus = cudaMemcpy(gpu_times, times, sizeof(int), cudaMemcpyHostToDevice);
if (voxelizeStatus == cudaSuccess) {
    voxelize<<<Dg, Db>>>(gpu_times);
    voxelizeStatus = cudaGetLastError();        // launch-configuration errors
}
if (voxelizeStatus == cudaSuccess)
    voxelizeStatus = cudaDeviceSynchronize();   // errors raised while the kernel ran
if (voxelizeStatus == cudaSuccess)
    voxelizeStatus = cudaMemcpy(times, gpu_times, sizeof(int), cudaMemcpyDeviceToHost);

if (voxelizeStatus == cudaSuccess)
    std::cout << *times << std::endl;
else
    std::cerr << "CUDA error: " << cudaGetErrorString(voxelizeStatus) << std::endl;

// The device counter is no longer needed once its value has been copied back.
// `times` is left allocated because its value may still be read afterwards.
cudaFree(gpu_times);
the kernel is modified as
// Debug kernel: counts how many threads ran and prints each thread's block index.
// times: device pointer to a single int counter that every thread increments.
__global__ void voxelize(int* times){
    // Atomic read-modify-write. A plain (*times)++ lets concurrent threads
    // overwrite each other's increments, so the final count comes out far too low.
    atomicAdd(times, 1);

    // "\n" (not a literal 'n') terminates the record so each thread's output is its own line.
    printf("the thread blockIdx.x %d, blockIdx.y %d blockIdx.z %d\n", blockIdx.x, blockIdx.y, blockIdx.z);

    // Grid-wide coordinates of this thread along each axis.
    unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int zIndex = blockDim.z * blockIdx.z + threadIdx.z;

    // Flatten to a unique linear index. The row and slice pitch must be the full
    // grid extent (gridDim * blockDim); using blockDim alone makes threads from
    // different blocks map to the same value.
    unsigned int width  = gridDim.x * blockDim.x;
    unsigned int height = gridDim.y * blockDim.y;
    unsigned int i = (zIndex * height + yIndex) * width + xIndex;
    (void)i;  // not used further in this reduced example
}
the output is
the output shows it runs 141 times, but in fact, the output should be far more than 69664
sorry, it's my fault, I should use atomicAdd(times,1);
instead of *times++
.
But why does printf()
only output a part of the index as I described before?
c++ c cuda
Solved: Sorry, it's my fault, I should use atomicAdd(times,1);
instead of *times++
in the kernel function.
I call the kernel function like this
// 3D launch configuration: a blockSize x blockSize x blockSize grid of
// 8 x 8 x 8-thread blocks.
dim3 Dg(blockSize, blockSize, blockSize);
dim3 Db(8, 8, 8);

voxelize<<<Dg, Db>>>();

// Block the host until the kernel has finished.
cudaDeviceSynchronize();
But I found that my program only solve the part of the problem, so I use printf()
in my global function voxelize ()
like the following code
// Debug kernel: every thread prints the index of the block it belongs to.
// Launched with a 3D grid of 3D blocks; takes no arguments and writes no memory.
__global__ void voxelize(){
    // "\n" (not a literal 'n') terminates the record so each thread's output is its own line.
    printf("the thread blockIdx.x %d, blockIdx.y %d blockIdx.z %d\n", blockIdx.x, blockIdx.y, blockIdx.z);

    // Grid-wide coordinates of this thread along each axis.
    unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int zIndex = blockDim.z * blockIdx.z + threadIdx.z;

    // Flatten to a unique linear index. The row and slice pitch must be the full
    // grid extent (gridDim * blockDim); using blockDim alone makes threads from
    // different blocks map to the same value.
    unsigned int width  = gridDim.x * blockDim.x;
    unsigned int height = gridDim.y * blockDim.y;
    unsigned int i = (zIndex * height + yIndex) * width + xIndex;
    (void)i;  // not used further in this reduced example
}
The output showed only the last part of each dimension runned( that is, the blockIdx.x is always 5, only some of the blockIndex.z are changing from 0 to 5).But I don't understand why, is there anything wrong when I call this kernel function?
My computer is with the GTX1050Ti MaxQ and cuda 10.
After, I passed a pointer to the kernel to monitor the running times.
// One thread per triangle: 8 x 8 x 8 = 512 threads per block, and a cubic grid
// whose edge is the cube root of the required block count, rounded up.
int blockSize = static_cast<int>(ceil(pow(triangles.size() /* 69664 */ / 512.0, 1.0 / 3)));
dim3 Dg(blockSize, blockSize, blockSize);
dim3 Db(8, 8, 8);

// Host counter and its device copy; the kernel increments the device copy.
int* times = new int(0);
int* gpu_times = nullptr;

// Each step runs only if every previous CUDA call succeeded, so the first
// failure is the one that gets reported.
cudaError_t voxelizeStatus = cudaMalloc((void **)&gpu_times, sizeof(int));
if (voxelizeStatus == cudaSuccess)
    voxelizeStatus = cudaMemcpy(gpu_times, times, sizeof(int), cudaMemcpyHostToDevice);
if (voxelizeStatus == cudaSuccess) {
    voxelize<<<Dg, Db>>>(gpu_times);
    voxelizeStatus = cudaGetLastError();        // launch-configuration errors
}
if (voxelizeStatus == cudaSuccess)
    voxelizeStatus = cudaDeviceSynchronize();   // errors raised while the kernel ran
if (voxelizeStatus == cudaSuccess)
    voxelizeStatus = cudaMemcpy(times, gpu_times, sizeof(int), cudaMemcpyDeviceToHost);

if (voxelizeStatus == cudaSuccess)
    std::cout << *times << std::endl;
else
    std::cerr << "CUDA error: " << cudaGetErrorString(voxelizeStatus) << std::endl;

// The device counter is no longer needed once its value has been copied back.
// `times` is left allocated because its value may still be read afterwards.
cudaFree(gpu_times);
the kernel is modified as
// Debug kernel: counts how many threads ran and prints each thread's block index.
// times: device pointer to a single int counter that every thread increments.
__global__ void voxelize(int* times){
    // Atomic read-modify-write. A plain (*times)++ lets concurrent threads
    // overwrite each other's increments, so the final count comes out far too low.
    atomicAdd(times, 1);

    // "\n" (not a literal 'n') terminates the record so each thread's output is its own line.
    printf("the thread blockIdx.x %d, blockIdx.y %d blockIdx.z %d\n", blockIdx.x, blockIdx.y, blockIdx.z);

    // Grid-wide coordinates of this thread along each axis.
    unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int zIndex = blockDim.z * blockIdx.z + threadIdx.z;

    // Flatten to a unique linear index. The row and slice pitch must be the full
    // grid extent (gridDim * blockDim); using blockDim alone makes threads from
    // different blocks map to the same value.
    unsigned int width  = gridDim.x * blockDim.x;
    unsigned int height = gridDim.y * blockDim.y;
    unsigned int i = (zIndex * height + yIndex) * width + xIndex;
    (void)i;  // not used further in this reduced example
}
the output is
the output shows it runs 141 times, but in fact, the output should be far more than 69664
sorry, it's my fault, I should use atomicAdd(times,1);
instead of *times++
.
But why does printf()
only output a part of the index as I described before?
c++ c cuda
c++ c cuda
edited Nov 9 at 14:01
asked Nov 9 at 11:05
Forsworn
95
95
1
What do you mean by "The output showed only the last part of each dimension runned"?
– Matthieu Brucher
Nov 9 at 11:09
sorry, I don't describe it clearly, I have modified the question.
– Forsworn
Nov 9 at 11:30
add a comment |
1
What do you mean by "The output showed only the last part of each dimension runned"?
– Matthieu Brucher
Nov 9 at 11:09
sorry, I don't describe it clearly, I have modified the question.
– Forsworn
Nov 9 at 11:30
1
1
What do you mean by "The output showed only the last part of each dimension runned"?
– Matthieu Brucher
Nov 9 at 11:09
What do you mean by "The output showed only the last part of each dimension runned"?
– Matthieu Brucher
Nov 9 at 11:09
sorry, I don't describe it clearly, I have modified the question.
– Forsworn
Nov 9 at 11:30
sorry, I don't describe it clearly, I have modified the question.
– Forsworn
Nov 9 at 11:30
add a comment |
1 Answer
1
active
oldest
votes
up vote
0
down vote
accepted
For your printf problem
You need to call cudaDeviceSynchronize()
(error checking omitted for clarity), and you also need cudaDeviceSetLimit(...)
if you call printf a lot (which is the case here):
#include <stdio.h>
// Debug kernel: every thread prints the index of the block it belongs to.
// Launched with a 3D grid of 3D blocks; takes no arguments and writes no memory.
__global__ void voxelize(){
    // "\n" (not a literal 'n') terminates the record so each thread's output is its own line.
    printf("the thread blockIdx.x %d, blockIdx.y %d blockIdx.z %d\n", blockIdx.x, blockIdx.y, blockIdx.z);

    // Grid-wide coordinates of this thread along each axis.
    unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int zIndex = blockDim.z * blockIdx.z + threadIdx.z;

    // Flatten to a unique linear index. The row and slice pitch must be the full
    // grid extent (gridDim * blockDim); using blockDim alone makes threads from
    // different blocks map to the same value.
    unsigned int width  = gridDim.x * blockDim.x;
    unsigned int height = gridDim.y * blockDim.y;
    unsigned int i = (zIndex * height + yIndex) * width + xIndex;
    (void)i;  // not used further in this reduced example
}
// Minimal driver: enlarges the device printf buffer, launches voxelize on a
// 5 x 5 x 5 grid of 8 x 8 x 8-thread blocks, and waits for it to finish.
// Return codes of the CUDA calls are not inspected in this example.
int main()
{
    // Raise the device-side printf FIFO to 50 MiB so the output of all
    // 64000 threads fits.
    const size_t printfFifoBytes = 50 * 1024 * 1024;
    cudaDeviceSetLimit(cudaLimitPrintfFifoSize, printfFifoBytes);

    const dim3 blocksPerGrid(5, 5, 5);
    const dim3 threadsPerBlock(8, 8, 8);
    voxelize<<<blocksPerGrid, threadsPerBlock>>>();

    // Block the host until the kernel has completed.
    cudaDeviceSynchronize();

    return 0;
}
This will print something like:
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
[...]
You can then check it like this:
# This will keep one line per block and count them, so 5*5*5 == 125
$ ./a.out | sort | uniq | wc -l
125
# This will output one line per thread and count them, so 5*5*5 * 8*8*8 == 64000
$ ./a.out | wc -l
64000
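For your count problem
You can't do that: (*times)++;
. You'll have concurrency problems: every thread performs an unsynchronized read-modify-write on the same address, so increments get lost. You need to use atomic functions, e.g. atomicAdd(times, 1).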
I called it in my real code; the output I wanted is:
the thread blockIdx.x 0, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 1, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 3, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 4, blockIdx.y 3 blockIdx.z 4
– Forsworn
Nov 9 at 11:34
Maybe I misunderstand CUDA threads? I thought the kernel function runs once per thread, and that the index of each thread can be calculated from blockIdx, blockDim, threadIdx, etc.
– Forsworn
Nov 9 at 11:39
You can refer to this gist to compute the thread index: gist.github.com/waltner/6888263df7bceaad9ffc8c1408f68e3c
– Robin Thoni
Nov 9 at 11:45
Thank you, Robin. Now I know it's my homework algorithm rather than my machine that is wrong... crying :( . But I'm still wondering why it outputs only some of the indices, as I described above?
– Forsworn
Nov 9 at 11:54
I don't really know, have you checked for CUDA errors?
– Robin Thoni
Nov 9 at 13:31
|
show 2 more comments
1 Answer
1
active
oldest
votes
1 Answer
1
active
oldest
votes
active
oldest
votes
active
oldest
votes
up vote
0
down vote
accepted
For your printf problem
You need to call cudaDeviceSynchronize()
(error checking omitted for clarity)and you also need cudaDeviceSetLimit(...)
if you use a lot of printf (which is the case):
#include <stdio.h>
// Debug kernel: every thread prints the index of the block it belongs to.
// Launched with a 3D grid of 3D blocks; takes no arguments and writes no memory.
__global__ void voxelize(){
    // "\n" (not a literal 'n') terminates the record so each thread's output is its own line.
    printf("the thread blockIdx.x %d, blockIdx.y %d blockIdx.z %d\n", blockIdx.x, blockIdx.y, blockIdx.z);

    // Grid-wide coordinates of this thread along each axis.
    unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int zIndex = blockDim.z * blockIdx.z + threadIdx.z;

    // Flatten to a unique linear index. The row and slice pitch must be the full
    // grid extent (gridDim * blockDim); using blockDim alone makes threads from
    // different blocks map to the same value.
    unsigned int width  = gridDim.x * blockDim.x;
    unsigned int height = gridDim.y * blockDim.y;
    unsigned int i = (zIndex * height + yIndex) * width + xIndex;
    (void)i;  // not used further in this reduced example
}
// Minimal driver: enlarges the device printf buffer, launches voxelize on a
// 5 x 5 x 5 grid of 8 x 8 x 8-thread blocks, and waits for it to finish.
// Return codes of the CUDA calls are not inspected in this example.
int main()
{
    // Raise the device-side printf FIFO to 50 MiB so the output of all
    // 64000 threads fits.
    const size_t printfFifoBytes = 50 * 1024 * 1024;
    cudaDeviceSetLimit(cudaLimitPrintfFifoSize, printfFifoBytes);

    const dim3 blocksPerGrid(5, 5, 5);
    const dim3 threadsPerBlock(8, 8, 8);
    voxelize<<<blocksPerGrid, threadsPerBlock>>>();

    // Block the host until the kernel has completed.
    cudaDeviceSynchronize();

    return 0;
}
This will print something like:
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
[...]
You can then check it like this:
# This will keep one line per block and count them, so 5*5*5 == 125
$ ./a.out | sort | uniq | wc -l
125
# This will output one line per thread and count them, so 5*5*5 * 8*8*8 == 64000
$ ./a.out | wc -l
64000
For you count problem
You can't do that: (*times)++;
. You'll have a concurrency problems. You need to use atomic functions.
I called it in my real codes, the output I wanted isthe thread blockIdx.x 0, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 1, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 3, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 4, blockIdx.y 3 blockIdx.z 4
– Forsworn
Nov 9 at 11:34
maybe I misunderstands the cuda thread? I think that the kernel function will run #threads times, and the index of each thread can be calculated by blockIdx,blockDim,threadIdx, etc
– Forsworn
Nov 9 at 11:39
You can refer to this gist to compute the thread index: gist.github.com/waltner/6888263df7bceaad9ffc8c1408f68e3c
– Robin Thoni
Nov 9 at 11:45
Thank you, Robin. Now I know it's the my algorithm of homework rather than my machine which is wrong......crying :( . But I'm still wondering why it only outputs a part of the indices as I described above?
– Forsworn
Nov 9 at 11:54
I don't really know, have you checked for CUDA errors?
– Robin Thoni
Nov 9 at 13:31
|
show 2 more comments
up vote
0
down vote
accepted
For your printf problem
You need to call cudaDeviceSynchronize()
(error checking omitted for clarity)and you also need cudaDeviceSetLimit(...)
if you use a lot of printf (which is the case):
#include <stdio.h>
// Debug kernel: every thread prints the index of the block it belongs to.
// Launched with a 3D grid of 3D blocks; takes no arguments and writes no memory.
__global__ void voxelize(){
    // "\n" (not a literal 'n') terminates the record so each thread's output is its own line.
    printf("the thread blockIdx.x %d, blockIdx.y %d blockIdx.z %d\n", blockIdx.x, blockIdx.y, blockIdx.z);

    // Grid-wide coordinates of this thread along each axis.
    unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int zIndex = blockDim.z * blockIdx.z + threadIdx.z;

    // Flatten to a unique linear index. The row and slice pitch must be the full
    // grid extent (gridDim * blockDim); using blockDim alone makes threads from
    // different blocks map to the same value.
    unsigned int width  = gridDim.x * blockDim.x;
    unsigned int height = gridDim.y * blockDim.y;
    unsigned int i = (zIndex * height + yIndex) * width + xIndex;
    (void)i;  // not used further in this reduced example
}
// Minimal driver: enlarges the device printf buffer, launches voxelize on a
// 5 x 5 x 5 grid of 8 x 8 x 8-thread blocks, and waits for it to finish.
// Return codes of the CUDA calls are not inspected in this example.
int main()
{
    // Raise the device-side printf FIFO to 50 MiB so the output of all
    // 64000 threads fits.
    const size_t printfFifoBytes = 50 * 1024 * 1024;
    cudaDeviceSetLimit(cudaLimitPrintfFifoSize, printfFifoBytes);

    const dim3 blocksPerGrid(5, 5, 5);
    const dim3 threadsPerBlock(8, 8, 8);
    voxelize<<<blocksPerGrid, threadsPerBlock>>>();

    // Block the host until the kernel has completed.
    cudaDeviceSynchronize();

    return 0;
}
This will print something like:
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
[...]
You can then check it like this:
# This will keep one line per block and count them, so 5*5*5 == 125
$ ./a.out | sort | uniq | wc -l
125
# This will output one line per thread and count them, so 5*5*5 * 8*8*8 == 64000
$ ./a.out | wc -l
64000
For you count problem
You can't do that: (*times)++;
. You'll have a concurrency problems. You need to use atomic functions.
I called it in my real codes, the output I wanted isthe thread blockIdx.x 0, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 1, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 3, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 4, blockIdx.y 3 blockIdx.z 4
– Forsworn
Nov 9 at 11:34
maybe I misunderstands the cuda thread? I think that the kernel function will run #threads times, and the index of each thread can be calculated by blockIdx,blockDim,threadIdx, etc
– Forsworn
Nov 9 at 11:39
You can refer to this gist to compute the thread index: gist.github.com/waltner/6888263df7bceaad9ffc8c1408f68e3c
– Robin Thoni
Nov 9 at 11:45
Thank you, Robin. Now I know it's the my algorithm of homework rather than my machine which is wrong......crying :( . But I'm still wondering why it only outputs a part of the indices as I described above?
– Forsworn
Nov 9 at 11:54
I don't really know, have you checked for CUDA errors?
– Robin Thoni
Nov 9 at 13:31
|
show 2 more comments
up vote
0
down vote
accepted
up vote
0
down vote
accepted
For your printf problem
You need to call cudaDeviceSynchronize()
(error checking omitted for clarity)and you also need cudaDeviceSetLimit(...)
if you use a lot of printf (which is the case):
#include <stdio.h>
// Debug kernel: every thread prints the index of the block it belongs to.
// Launched with a 3D grid of 3D blocks; takes no arguments and writes no memory.
__global__ void voxelize(){
    // "\n" (not a literal 'n') terminates the record so each thread's output is its own line.
    printf("the thread blockIdx.x %d, blockIdx.y %d blockIdx.z %d\n", blockIdx.x, blockIdx.y, blockIdx.z);

    // Grid-wide coordinates of this thread along each axis.
    unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int zIndex = blockDim.z * blockIdx.z + threadIdx.z;

    // Flatten to a unique linear index. The row and slice pitch must be the full
    // grid extent (gridDim * blockDim); using blockDim alone makes threads from
    // different blocks map to the same value.
    unsigned int width  = gridDim.x * blockDim.x;
    unsigned int height = gridDim.y * blockDim.y;
    unsigned int i = (zIndex * height + yIndex) * width + xIndex;
    (void)i;  // not used further in this reduced example
}
// Minimal driver: enlarges the device printf buffer, launches voxelize on a
// 5 x 5 x 5 grid of 8 x 8 x 8-thread blocks, and waits for it to finish.
// Return codes of the CUDA calls are not inspected in this example.
int main()
{
    // Raise the device-side printf FIFO to 50 MiB so the output of all
    // 64000 threads fits.
    const size_t printfFifoBytes = 50 * 1024 * 1024;
    cudaDeviceSetLimit(cudaLimitPrintfFifoSize, printfFifoBytes);

    const dim3 blocksPerGrid(5, 5, 5);
    const dim3 threadsPerBlock(8, 8, 8);
    voxelize<<<blocksPerGrid, threadsPerBlock>>>();

    // Block the host until the kernel has completed.
    cudaDeviceSynchronize();

    return 0;
}
This will print something like:
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
[...]
You can then check it like this:
# This will keep one line per block and count them, so 5*5*5 == 125
$ ./a.out | sort | uniq | wc -l
125
# This will output one line per thread and count them, so 5*5*5 * 8*8*8 == 64000
$ ./a.out | wc -l
64000
For you count problem
You can't do that: (*times)++;
. You'll have a concurrency problems. You need to use atomic functions.
For your printf problem
You need to call cudaDeviceSynchronize()
(error checking omitted for clarity)and you also need cudaDeviceSetLimit(...)
if you use a lot of printf (which is the case):
#include <stdio.h>
// Debug kernel: every thread prints the index of the block it belongs to.
// Launched with a 3D grid of 3D blocks; takes no arguments and writes no memory.
__global__ void voxelize(){
    // "\n" (not a literal 'n') terminates the record so each thread's output is its own line.
    printf("the thread blockIdx.x %d, blockIdx.y %d blockIdx.z %d\n", blockIdx.x, blockIdx.y, blockIdx.z);

    // Grid-wide coordinates of this thread along each axis.
    unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int zIndex = blockDim.z * blockIdx.z + threadIdx.z;

    // Flatten to a unique linear index. The row and slice pitch must be the full
    // grid extent (gridDim * blockDim); using blockDim alone makes threads from
    // different blocks map to the same value.
    unsigned int width  = gridDim.x * blockDim.x;
    unsigned int height = gridDim.y * blockDim.y;
    unsigned int i = (zIndex * height + yIndex) * width + xIndex;
    (void)i;  // not used further in this reduced example
}
// Minimal driver: enlarges the device printf buffer, launches voxelize on a
// 5 x 5 x 5 grid of 8 x 8 x 8-thread blocks, and waits for it to finish.
// Return codes of the CUDA calls are not inspected in this example.
int main()
{
    // Raise the device-side printf FIFO to 50 MiB so the output of all
    // 64000 threads fits.
    const size_t printfFifoBytes = 50 * 1024 * 1024;
    cudaDeviceSetLimit(cudaLimitPrintfFifoSize, printfFifoBytes);

    const dim3 blocksPerGrid(5, 5, 5);
    const dim3 threadsPerBlock(8, 8, 8);
    voxelize<<<blocksPerGrid, threadsPerBlock>>>();

    // Block the host until the kernel has completed.
    cudaDeviceSynchronize();

    return 0;
}
This will print something like:
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
[...]
You can then check it like this:
# This will keep one line per block and count them, so 5*5*5 == 125
$ ./a.out | sort | uniq | wc -l
125
# This will output one line per thread and count them, so 5*5*5 * 8*8*8 == 64000
$ ./a.out | wc -l
64000
For you count problem
You can't do that: (*times)++;
. You'll have a concurrency problems. You need to use atomic functions.
edited Nov 17 at 11:50
answered Nov 9 at 11:29
Robin Thoni
796518
796518
I called it in my real codes, the output I wanted isthe thread blockIdx.x 0, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 1, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 3, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 4, blockIdx.y 3 blockIdx.z 4
– Forsworn
Nov 9 at 11:34
maybe I misunderstands the cuda thread? I think that the kernel function will run #threads times, and the index of each thread can be calculated by blockIdx,blockDim,threadIdx, etc
– Forsworn
Nov 9 at 11:39
You can refer to this gist to compute the thread index: gist.github.com/waltner/6888263df7bceaad9ffc8c1408f68e3c
– Robin Thoni
Nov 9 at 11:45
Thank you, Robin. Now I know it's the my algorithm of homework rather than my machine which is wrong......crying :( . But I'm still wondering why it only outputs a part of the indices as I described above?
– Forsworn
Nov 9 at 11:54
I don't really know, have you checked for CUDA errors?
– Robin Thoni
Nov 9 at 13:31
|
show 2 more comments
I called it in my real codes, the output I wanted isthe thread blockIdx.x 0, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 1, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 3, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 4, blockIdx.y 3 blockIdx.z 4
– Forsworn
Nov 9 at 11:34
maybe I misunderstands the cuda thread? I think that the kernel function will run #threads times, and the index of each thread can be calculated by blockIdx,blockDim,threadIdx, etc
– Forsworn
Nov 9 at 11:39
You can refer to this gist to compute the thread index: gist.github.com/waltner/6888263df7bceaad9ffc8c1408f68e3c
– Robin Thoni
Nov 9 at 11:45
Thank you, Robin. Now I know it's the my algorithm of homework rather than my machine which is wrong......crying :( . But I'm still wondering why it only outputs a part of the indices as I described above?
– Forsworn
Nov 9 at 11:54
I don't really know, have you checked for CUDA errors?
– Robin Thoni
Nov 9 at 13:31
I called it in my real codes, the output I wanted is
the thread blockIdx.x 0, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 1, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 3, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 4, blockIdx.y 3 blockIdx.z 4
– Forsworn
Nov 9 at 11:34
I called it in my real codes, the output I wanted is
the thread blockIdx.x 0, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 1, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 3, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 4, blockIdx.y 3 blockIdx.z 4
– Forsworn
Nov 9 at 11:34
maybe I misunderstands the cuda thread? I think that the kernel function will run #threads times, and the index of each thread can be calculated by blockIdx,blockDim,threadIdx, etc
– Forsworn
Nov 9 at 11:39
maybe I misunderstands the cuda thread? I think that the kernel function will run #threads times, and the index of each thread can be calculated by blockIdx,blockDim,threadIdx, etc
– Forsworn
Nov 9 at 11:39
You can refer to this gist to compute the thread index: gist.github.com/waltner/6888263df7bceaad9ffc8c1408f68e3c
– Robin Thoni
Nov 9 at 11:45
You can refer to this gist to compute the thread index: gist.github.com/waltner/6888263df7bceaad9ffc8c1408f68e3c
– Robin Thoni
Nov 9 at 11:45
Thank you, Robin. Now I know it's the my algorithm of homework rather than my machine which is wrong......crying :( . But I'm still wondering why it only outputs a part of the indices as I described above?
– Forsworn
Nov 9 at 11:54
Thank you, Robin. Now I know it's the my algorithm of homework rather than my machine which is wrong......crying :( . But I'm still wondering why it only outputs a part of the indices as I described above?
– Forsworn
Nov 9 at 11:54
I don't really know, have you checked for CUDA errors?
– Robin Thoni
Nov 9 at 13:31
I don't really know, have you checked for CUDA errors?
– Robin Thoni
Nov 9 at 13:31
|
show 2 more comments
Thanks for contributing an answer to Stack Overflow!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Some of your past answers have not been well-received, and you're in danger of being blocked from answering.
Please pay close attention to the following guidance:
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
// Page script: registers a callback with StackExchange.ready that passes the
// '#login-link' selector to helpers.onClickDraftSave.
// NOTE(review): presumably saves the answer draft when the login link is clicked — confirm.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
// Page script: registers a callback with StackExchange.ready that calls
// openid.initPostLogin with the '.new-post-login' selector, the URL-encoded
// address of this question's "new answer" anchor, and the 'question_page' tag.
// NOTE(review): presumably sets up the post-login return to this page — confirm.
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53224524%2fmy-global-function-written-in-cuda-only-run-the-last-block%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
// Page script: registers a callback with StackExchange.ready that passes the
// '#login-link' selector to helpers.onClickDraftSave.
// NOTE(review): presumably saves the answer draft when the login link is clicked — confirm.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
// Page script: registers a callback with StackExchange.ready that passes the
// '#login-link' selector to helpers.onClickDraftSave.
// NOTE(review): presumably saves the answer draft when the login link is clicked — confirm.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
// Page script: registers a callback with StackExchange.ready that passes the
// '#login-link' selector to helpers.onClickDraftSave.
// NOTE(review): presumably saves the answer draft when the login link is clicked — confirm.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
1
What do you mean by "The output showed only the last part of each dimension runned"?
– Matthieu Brucher
Nov 9 at 11:09
sorry, I don't describe it clearly, I have modified the question.
– Forsworn
Nov 9 at 11:30