CURAND_RNG_PSEUDO_XORWOW XORWOW

cuRAND

2015/07/22 GPGPU実践プログラミング

955



乱数の種の設定

curandStatus_t curandSetPseudoRandomGeneratorSeed

(curandGenerator_t generator, unsigned long long seed )



乱数の生成



一様乱数を生成

curandStatus_t curandGenerateUniform

(curandGenerator_t generator, float* outputPtr, size_t num )



コンパイルの際はオプションとして

-lcurand

を追加

#include <stdio.h>

#include <stdlib.h>

#include <math.h>

#include <curand.h>

#define N 1024

/*
 * curand.cu sample: generates N uniform random floats on the device with the
 * cuRAND host API, copies them back to the host and prints one value per line.
 * Link with -lcurand.
 *
 * Returns 0. Return codes of the CUDA/cuRAND calls are not inspected.
 */
int main(){
    int i;
    float *value, *value_d;   /* host buffer / device buffer */
    curandGenerator_t gen;

    /* Device buffer that receives the generated numbers. */
    cudaMalloc((void**)&value_d, N*sizeof(float));

    /* Create a pseudo-random generator and fill value_d with N values. */
    curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
    curandGenerateUniform(gen, value_d, N);

    /* Bring the results back to the host for printing. */
    value = (float *)malloc(N*sizeof(float));
    cudaMemcpy(value, value_d, N*sizeof(float), cudaMemcpyDeviceToHost);

    for(i=0;i<N;i++){
        /* value[] holds floats, so the conversion must be %f, not %d. */
        printf("%f\n",value[i]);
    }

    curandDestroyGenerator(gen);
    cudaFree(value_d);
    free(value);

    return 0;
}

cuRAND サンプル

curand.cu

cuRAND

 GPU

から乱数を生成するデバイス関数を呼ぶ場合



ヘッダファイルのインクルード

 #include<curand_kernel.h>



乱数生成の初期化



__device__ void curand_init(unsigned long long seed, unsigned long long subsequence, unsigned long long offset,

curandStateMRG32k3a_t* state )



乱数の生成



__device__ float curand_uniform(curandStateMtgp32_t* state )

GPGPU実践プログラミング

957 2015/07/22

#include<stdio.h>

#include<stdlib.h>

#include<curand_kernel.h>

#define N (1024)

/*
 * Writes N values obtained from curand_uniform into value[0..N-1].
 *
 * A single generator state is initialised with seed 1, subsequence 0 and
 * offset 0, and the array is filled sequentially. The kernel does not use
 * threadIdx/blockIdx, so every thread that runs it writes the whole array.
 *
 * value: output array with room for at least N floats.
 */
__global__ void random(float *value){
    const unsigned int seed = 1;
    curandState state;

    curand_init(seed, 0, 0, &state);

    float *out = value;
    for(int k = 0; k < N; ++k){
        *out++ = curand_uniform(&state);
    }
}

/*
 * curand_kernel.cu sample: launches the random kernel with one block of one
 * thread to fill a device buffer with N values, copies the buffer to the host
 * and prints one value per line.
 *
 * Returns 0. Return codes of the CUDA calls are not inspected.
 */
int main(){
    int i;
    float *value, *value_d;   /* host buffer / device buffer */

    cudaMalloc((void**)&value_d, N*sizeof(float));

    /* The kernel fills the array sequentially, hence the <<<1,1>>> launch. */
    random<<<1,1>>>(value_d);

    value = (float *)malloc(N*sizeof(float));
    cudaMemcpy(value, value_d, N*sizeof(float), cudaMemcpyDeviceToHost);

    for(i=0;i<N;i++){
        /* Newline escape restored; the yen sign printed literally. */
        printf("%f \n",value[i]);
    }

    cudaFree(value_d);
    free(value);

    return 0;
}

cuRAND サンプル

curand_kernel.cu

Thrust

2015/07/22 GPGPU実践プログラミング

959

 CUDA

用の並列アルゴリズムライブラリ

 C++

の標準テンプレートライブラリ

(STL)

とよく似た高水準なインタフェースを保有

 CUDA4.0

から

CUDA

本体に吸収

 https://developer.nvidia.com/thrust

#include<iostream>

#include<thrust/host_vector.h>

#include<thrust/device_vector.h>

#include<thrust/copy.h>

#include<thrust/sort.h>

/*
 * thrust_sort.cu sample: stores three floats in a host_vector, copies them
 * into a device_vector, sorts the device_vector with thrust::sort, copies the
 * result back and prints the three elements, one per line.
 */
int main(void) {
    thrust::host_vector<float> host_vec(3);
    thrust::device_vector<float> device_vec(3);

    // Initialise the host-side data.
    host_vec[0] = 1.1;
    host_vec[1] = 3.3;
    host_vec[2] = 2.2;

    // Host -> device.
    thrust::copy(host_vec.begin(), host_vec.end(), device_vec.begin());

    // Sort the device-side copy.
    thrust::sort(device_vec.begin(), device_vec.end());

    // Device -> host.
    thrust::copy(device_vec.begin(), device_vec.end(), host_vec.begin());

    for (size_t k = 0; k < host_vec.size(); ++k) {
        std::cout << host_vec[k] << std::endl;
    }

    return 0;
}

Thrust サンプル

thrust_sort.cu

Thrust サンプル

2015/07/22 GPGPU実践プログラミング

961



ヘッダのインクルード

 #include<thrust/host_vector.h>

 #include<thrust/device_vector.h>

 #include<thrust/copy.h>



メモリ確保

 host_vector CPU

にメモリを確保

 device_vector GPU

にメモリを確保

 CPU

側の配列の初期化と

GPU

へのコピー

 thrust::copy(host_vec.begin(), host_vec.end(),

device_vec.begin());

Thrust サンプル



データのソート

 thrust::sort(device_vec.begin(), device_vec.end());

 GPU

から読み戻し

 thrust::copy(device_vec.begin(),

device_vec.end(), host_vec.begin());

モンテカルロ法

2015/07/22 GPGPU実践プログラミング

963



乱数を用いて数値シミュレーションや数値計算を行う手法の総称



得られる解が必ずしも正しいとは保証されていない



円周率の計算



正方形の中に乱数を使って点を打ち，円の内側に入っている点を数えることで円の面積を求める



点の数が無限に多く，点の座標が重複しないなら面積（や円周率）が正しく求められる

モンテカルロ法による円周率計算



円の面積

$\pi r^2$



正方形の面積

$4r^2$



面積比

$a = \pi r^2 / 4r^2$



円周率

$\pi = 4a$



円と正方形の面積比を点の数で近似

2 r

r

#include<stdio.h>

#include<stdlib.h>

#include<math.h>

#include<curand.h>

#define N 1024

/*
 * montecarlo.cu sample: estimates pi by the Monte Carlo method.
 *
 * N x-coordinates and N y-coordinates are generated on the device with
 * cuRAND, copied to the host, and the points satisfying x*x + y*y <= 1 are
 * counted. The fraction of such points approximates the circle-to-square area
 * ratio a = pi/4, so pi is estimated as 4*inside/N and printed.
 * Link with -lcurand.
 *
 * Returns 0. Return codes of the CUDA/cuRAND calls are not inspected.
 */
int main(){
    int i,inside;
    float *x, *y, *x_d, *y_d;   /* host / device coordinate arrays */
    float pi;
    curandGenerator_t gen;

    cudaMalloc((void**)&x_d, N*sizeof(float));
    cudaMalloc((void**)&y_d, N*sizeof(float));

    curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);

    /* Each call is on its own line so the trailing comment cannot swallow
       the following statement. */
    curandGenerateUniform(gen, x_d, N); // x coordinates
    curandGenerateUniform(gen, y_d, N); // y coordinates

    x = (float *)malloc(N*sizeof(float));
    y = (float *)malloc(N*sizeof(float));

    cudaMemcpy(x, x_d, N*sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(y, y_d, N*sizeof(float), cudaMemcpyDeviceToHost);

    /* Count the points that fall inside the circle of radius 1. */
    inside=0;
    for(i=0;i<N;i++){
        if( (x[i]*x[i] + y[i]*y[i]) <=1.0f ) inside++;
    }

    pi = 4.0f*(float)inside/N;
    printf("%f\n",pi);

    curandDestroyGenerator(gen);
    cudaFree(x_d);
    cudaFree(y_d);
    free(x);
    free(y);

    return 0;
}

モンテカルロ法による円周率計算

2015/07/22 GPGPU実践プログラミング

965

montecarlo.cu

モンテカルロ法によって得られた円周率

点の数 円周率 相対誤差

2² 2.000000 0.36338

2³ 2.500000 0.204225

2⁴ 3.000000 0.04507

2⁵ 3.125000 0.005282

2⁶ 3.062500 0.025176

2⁷ 3.031250 0.035123

2⁸ 2.984375 0.050044

2⁹ 3.164062 0.007152

2¹⁰ 3.171875 0.009639

点の数 円周率 相対誤差

2¹¹ 3.160156 0.005909

2¹² 3.171875 0.009639

2¹³ 3.133301 0.002639

2¹⁴ 3.157227 0.004976

2¹⁵ 3.14917 0.002412

2¹⁶ 3.152649 0.003519

2¹⁷ 3.152374 0.003432

2¹⁸ 3.147675 0.001936

2¹⁹ 3.142609 0.000323

2²⁰ 3.142635 0.000332

モンテカルロ法によって得られた円周率

2015/07/22 GPGPU実践プログラミング

967

サンプル数

N

相対誤差

$O(N^{-0.5})$

その他 GPU 向けライブラリ

ドキュメント内今回の内容 CUDA 付属のライブラリ cublas 行列ベクトル積, 行列行列積 cusparse 行列格納形式 cufft 余弦波の FFT curand モンテカルロ法による円周率計算 Thrust 913 (ページ 43-57)

cuRAND



curandStatus_t curandSetPseudoRandomGeneratorSeed

(curandGenerator_t generator, unsigned long long seed )





curandStatus_t curandGenerateUniform

(curandGenerator_t generator, float* outputPtr, size_t num )



-lcurand

cuRAND サンプル

curand.cu

cuRAND

 GPU



 #include<curand_kernel.h>



__device__ void curand_init(unsigned long long seed, unsigned long long subsequence, unsigned long long offset,

curandStateMRG32k3a_t* state )



__device__ float curand_uniform(curandStateMtgp32_t* state )

cuRAND サンプル

curand_kernel.cu

Thrust

 CUDA

 C++

(STL)

 CUDA4.0

CUDA

 https://developer.nvidia.com/thrust

Thrust サンプル

thrust_sort.cu

Thrust サンプル



 #include<thrust/host_vector.h>

 #include<thrust/device_vector.h>

 #include<thrust/copy.h>



 host_vector CPU

 device_vector GPU

 CPU

GPU

 thrust::copy(host_vec.begin(), host_vec.end(),

device_vec.begin());

Thrust サンプル



 thrust::sort(device_vec.begin(), device_vec.end());

 GPU

 thrust::copy(device_vec.begin(),

device_vec.end(), host_vec.begin());

モンテカルロ法











モンテカルロ法による円周率計算



$\pi r^2$



$4r^2$



$a = \pi r^2 / 4r^2$



$\pi = 4a$



2 r

r

モンテカルロ法による円周率計算

montecarlo.cu

モンテカルロ法によって得られた円周率

2

2.000000 0.36338 2

2.000000 0.36338 2

2.500000 0.204225 2

3.000000 0.04507 2

3.125000 0.005282 2

3.062500 0.025176 2

3.031250 0.035123 2

__device__ void curand_init(unsigned long long seed, unsigned long long subsequence, unsigned long long offset, curandStateMRG32k3a_t* state )

__device__ float curand_uniform(curandStateMtgp32_t* state )

$\pi r^2$

$4r^2$

$a = \pi r^2 / 4r^2$