IBM Minsky における
電力性能比検証報告書
(Deep Learning および HPC アプリケーション)
2017 年 1 月
Minsky の電力消費量の調査
(1) ディープラーニングデモプログラム(mnist)の実行
デモプログラム → 添付1
CPU 実行、シングル GPU 実行、2GPU 実行での計算時間と電力消費量を比較した。
〇計算時間
| | CPU 実行 | シングル GPU 実行 | 複数 GPU 実行(今回は 2GPU) |
|---|---|---|---|
| 計算時間(秒) | 4220 | 74 | 40 |
〇電力消費量
約 10 秒ごとに消費電力を表示
(Total Watt:総消費電力、GPU Watt:総電力のうち GPU の消費電力)
| Time Step | Chainer CPU: Total Watt (うち GPU Watt) | Chainer 1GPU: Total Watt (うち GPU Watt) | Chainer 2GPU: Total Watt (うち 2GPU 合計 Watt) |
|---|---|---|---|
| 10Sec | 708W (55W) | 744W (70W) | 816W (75W) |
| 20Sec | 636W (55W) | 744W (70W) | 768W (85W) |
| 30Sec | 708W (50W) | 744W (65W) | 792W (75W) |
| 40Sec | 636W (50W) | 744W (70W) | 612W (55W) |
| 50Sec | 624W (55W) | 756W (70W) | - |
| 60Sec | 720W (55W) | 744W (65W) | - |
| 70Sec | 600W (55W) | 720W (55W) | - |
| 80Sec | 636W (50W) | 720W (55W) | - |
| 90Sec | 600W (55W) | - | - |
| 100Sec | 600W (55W) | - | - |
| 110Sec | 612W (50W) | - | - |
| 120Sec | 612W (55W) | - | - |
| ・・・ | ・・・ | - | - |
※GPU 計算では GPU 消費電力以上に総電力が増加している。
※2GPU 実行は CPU 実行に比べて 100 倍以上のスピード(計算時間 4220 秒 → 40 秒、約 105 倍)。
(2) 行列ベクトル積の計算
行列(7,000 x 7,000) x ベクトル(1 列) の計算
CPU(20 スレッド)実行とシングル GPU 実行
CPU プログラム → 添付 2
GPU プログラム → 添付 3
| Time Step | 行列ベクトル積 CPU(20 スレッド): Total Watt (うち GPU Watt) | 行列ベクトル積(1GPU): Total Watt (うち GPU Watt) |
|---|---|---|
| 10Sec | 984W (50W) | 744W (65W) |
| 20Sec | 972W (50W) | 732W (65W) |
| 30Sec | 936W (50W) | 744W (65W) |
| 40Sec | 912W (50W) | 744W (65W) |
| 50Sec | 888W (50W) | 744W (65W) |
| 60Sec | 864W (50W) | 744W (65W) |
| 70Sec | 924W (50W) | 744W (65W) |
| 80Sec | 792W (50W) | 744W (65W) |
| 90Sec | 948W (50W) | 744W (65W) |
| 100Sec | 972W (50W) | 744W (65W) |
| 110Sec | 768W (55W) | 736W (65W) |
| 120Sec | 802W (50W) | 744W (65W) |
| ・・・ | ・・・ | ・・・ |
※CPU 版のコンパイラ:pgi fortran、バージョン:16.10、オプション:pgfortran -O3 -mp
※GPU 版のコンパイラ:nvcc、バージョン:7.5、オプション:なし、ライブラリ:cublas 使用
〇計算性能と消費電力
| | CPU 実行 | GPU 実行 |
|---|---|---|
| 消費電力(12 タイムステップの平均値) | 約 897Watt | 約 743Watt |
| 計算性能 (GFlops) | 約 220GFlops | 約 1500GFlops |
| Watt 当たりの計算性能 | 0.25 GFlops/Watt | 2.0 GFlops/Watt |
(ご参考)京コンピュータ電力性能比 0.83GFlops/Watt
添付1 サンプルプログラム(mnist)
※ 実行時に --gpu オプション(args.gpu)へ 0 以上の GPU ID を指定すると、リスト 55 行目の分岐(`if args.gpu >= 0:`)により GPU を使うことになる。
1 #!/usr/bin/env python
2 from __future__ import print_function 3 import argparse
4
5 import chainer
6 import chainer.functions as F 7 import chainer.links as L 8 from chainer import training
9 from chainer.training import extensions 10
11
# Network definition
class MLP(chainer.Chain):
    """Multi-layer perceptron: two ReLU-activated layers and a linear output layer."""

    def __init__(self, n_units, n_out):
        # Passing None as the input size leaves it to be inferred by each
        # Linear link.
        super(MLP, self).__init__(
            l1=L.Linear(None, n_units),  # n_in -> n_units
            l2=L.Linear(None, n_units),  # n_units -> n_units
            l3=L.Linear(None, n_out),  # n_units -> n_out
        )

    def __call__(self, x):
        """Apply l1 and l2 with ReLU, then return the output of l3."""
        hidden = x
        for layer in (self.l1, self.l2):
            hidden = F.relu(layer(hidden))
        return self.l3(hidden)


def main():
    """Parse the command line, build the trainer for MNIST and run it."""
    cli = argparse.ArgumentParser(description='Chainer example: MNIST')
    cli.add_argument('--batchsize', '-b', type=int, default=100,
                     help='Number of images in each mini-batch')
    cli.add_argument('--epoch', '-e', type=int, default=20,
                     help='Number of sweeps over the dataset to train')
    cli.add_argument('--gpu', '-g', type=int, default=-1,
                     help='GPU ID (negative value indicates CPU)')
    cli.add_argument('--out', '-o', default='result',
                     help='Directory to output the result')
    cli.add_argument('--resume', '-r', default='',
                     help='Resume the training from snapshot')
    cli.add_argument('--unit', '-u', type=int, default=1000,
                     help='Number of units')
    args = cli.parse_args()

    # Echo the effective settings, followed by an empty line.
    banner = (
        ('GPU: {}', args.gpu),
        ('# unit: {}', args.unit),
        ('# Minibatch-size: {}', args.batchsize),
        ('# epoch: {}', args.epoch),
    )
    for template, value in banner:
        print(template.format(value))
    print('')

    # The Classifier link wraps the network and reports softmax cross entropy
    # loss and accuracy at every iteration; PrintReport below consumes them.
    model = L.Classifier(MLP(args.unit, 10))
    use_gpu = args.gpu >= 0
    if use_gpu:
        # Make the requested GPU current, then copy the model onto it.
        chainer.cuda.get_device(args.gpu).use()
        model.to_gpu()

    # Adam optimizer bound to the model
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    # MNIST dataset and the iterators that feed it
    train, test = chainer.datasets.get_mnist()
    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(
        test, args.batchsize, repeat=False, shuffle=False)

    # Trainer that runs for args.epoch epochs and writes into args.out
    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Evaluation on the test dataset
    trainer.extend(extensions.Evaluator(test_iter, model, device=args.gpu))

    # Computational graph dump rooted at the 'loss' variable; "main" is the
    # target link of the "main" optimizer.
    trainer.extend(extensions.dump_graph('main/loss'))

    # Snapshot, triggered every args.epoch epochs
    trainer.extend(extensions.snapshot(), trigger=(args.epoch, 'epoch'))

    # Log of evaluation statistics
    trainer.extend(extensions.LogReport())

    # Two plot images saved to the result directory
    plots = (
        (['main/loss', 'validation/main/loss'], 'loss.png'),
        (['main/accuracy', 'validation/main/accuracy'], 'accuracy.png'),
    )
    for y_keys, image_name in plots:
        trainer.extend(
            extensions.PlotReport(y_keys, 'epoch', file_name=image_name))

    # Selected log entries printed to stdout. "validation" is the default
    # name of the Evaluator extension; every entry except 'epoch' is reported
    # by the Classifier link when the updater or the evaluator calls it.
    report_columns = ['epoch', 'main/loss', 'validation/main/loss',
                      'main/accuracy', 'validation/main/accuracy',
                      'elapsed_time']
    trainer.extend(extensions.PrintReport(report_columns))

    # Progress bar on stdout
    trainer.extend(extensions.ProgressBar())

    if args.resume:
        # Restore the trainer state from the given snapshot file.
        chainer.serializers.load_npz(args.resume, trainer)

    # Run the training
    trainer.run()


if __name__ == '__main__':
    main()
添付2 行列ベクトル積(CPU マルチスレッド版)
※ リスト 27 行目から 33 行目(`!$OMP parallel do reduction(+:c)` の二重ループ)がカーネル部
1 use omp_lib
! Matrix-vector product benchmark (OpenMP), run as a normalised power
! iteration: c = A*b, s = ||c||_2, b = c/s, repeated nit times.
! The matrix order n is read from the first command-line argument.
! The layout (statements from column 7, labels in columns 1-5, "!"
! comments and "!$OMP" sentinels in column 1, no continuation lines)
! is accepted as both fixed-form and free-form source.
! Fixes relative to the original listing:
!   * c is zeroed before its first use (it was accumulated into while
!     still undefined on the first iteration);
!   * s is reset before every norm reduction (it used to carry the
!     previous iteration's norm into the next sum);
!   * nth is written by one thread only instead of by every thread;
!   * the no-op "!$OMP critical" around the serial counter is gone and
!     the goto loop is a counted do loop over the same 1..nit range;
!   * the flop count is evaluated in double precision so that 2*n*n
!     cannot overflow a default integer for large n.
      implicit double precision(a-h,o-z)
      integer, parameter :: nit=10000
      allocatable a(:,:),b(:),c(:)
      dimension toms(nit),tome(nit)
      character*32 buff
! Record the team size of a parallel region for the report line.
!$OMP parallel
!$OMP single
      nth=omp_get_num_threads()
!$OMP end single
!$OMP end parallel
      call getarg(1,buff)
      read(buff,*) n
      allocate(a(n,n),b(n),c(n))
! a(i,j) = 1/(i+j-1); the start vector b is all ones.
      do i=1,n
        do j=1,n
          a(i,j)=1.d0/dble(i+j-1)
        enddo
        b(i)=1.d0
      enddo
      c=0.0d0
      s=0.0d0
      t0=elaptime()
      do it=1,nit
        toms(it)=elaptime()
! Kernel: c = c + A*b. The columns j are shared out among the threads
! and the array reduction sums each thread's partial c.
!$OMP parallel do reduction(+:c)
        do j=1,n
          do i=1,n
            c(i)=c(i)+a(i,j)*b(j)
          enddo
        enddo
!$OMP end parallel do
        tome(it)=elaptime()
! Euclidean norm of c; s must start from zero for every iteration.
        s=0.0d0
!$OMP parallel do reduction(+:s)
        do i=1,n
          s=s+c(i)*c(i)
        enddo
!$OMP end parallel do
        s=dsqrt(s)
        if(mod(it,1000).eq.0)then
          write(6,*) it,s
!         write(6,*) c
        endif
! Normalise for the next iteration and clear the accumulator.
        b=c/s
        c=0.0d0
      enddo
      t1=elaptime()
! 2*n*n+4*n floating-point operations are counted per iteration.
      gflops=dble(nit)*(2.d0*dble(n)*dble(n)+4.d0*dble(n))
      gflops=gflops/(t1-t0)*1.d-9
      write(6,60) n,nth,t1-t0,gflops
 60   format("qaz",2i6,2f12.6)
! Per-iteration kernel times (tome-toms) go to unit 60.
      write(60,61) tome-toms
      write(6,*) "s =",s
 61   format(1pd12.6)
      stop
      end
添付3:行列ベクトル積(GPU 版)
1 // dgemm CUDA test public domain 2 #include <stdio.h>
3 #include <stdlib.h> 4 #include <math.h> 5 #include "cublas.h" 6 //Matlab/Octave format
/*
 * Print the N x M matrix A to stdout in Matlab/Octave bracket notation.
 * Element (row, col) is read from A[row + col * LDA]. Rows are wrapped in
 * "[ ... ]", separated by "; ", and the whole matrix is wrapped in one more
 * pair of brackets. No newline is written at the end.
 */
void printmat(int N, int M, double *A, int LDA)
{
    int row, col;

    printf("[ ");
    for (row = 0; row < N; ++row) {
        const int lastRow = (row >= N - 1);

        printf("[ ");
        for (col = 0; col < M; ++col) {
            const double value = A[row + col * LDA];

            printf("%5.2e", value);
            if (col != M - 1) {
                printf(", ");
            }
        }
        /* Every row but the last is closed with a "; " separator. */
        printf("%s", lastRow ? "] " : "]; ");
    }
    printf("]");
}
/* Wall-clock timer defined outside this file; presumably returns seconds
 * (its difference is printed under the label "sec=") -- TODO confirm. */
extern double elaptime(void);

/* Stop the program with a diagnostic when a legacy cuBLAS call failed. */
static void checkCublas(cublasStatus status, const char *what)
{
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cuBLAS failure (status %d) in %s\n", (int)status, what);
        exit(EXIT_FAILURE);
    }
}

/* Stop the program with a diagnostic when a CUDA runtime call failed. */
static void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA failure in %s: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

/* calloc() that never returns NULL: zero-filled memory or program exit. */
static void *checkedCalloc(size_t count, size_t size)
{
    void *p = calloc(count, size);
    if (p == NULL) {
        fprintf(stderr, "out of host memory (%lu x %lu bytes)\n",
                (unsigned long)count, (unsigned long)size);
        exit(EXIT_FAILURE);
    }
    return p;
}

/*
 * Benchmark: for each of nl independent problems, repeat nt times
 *     C = alpha * A * B + beta * C      (A is n x n, B and C are n x 2)
 * on the GPU with legacy cuBLAS, then normalise both columns of C on the
 * host and use them as the next B.
 *
 * Input:  argv[1] = nt (iterations), optional argv[2] = CUDA device id
 *         (default 2, the value hard-coded in the original listing);
 *         n and nl are read from stdin as two integers.
 * Output: timings, the last two column norms and the GFlops figure.
 *
 * Fixes relative to the original listing:
 *   - elemSize arguments were n*n / 2*n instead of sizeof(double);
 *   - s2 was never computed but was used as a divisor;
 *   - host C was uploaded uninitialised, and writes to index i+n overflowed
 *     the n*n-element B/C buffers when n == 1;
 *   - cublasFree() was given the pointer arrays instead of each device
 *     buffer, and delete[] was applied to malloc'ed memory;
 *   - argv[1], scanf() and every CUDA/cuBLAS status were unchecked;
 *   - the "\n" escapes had been mis-encoded as a yen sign plus "n";
 *   - the flop count was evaluated with int arithmetic that can overflow.
 */
int main(int argc, char *argv[])
{
    const double alpha = 1.0, beta = 1.0;
    int n = 0, nl = 0, nt, device;
    int i, j, l, it;
    double s1 = 0.0, s2 = 0.0, t1, t2, flop;
    float elapsed_time_ms = 0.0f;
    cudaEvent_t start, stop;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <nt> [device]   (n and nl are read from stdin)\n",
                argv[0]);
        return EXIT_FAILURE;
    }
    nt = atoi(argv[1]);
    device = (argc > 2) ? atoi(argv[2]) : 2;

    /* n is capped so that n*n still fits the int taken by cublasAlloc(). */
    if (scanf("%d %d", &n, &nl) != 2 || n <= 0 || n > 46340 || nl <= 0 || nt < 0) {
        fprintf(stderr, "expected on stdin: <n> <nl> with 0 < n <= 46340, nl > 0; and nt >= 0\n");
        return EXIT_FAILURE;
    }

    checkCuda(cudaSetDevice(device), "cudaSetDevice");
    checkCublas(cublasInit(), "cublasInit");

    const size_t matElems = (size_t)n * (size_t)n;
    const size_t vecElems = 2 * (size_t)n;          /* two columns of length n */
    const int elemSize = (int)sizeof(double);

    double **A = (double **)checkedCalloc((size_t)nl, sizeof(double *));
    double **B = (double **)checkedCalloc((size_t)nl, sizeof(double *));
    double **C = (double **)checkedCalloc((size_t)nl, sizeof(double *));
    double **devA = (double **)checkedCalloc((size_t)nl, sizeof(double *));
    double **devB = (double **)checkedCalloc((size_t)nl, sizeof(double *));
    double **devC = (double **)checkedCalloc((size_t)nl, sizeof(double *));

    for (l = 0; l < nl; l++) {
        A[l] = (double *)checkedCalloc(matElems, sizeof(double));
        B[l] = (double *)checkedCalloc(vecElems, sizeof(double));
        C[l] = (double *)checkedCalloc(vecElems, sizeof(double));   /* starts at zero */

        checkCublas(cublasAlloc(n * n, elemSize, (void **)&devA[l]), "cublasAlloc(A)");
        checkCublas(cublasAlloc(2 * n, elemSize, (void **)&devB[l]), "cublasAlloc(B)");
        checkCublas(cublasAlloc(2 * n, elemSize, (void **)&devC[l]), "cublasAlloc(C)");

        for (i = 0; i < n; i++) {
            for (j = 0; j < n; j++) {
                A[l][(size_t)i * n + j] = 1.e0 / (double)(i + j + 1 + l);
            }
            B[l][i] = (double)(2 * i + 2);          /* first column  */
            B[l][i + n] = (double)(2 * i + 3);      /* second column */
        }
    }

    printf("# start .\n");
    t1 = elaptime();
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");

    /* Initial host-to-device upload; it is part of the timed region. */
    for (l = 0; l < nl; l++) {
        checkCublas(cublasSetMatrix(n, n, elemSize, A[l], n, devA[l], n), "cublasSetMatrix(A)");
        checkCublas(cublasSetMatrix(n, 2, elemSize, B[l], n, devB[l], n), "cublasSetMatrix(B)");
        checkCublas(cublasSetMatrix(n, 2, elemSize, C[l], n, devC[l], n), "cublasSetMatrix(C)");
    }

    for (it = 0; it < nt; it++) {
        for (l = 0; l < nl; l++) {
            /* NOTE(review): listing line 71 survives only as the fragment
             * "n);". It is reconstructed as the DGEMM named in the file
             * header comment -- confirm against the original source. */
            cublasDgemm('N', 'N', n, 2, n, alpha, devA[l], n, devB[l], n,
                        beta, devC[l], n);
            checkCublas(cublasGetError(), "cublasDgemm");

            s1 = cublasDnrm2(n, devC[l], 1);        /* norm of column 1 */
            s2 = cublasDnrm2(n, devC[l] + n, 1);    /* norm of column 2 */
            checkCublas(cublasGetMatrix(n, 2, elemSize, devC[l], n, C[l], n),
                        "cublasGetMatrix(C)");

            for (i = 0; i < n; i++) {
                B[l][i] = C[l][i] / s1;
                B[l][i + n] = C[l][i + n] / s2;
                C[l][i] = C[l][i + n] = 0.0;
            }

            /* NOTE(review): the original never sent the normalised B and
             * the cleared C back to the device, so these host updates had
             * no effect on later iterations. They are uploaded here on the
             * assumption that a power iteration was intended -- confirm. */
            checkCublas(cublasSetMatrix(n, 2, elemSize, B[l], n, devB[l], n),
                        "cublasSetMatrix(B)");
            checkCublas(cublasSetMatrix(n, 2, elemSize, C[l], n, devC[l], n),
                        "cublasSetMatrix(C)");
        }
    }

    t2 = elaptime();
    checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize");
    checkCuda(cudaEventElapsedTime(&elapsed_time_ms, start, stop), "cudaEventElapsedTime");

    printf("alpha = %5.3e\n", alpha);
    printf("beta = %5.3e\n", beta);
    printf(" sec= %lf \n", t2 - t1);
    /* Same flop model as the original, evaluated in double precision. */
    flop = 2.0 * (2.0 * (double)n * (double)n + 4.0 * (double)n)
               * (double)nt * (double)nl;
    printf(" sec2= %lf \n", elapsed_time_ms);       /* event time, milliseconds */
    printf("s1 s2= %lf %lf \n", s1, s2);
    printf(" n %d nl %d nt %d \n", n, nl, nt);
    printf("#flop %lf \n GFlops %lf \n", flop, flop * 1.e-9 / (t2 - t1));

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    for (l = 0; l < nl; l++) {
        cublasFree(devA[l]);
        cublasFree(devB[l]);
        cublasFree(devC[l]);
        free(A[l]);
        free(B[l]);
        free(C[l]);
    }
    cublasShutdown();
    free(devA);
    free(devB);
    free(devC);
    free(C);
    free(B);
    free(A);
    return 0;
}