import time

import numba as nb
import numpy as np
from numba import cuda, float32

# import cupy as cp  # only needed for the commented-out CuPy benchmark below

# gpu = cp.ones((1024, 512, 4, 4), dtype=cp.int32)
# cpu = np.ones((1024, 512, 4, 4), dtype=np.int32)
#
# # Pure-CPU test:
# ctime1 = time.time()
# for c in range(1024):
#     cpu = np.add(cpu, cpu)
# ctime2 = time.time()
# ctotal = ctime2 - ctime1
# print('pure CPU compute time:', ctotal)
#
# # Pure-CuPy GPU test:
# gtime1 = time.time()  # start the clock for the GPU test
# for g in range(1024):
#     gpu = cp.add(gpu, gpu)  # CuPy's built-in add
# gtime2 = time.time()
# gtotal = gtime2 - gtime1
# print('pure GPU compute time:', gtotal)
#
# # Mixed GPU/CPU test:
# ggtime1 = time.time()
# for g in range(1024):
#     gpu = gpu + gpu  # + on CuPy arrays also dispatches to the GPU
# ggtime2 = time.time()
# ggtotal = ggtime2 - ggtime1
# print('mixed compute time:', ggtotal)

TPB = 16  # threads per block along each axis of a 2-D block


@cuda.jit
def matmul_gpu(A, B, C):
    # Naive kernel: each thread computes one element of C straight from
    # global memory.
    row, col = cuda.grid(2)
    if row < C.shape[0] and col < C.shape[1]:
        tmp = 0.
        for k in range(A.shape[1]):
            tmp += A[row, k] * B[k, col]
        C[row, col] = tmp


@nb.jit(nopython=True)
def matmul_cpu(A, B, C):
    # Reference implementation, JIT-compiled for the CPU.
    for y in range(B.shape[1]):
        for x in range(A.shape[0]):
            tmp = 0.
            for k in range(A.shape[1]):
                tmp += A[x, k] * B[k, y]
            C[x, y] = tmp


@cuda.jit
def matmul_shared_memory(A, B, C):
    # Tiled kernel: each block stages TPB x TPB tiles of A and B in shared
    # memory so each global-memory element is read only once per tile.
    # Assumes the matrix dimensions are exact multiples of TPB (true here).
    sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
    sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32)

    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y

    tmp = 0.
    for i in range(A.shape[1] // TPB):
        # Preload one tile of A and one tile of B into shared memory.
        sA[tx, ty] = A[x, ty + i * TPB]
        sB[tx, ty] = B[tx + i * TPB, y]
        cuda.syncthreads()  # wait until the whole tile is loaded
        for j in range(TPB):
            tmp += sA[tx, j] * sB[j, ty]
        cuda.syncthreads()  # wait before the tiles are overwritten
    C[x, y] = tmp


# float32 matches the dtype of the shared-memory tiles in the kernel above.
A = np.full((TPB * 50, TPB * 50), 3, dtype=np.float32)
B = np.full((TPB * 50, TPB * 50), 4, dtype=np.float32)
C_cpu = np.full((A.shape[0], B.shape[1]), 0, dtype=np.float32)

print("start processing in CPU")
start_cpu = time.time()
matmul_cpu(A, B, C_cpu)  # the first call also pays the JIT compilation cost
end_time = time.time() - start_cpu
print("CPU time: {:.2f}s".format(end_time))

# start in GPU
A_global_mem = cuda.to_device(A)
B_global_mem = cuda.to_device(B)
C_global_mem = cuda.device_array((A.shape[0], B.shape[1]), dtype=np.float32)
C_shared_mem = cuda.device_array((A.shape[0], B.shape[1]), dtype=np.float32)
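
# A minimal launch sketch (added; not part of the original script). The
# grid/block sizes here are assumptions: one thread per element of C, with
# TPB x TPB threads per block. Both dimensions (800 = 50 * TPB) divide
# evenly into the blocks, which the shared-memory kernel requires.
import math

threads_per_block = (TPB, TPB)
blocks_per_grid = (math.ceil(A.shape[0] / TPB), math.ceil(B.shape[1] / TPB))

print("start processing in GPU")
start_gpu = time.time()
matmul_gpu[blocks_per_grid, threads_per_block](A_global_mem, B_global_mem, C_global_mem)
cuda.synchronize()  # kernel launches are async; wait before stopping the clock
print("GPU (global memory) time: {:.2f}s".format(time.time() - start_gpu))

start_shared = time.time()
matmul_shared_memory[blocks_per_grid, threads_per_block](A_global_mem, B_global_mem, C_shared_mem)
cuda.synchronize()
print("GPU (shared memory) time: {:.2f}s".format(time.time() - start_shared))

# Copy the results back to the host and check them against the CPU reference.
print("global-memory kernel matches CPU:", np.allclose(C_cpu, C_global_mem.copy_to_host()))
print("shared-memory kernel matches CPU:", np.allclose(C_cpu, C_shared_mem.copy_to_host()))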