import numba as nb
import numpy as np
import time
from numba import cuda, float32

import cupy as cp

# gpu = cp.ones((1024, 512, 4, 4), dtype=cp.int32)
# cpu = np.ones((1024, 512, 4, 4), dtype=np.int32)
#
#
# ctime1 = time.time()
#
# for c in range(1024):
#     cpu = np.add(cpu, cpu)
#
# ctime2 = time.time()
#
# ctotal = ctime2 - ctime1
# print('pure CPU compute time:', ctotal)
#
# # pure CuPy GPU test:
# gtime1 = time.time()  # start the wall clock for the GPU test
# for g in range(1024):
#     gpu = cp.add(gpu, gpu)   # CuPy's built-in add function
# gtime2 = time.time()
# gtotal = gtime2 - gtime1
# print('pure GPU compute time:', gtotal)
#
#
# # mixed GPU/CPU programming:
# ggtime1 = time.time()
# for g in range(1024):
#     gpu = gpu + gpu         # manual addition: '+' falls back to CPU computation by default!!!
# ggtime2 = time.time()
# ggtotal = ggtime2 - ggtime1
# print('mixed compute time:', ggtotal)
TPB = 16

@cuda.jit
def matmul_gpu(A, B, C):
    # naive kernel: each thread computes one element of C from global memory
    x, y = cuda.grid(2)
    if x < C.shape[0] and y < C.shape[1]:
        tmp = 0.
        for k in range(A.shape[1]):
            tmp += A[x, k] * B[k, y]
        C[x, y] = tmp
@nb.jit(nopython=True)
def matmul_cpu(A, B, C):
    # naive triple loop; x indexes rows of A, y indexes columns of B
    for y in range(B.shape[1]):
        for x in range(A.shape[0]):
            tmp = 0.
            for k in range(A.shape[1]):
                tmp += A[x, k] * B[k, y]
            C[x, y] = tmp
@cuda.jit
def matmul_shared_memory(A, B, C):
    # tiled kernel: each block stages TPB x TPB tiles of A and B in shared
    # memory; assumes the matrix dimensions are exact multiples of TPB
    sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
    sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    if x >= C.shape[0] or y >= C.shape[1]:
        return
    tmp = 0.
    for i in range(A.shape[1] // TPB):
        # each thread loads one element of the current tile
        sA[tx, ty] = A[x, ty + i * TPB]
        sB[tx, ty] = B[tx + i * TPB, y]
        cuda.syncthreads()   # wait until the whole tile is loaded
        for j in range(TPB):
            tmp += sA[tx, j] * sB[j, ty]
        cuda.syncthreads()   # wait before the tile is overwritten
    C[x, y] = tmp

A = np.full((TPB * 50, TPB * 50), 3, dtype=np.float32)   # np.float is removed in modern NumPy; float32 matches the shared-memory tiles
B = np.full((TPB * 50, TPB * 50), 4, dtype=np.float32)
C_cpu = np.full((A.shape[0], B.shape[1]), 0, dtype=np.float32)

print("start  processing in CPU")
start_cpu = time.time()
matmul_cpu(A,B,C_cpu)
end_time =time.time() - start_cpu
print("cpu time".format(str(round(end_time))))

# start in GPU
A_global_mem = cuda.to_device(A)
B_global_mem = cuda.to_device(B)
C_global_mem = cuda.device_array((A.shape[0], B.shape[1]), dtype=np.float32)

C_shared_mem = cuda.device_array((A.shape[0], B.shape[1]), dtype=np.float32)
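# The script stops before launching the kernels, so what follows is a minimal
# launch sketch rather than the original author's code: a 2D grid sized so one
# thread covers each element of C, timed the same way as the CPU path.
threads_per_block = (TPB, TPB)
blocks_per_grid = ((A.shape[0] + TPB - 1) // TPB,   # ceil division over rows of C
                   (B.shape[1] + TPB - 1) // TPB)   # ceil division over columns of C

print("start processing in GPU")
start_gpu = time.time()
matmul_gpu[blocks_per_grid, threads_per_block](A_global_mem, B_global_mem, C_global_mem)
cuda.synchronize()   # kernel launches are asynchronous; wait before reading the clock
print("GPU (global memory) time: {:.3f} s".format(time.time() - start_gpu))

start_shared = time.time()
matmul_shared_memory[blocks_per_grid, threads_per_block](A_global_mem, B_global_mem, C_shared_mem)
cuda.synchronize()
print("GPU (shared memory) time: {:.3f} s".format(time.time() - start_shared))

C_gpu = C_global_mem.copy_to_host()
print("GPU result matches CPU:", np.allclose(C_cpu, C_gpu))   # sanity check against the CPU result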