#include <stdio.h>
#include <cuda.h>
// Square matrix product kernel: each thread computes one element of C = A * B.
//
// A, B and C are size x size float matrices stored as flat arrays and indexed
// as [row * size + col]. The thread's (row, col) comes from its 2D block and
// thread indices (y -> row, x -> col), so the launch must cover every
// (row, col) pair below size; threads outside the matrix do nothing.
__global__ void vecmul(float *A, float* B, float *C, int size)
{
    const int c = blockIdx.x * blockDim.x + threadIdx.x;
    const int r = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads that fall outside the matrix have no element to produce.
    if (r >= size || c >= size) {
        return;
    }

    // Offset of the first element of row r, shared by the A reads and the C write.
    const int rowStart = r * size;

    // Dot product of row r of A with column c of B.
    float acc = 0.0f;
    int k = 0;
    while (k < size) {
        acc += A[rowStart + k] * B[k * size + c];
        ++k;
    }

    C[rowStart + c] = acc;
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is an error, false when it is cudaSuccess.
static bool maxmulFailed(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return false;
    }
    fprintf(stderr, "maxmul: %s failed: %s\n", what, cudaGetErrorString(status));
    return true;
}

extern "C" {
// Host entry point (C linkage): computes the size x size matrix product
// C = A * B on the GPU using the vecmul kernel.
//
// A, B : host input matrices, size*size floats each.
// C    : host output matrix, size*size floats; written only on success.
// size : matrix dimension. A non-positive size or a NULL pointer is a no-op.
//
// The function returns void, so a failing CUDA call is reported on stderr and
// the function returns with C left untouched. Device buffers are released on
// every path.
void maxmul(float *A, float* B, float *C, int size) {
    if (size <= 0 || A == NULL || B == NULL || C == NULL) {
        return;
    }

    // Byte count is computed in size_t so that size*size*sizeof(float)
    // cannot overflow a 32-bit int.
    const size_t total = (size_t)size * (size_t)size;
    const size_t msize = total * sizeof(float);

    // Allocate device memory and upload the inputs. The && chain stops at the
    // first failing call; pointers not yet allocated stay NULL.
    float* gpu_A = NULL;
    float* gpu_B = NULL;
    float* gpu_C = NULL;
    const bool ready =
        !maxmulFailed(cudaMalloc((void**)&gpu_A, msize), "cudaMalloc(A)") &&
        !maxmulFailed(cudaMemcpy(gpu_A, A, msize, cudaMemcpyHostToDevice), "cudaMemcpy(A)") &&
        !maxmulFailed(cudaMalloc((void**)&gpu_B, msize), "cudaMalloc(B)") &&
        !maxmulFailed(cudaMemcpy(gpu_B, B, msize, cudaMemcpyHostToDevice), "cudaMemcpy(B)") &&
        !maxmulFailed(cudaMalloc((void**)&gpu_C, msize), "cudaMalloc(C)");

    if (ready) {
        // Blocks & grid: fixed 16x16 blocks (256 threads) tiled over the
        // matrix with a ceil-divided grid. A single size x size block would
        // exceed the per-block thread limit as soon as size > 32 and the
        // launch would fail; the kernel's own bounds check handles the
        // partially filled tiles at the right and bottom edges.
        const unsigned int tile = 16;
        const unsigned int tiles = ((unsigned int)size + tile - 1u) / tile;
        dim3 blocks(tile, tile);
        dim3 grid(tiles, tiles);

        // Call the kernel:
        vecmul<<<grid,blocks>>>(gpu_A,gpu_B,gpu_C,size);

        // A kernel launch returns no status; configuration errors are only
        // visible through cudaGetLastError(). The blocking copy that follows
        // waits for the kernel and reports any error raised while it ran.
        if (!maxmulFailed(cudaGetLastError(), "vecmul launch")) {
            maxmulFailed(cudaMemcpy(C, gpu_C, msize, cudaMemcpyDeviceToHost), "cudaMemcpy(C)");
        }
    }

    // Free device matrices (cudaFree on a NULL pointer does nothing).
    cudaFree(gpu_A);
    cudaFree(gpu_B);
    cudaFree(gpu_C);
}
}
package main
/*
void maxmul(float *A, float* B, float *C, int size);
#cgo LDFLAGS: -L. -L./ -lmaxmul
*/
import "C"
import "fmt"
// Maxmul computes the size x size matrix product c = a * b by calling the C
// function maxmul from the linked maxmul library.
//
// a, b and c are flat slices that must each hold at least size*size elements;
// c receives the result. A non-positive size is a no-op. The function panics
// with a descriptive message when size does not fit in a C int or when a
// slice is too short, because the C side would otherwise read or write past
// the end of the slices' memory.
func Maxmul(a []C.float, b []C.float, c []C.float, size int) {
	if size <= 0 {
		return
	}
	// Round-trip through C.int to detect silent truncation of large sizes.
	if int(C.int(size)) != size {
		panic(fmt.Sprintf("Maxmul: size %d does not fit in a C int", size))
	}
	need := size * size
	if len(a) < need || len(b) < need || len(c) < need {
		panic(fmt.Sprintf(
			"Maxmul: need %d elements per matrix for size %d, got len(a)=%d len(b)=%d len(c)=%d",
			need, size, len(a), len(b), len(c)))
	}
	// need >= 1 here, so taking the address of element 0 is safe.
	C.maxmul(&a[0], &b[0], &c[0], C.int(size))
}
// main multiplies two fixed 3x3 matrices through Maxmul and prints the
// resulting flat slice.
func main() {
	// Matrix dimension; each matrix is passed as a flat slice of dim*dim values.
	const dim = 3

	left := []C.float{
		-1, 2, 4,
		0, 5, 3,
		6, 2, 1,
	}
	right := []C.float{
		3, 0, 2,
		3, 4, 5,
		4, 7, 2,
	}

	// Output buffer filled in by Maxmul.
	product := make([]C.float, dim*dim)

	Maxmul(left, right, product, dim)
	fmt.Println(product)
}
nvcc --ptxas-options=-v --compiler-options '-fPIC' -o libmaxmul.so --shared maxmul.cu
go run maxmul.go
...
[19 36 16 27 41 31 28 15 24]