Commit aeae8cf7 authored by steinbac's avatar steinbac
Browse files

finished java part, needs debugging

parent 6b17df4f
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>
extern "C" {
#include "vector_sum.h"
}
// Host-side driver for the native (CPU) vector-sum implementation.
//
// Fills two float vectors, calls native_vector_sum from vector_sum.h,
// and reports the maximum deviation from the expected per-element
// result.  The 3.0f check implies the C function accumulates
// host_b into host_a element-wise (a[i] += b[i]) — see vector_sum.h
// for the authoritative contract.
//
// Usage: <binary> [multiplier]
//   multiplier scales the default size of 2^20 elements.
int main(int argc, char *argv[])
{
  std::size_t vector_size = (1 << 20);

  if (argc > 1)
    vector_size *= std::stoi(argv[1]);  // NOTE: throws on non-numeric input

  std::cout << "vector sum: " << vector_size << " elements" << std::endl;

  std::vector<float> host_a(vector_size, 1.f);
  std::vector<float> host_b(vector_size, 2.f);

  // .data() (not &v[0]) is well-defined even when vector_size == 0,
  // e.g. when the multiplier argument is 0.
  native_vector_sum(host_a.data(), host_b.data(), vector_size);

  // Every element should now equal 1.f + 2.f == 3.f; report the
  // largest absolute deviation as a simple correctness check.
  float max_error = 0.0f;
  for (const float& item : host_a)
    max_error = std::max(max_error, std::abs(item - 3.0f));

  std::cout << "Max error: " << max_error << std::endl;

  return 0;
}
......@@ -16,3 +16,6 @@ lib%.so : %.cu
% : %.cpp libvector_sum.so
$(CXX) -std=c++11 -o $@ -L. -I. -lvector_sum $<
clean :
rm *so $(DST_FILES)
\ No newline at end of file
......@@ -17,7 +17,7 @@ public class VectorSum {
void native_vector_sum(Pointer host_a,Pointer host_b, unsigned n_elements);
}
public static void main(String[] args) {
CLibrary cuda_lib = (CLibrary)Native.loadLibrary("libvectorsum", CLibrary.class);
......@@ -33,7 +33,7 @@ public class VectorSum {
vec_b.setFloat(dloop * Native.getNativeSize(Float.TYPE), 2);
}
// call the C function
double add = clib.native_vector_sum(vec_a,vec_b, 100);
cuda_lib.native_vector_sum(vec_a,vec_b, 100);
//check the result
Double sum = 0;
......
......@@ -21,9 +21,9 @@ int main(int argc, char *argv[])
std::cout << "vector sum: " << vector_size << " elements" << std::endl;
std::vector<float> host_a(vector_size,1.f);
std::vector<float> host_a(vector_size,42.f);
std::vector<float> host_b(vector_size,2.f);
const float host_d = 42.f;
const float host_scale = 1.f;
//gpu relevant code
float * device_a=nullptr, *device_b=nullptr;
......@@ -36,10 +36,12 @@ int main(int argc, char *argv[])
cudaMemcpy(device_b, &host_b[0], vector_size_byte,
cudaMemcpyHostToDevice);
vector_sum<<<(vector_size+255)/256, 256>>>(vector_size,
host_d,
device_a,
device_b);
unsigned threads =32;
unsigned blocks = (vector_size + threads - 1)/threads;
vector_sum<<<blocks,threads>>>(vector_size,
host_scale,
device_a,
device_b);
cudaMemcpy(&host_a[0], device_a, vector_size_byte,
cudaMemcpyDeviceToHost);
......
......@@ -3,7 +3,7 @@
CUDA_FLAGS += --std=c++11 -m64
#adapt to your architecture
ARCH_FLAGS ?= -gencode arch=compute_20,code=sm_20 -gencode arch=compute_20,code=sm_21 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35
ARCH_FLAGS ?= -g -G -gencode arch=compute_20,code=sm_20 -gencode arch=compute_20,code=sm_21 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35
NVCC ?= $(shell which nvcc)
SRC_FILES=$(wildcard *.cu)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment