Commit 9eb2f7e6 authored by Peter Steinbach

thrust and cuda* added to repo
// host driver: calls the GPU vector sum through the C interface in vector_sum.h
#include <vector>
#include <iostream>
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <string>

extern "C" {
#include "vector_sum.h"
}

int main(int argc, char *argv[])
{
    // 2^20 elements by default, optionally scaled by the first CLI argument
    std::size_t vector_size = (1 << 20);
    if (argc > 1)
        vector_size *= std::stoi(argv[1]);

    std::cout << "vector sum: " << vector_size << " elements" << std::endl;

    std::vector<float> host_a(vector_size, 1.f);
    std::vector<float> host_b(vector_size, 2.f);

    // sum the two vectors on the GPU: host_a[i] += host_b[i]
    native_vector_sum(&host_a[0], &host_b[0], vector_size);

    // every element should now equal 1 + 2 = 3
    float max_error = 0.0f;
    for (const float& item : host_a)
        max_error = std::max(max_error, std::abs(item - 3.0f));
    std::cout << "Max error: " << max_error << std::endl;

    return 0;
}
// vector_sum.cu: CUDA kernel plus C wrapper, compiled into libvector_sum.so
#include <iostream>
#include <cstddef>

// element-wise sum, one thread per element: _a[i] += _b[i]
__global__ void vector_sum(std::size_t _size,
                           float* _a,
                           float* _b) {

    const std::size_t index = blockIdx.x * blockDim.x + threadIdx.x;

    if (index < _size)
        _a[index] = _a[index] + _b[index];
}

extern "C" void native_vector_sum(float* host_a, float* host_b, unsigned n_elements)
{
    const std::size_t vector_size = n_elements;
    std::cout << "vector sum: " << vector_size << " elements" << std::endl;

    // gpu relevant code: allocate device buffers and copy the inputs over
    float *device_a = nullptr, *device_b = nullptr;
    const std::size_t vector_size_byte = vector_size * sizeof(float);
    cudaMalloc(&device_a, vector_size_byte);
    cudaMalloc(&device_b, vector_size_byte);

    cudaMemcpy(device_a, &host_a[0], vector_size_byte, cudaMemcpyHostToDevice);
    cudaMemcpy(device_b, &host_b[0], vector_size_byte, cudaMemcpyHostToDevice);

    // launch with 256 threads per block, rounding the block count up
    vector_sum<<<(vector_size + 255) / 256, 256>>>(vector_size, device_a, device_b);

    // copy the result back (cudaMemcpy synchronizes with the kernel)
    cudaMemcpy(&host_a[0], device_a, vector_size_byte, cudaMemcpyDeviceToHost);

    cudaFree(device_a);
    cudaFree(device_b);
}
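The CUDA runtime calls above discard their return codes, so a failed allocation, copy, or launch passes silently. A minimal error-checking sketch; the CUDA_CHECK macro is a hypothetical helper, not part of this commit:

#include <cstdio>
#include <cstdlib>

// hypothetical helper: print the error string and abort if a CUDA call fails
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err = (call);                                         \
        if (err != cudaSuccess) {                                         \
            std::fprintf(stderr, "CUDA error '%s' at %s:%d\n",            \
                         cudaGetErrorString(err), __FILE__, __LINE__);    \
            std::exit(EXIT_FAILURE);                                      \
        }                                                                 \
    } while (0)

// usage inside native_vector_sum:
//   CUDA_CHECK(cudaMalloc(&device_a, vector_size_byte));
//   vector_sum<<<(vector_size + 255) / 256, 256>>>(vector_size, device_a, device_b);
//   CUDA_CHECK(cudaGetLastError());   // kernel launch errors surface via cudaGetLastError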
// vector_sum.h: C interface to the CUDA implementation
#pragma once

extern "C" void native_vector_sum(float* host_a, float* host_b, unsigned n_elements);
# Makefile: build libvector_sum.so with nvcc, then link each .cpp against it
# NVCC-specific flags
CUDA_FLAGS += --std=c++11 -m64
# adapt to your architecture
ARCH_FLAGS ?= -gencode arch=compute_20,code=sm_20 -gencode arch=compute_20,code=sm_21 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35

NVCC ?= $(shell which nvcc)

SRC_FILES = $(wildcard *.cpp)
DST_FILES = $(SRC_FILES:%.cpp=%)

all : $(DST_FILES)

lib%.so : %.cu
	$(NVCC) $(CUDA_FLAGS) $(ARCH_FLAGS) $< --shared -Xcompiler '-fPIC' -o $@

# the library must follow the source on the link line for linkers using --as-needed
% : %.cpp libvector_sum.so
	$(CXX) -std=c++11 -I. -o $@ $< -L. -lvector_sum
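A possible build-and-run sequence for this Makefile, assuming the host driver above lives in main.cpp (the file name is an assumption; the wildcard picks up any *.cpp in the directory):

    make
    LD_LIBRARY_PATH=. ./main   # libvector_sum.so sits next to the binary, so the loader needs the hint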
// standalone CUDA saxpy example: a[i] = scale * a[i] + b[i], all in one .cu file
#include <vector>
#include <iostream>
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <string>

// one thread per element: _a[i] = _scale * _a[i] + _b[i]
__global__ void vector_sum(std::size_t _size,
                           float _scale,
                           float* _a,
                           float* _b) {

    const std::size_t index = blockIdx.x * blockDim.x + threadIdx.x;

    if (index < _size)
        _a[index] = _scale * _a[index] + _b[index];
}

int main(int argc, char *argv[])
{
    std::size_t vector_size = (1 << 20);
    if (argc > 1)
        vector_size *= std::stoi(argv[1]);

    std::cout << "vector sum: " << vector_size << " elements" << std::endl;

    std::vector<float> host_a(vector_size, 1.f);
    std::vector<float> host_b(vector_size, 2.f);
    const float host_d = 42.f;

    // gpu relevant code: allocate device buffers and copy the inputs over
    float *device_a = nullptr, *device_b = nullptr;
    const std::size_t vector_size_byte = vector_size * sizeof(float);
    cudaMalloc(&device_a, vector_size_byte);
    cudaMalloc(&device_b, vector_size_byte);

    cudaMemcpy(device_a, &host_a[0], vector_size_byte, cudaMemcpyHostToDevice);
    cudaMemcpy(device_b, &host_b[0], vector_size_byte, cudaMemcpyHostToDevice);

    vector_sum<<<(vector_size + 255) / 256, 256>>>(vector_size, host_d, device_a, device_b);

    cudaMemcpy(&host_a[0], device_a, vector_size_byte, cudaMemcpyDeviceToHost);

    // every element should now equal 42 * 1 + 2 = 44
    float max_error = 0.0f;
    for (const float& item : host_a)
        max_error = std::max(max_error, std::abs(item - 44.0f));
    std::cout << "Max error: " << max_error << std::endl;

    cudaFree(device_a);
    cudaFree(device_b);
    return 0;
}
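To see what the kernel itself costs, CUDA events can bracket the launch. A minimal timing sketch, reusing the names (vector_size, host_d, device_a, device_b) from the program above:

// event-based timing around the kernel launch
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
vector_sum<<<(vector_size + 255) / 256, 256>>>(vector_size, host_d, device_a, device_b);
cudaEventRecord(stop);
cudaEventSynchronize(stop);                // block until the kernel has finished

float milliseconds = 0.f;
cudaEventElapsedTime(&milliseconds, start, stop);   // elapsed time in ms
std::cout << "kernel: " << milliseconds << " ms" << std::endl;

cudaEventDestroy(start);
cudaEventDestroy(stop);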
# Makefile: compile each .cu in the directory into a standalone binary
# NVCC-specific flags
CUDA_FLAGS += --std=c++11 -m64
# adapt to your architecture
ARCH_FLAGS ?= -gencode arch=compute_20,code=sm_20 -gencode arch=compute_20,code=sm_21 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35

NVCC ?= $(shell which nvcc)

SRC_FILES = $(wildcard *.cu)
DST_FILES = $(SRC_FILES:%.cu=%)

all : $(DST_FILES)

% : %.cu
	$(NVCC) $(CUDA_FLAGS) $(ARCH_FLAGS) $< -o $@
// Thrust version of saxpy: a[i] = scale * a[i] + b[i] via thrust::transform
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
//#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/transform.h>

#include <algorithm>
#include <cmath>
#include <iostream>
#include <cstddef>

// binary functor computing a * x + y, callable on host or device
struct saxpy_functor : public thrust::binary_function<float, float, float>
{
    const float a;

    saxpy_functor(float _a) : a(_a) {}

    __host__ __device__
    float operator()(const float& x, const float& y) const {
        return a * x + y;
    }
};

int main(int argc, char *argv[])
{
    std::size_t N = 1 << 20;

    thrust::host_vector<float> host_a(N, 1.f);
    thrust::host_vector<float> host_b(N, 2.f);
    const float scale = 42.f;

    // assigning a host_vector to a device_vector copies the data to the GPU
    thrust::device_vector<float> dev_a = host_a;
    thrust::device_vector<float> dev_b = host_b;

    thrust::transform(dev_a.begin(), dev_a.end(),   // input range #1
                      dev_b.begin(),                // input range #2
                      dev_a.begin(),                // output range
                      saxpy_functor(scale));        // binary operation

    // the same call with an explicit CUDA execution policy:
    // thrust::transform(thrust::system::cuda::par,
    //                   dev_a.begin(), dev_a.end(), // input range #1
    //                   dev_b.begin(),              // input range #2
    //                   dev_a.begin(),              // output range
    //                   saxpy_functor(scale));      // binary operation

    // copy the result back and check: 42 * 1 + 2 = 44
    host_a = dev_a;
    float max_error = 0.0f;
    for (const float& item : host_a)
        max_error = std::max(max_error, std::abs(item - 44.0f));
    std::cout << "Max error: " << max_error << std::endl;

    return 0;
}
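The hand-written saxpy_functor can also be expressed inline as a Thrust placeholder expression from <thrust/functional.h>. A self-contained sketch of that variant:

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>   // brings in thrust::placeholders
#include <iostream>

int main()
{
    using namespace thrust::placeholders;

    const float scale = 42.f;
    thrust::device_vector<float> dev_a(1 << 20, 1.f);
    thrust::device_vector<float> dev_b(1 << 20, 2.f);

    // scale * _1 + _2 builds the saxpy operation inline, no functor struct needed
    thrust::transform(dev_a.begin(), dev_a.end(),
                      dev_b.begin(),
                      dev_a.begin(),
                      scale * _1 + _2);

    std::cout << "first element: " << dev_a[0] << std::endl;   // expect 44
    return 0;
}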
# Makefile: compile each .cu in the directory into a standalone binary
# NVCC-specific flags
CUDA_FLAGS += --std=c++11 -m64
# adapt to your architecture
ARCH_FLAGS ?= -gencode arch=compute_20,code=sm_20 -gencode arch=compute_20,code=sm_21 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35

NVCC ?= $(shell which nvcc)

SRC_FILES = $(wildcard *.cu)
DST_FILES = $(SRC_FILES:%.cu=%)

all : $(DST_FILES)

% : %.cu
	$(NVCC) $(CUDA_FLAGS) $(ARCH_FLAGS) $< -o $@