Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
steinbac
20150118-cuda-intro
Commits
9eb2f7e6
Commit
9eb2f7e6
authored
Jan 18, 2016
by
Peter Steinbach
Browse files
thrust and cuda* added to repo
parents
Changes
8
Hide whitespace changes
Inline
Side-by-side
cuda-java/.helpmeout/call_vector_sum.cpp
0 → 100644
View file @
9eb2f7e6
#include <vector>
#include <iostream>
#include <cstdint>
#include <cmath>
extern
"C"
{
#include "vector_sum.h"
}
// Host-side driver: fills two float vectors, delegates their element-wise
// sum to the CUDA-backed native_vector_sum() (declared in vector_sum.h),
// and reports the maximum deviation from the expected result.
//
// Usage: ./call_vector_sum [multiplier]
//   multiplier scales the default problem size of 2^20 elements
//   (std::stoi throws on non-numeric input, terminating the program).
//
// Returns 0 unconditionally; correctness is reported via "Max error".
int main(int argc, char* argv[])
{
  std::size_t vector_size = (1 << 20);

  if (argc > 1)
    vector_size *= std::stoi(argv[1]);

  std::cout << "vector sum: " << vector_size << " elements" << std::endl;

  std::vector<float> host_a(vector_size, 1.f);
  std::vector<float> host_b(vector_size, 2.f);

  // In-place on the GPU: host_a[i] += host_b[i] for all i.
  // NOTE(review): native_vector_sum takes `unsigned`, so vector_size is
  // silently narrowed here — sizes >= 2^32 would be truncated.
  native_vector_sum(&host_a[0], &host_b[0], vector_size);

  // Validate: every element started as 1.f + 2.f, so each must equal 3.f.
  float max_error = 0.0f;
  for (const float& item : host_a)
    max_error = std::max(max_error, std::abs(item - 3.0f));

  std::cout << "Max error: " << max_error << std::endl;
  return 0;
}
cuda-java/.helpmeout/vector_sum.cu
0 → 100644
View file @
9eb2f7e6
#include <vector>
#include <iostream>
#include <cstdint>
#include <cmath>
// Element-wise in-place vector addition: _a[i] += _b[i].
// Expects a 1D launch; threads whose global id falls beyond _size
// (the padded tail of the last block) exit immediately.
__global__ void vector_sum(std::size_t _size, float* _a, float* _b)
{
  const std::size_t gid = blockIdx.x * blockDim.x + threadIdx.x;

  if (gid >= _size)
    return;

  _a[gid] += _b[gid];
}
// Adds host_b into host_a element-wise on the GPU:
//   host_a[i] += host_b[i]  for i in [0, n_elements)
//
// C linkage so it can be called from the JNI/C driver. Both pointers must
// reference at least n_elements floats of host memory. On any CUDA runtime
// failure a diagnostic is printed to stderr and host_a is left unmodified
// (best-effort: this function cannot change its void return type).
extern "C" void native_vector_sum(float* host_a, float* host_b, unsigned n_elements)
{
  const std::size_t vector_size = n_elements;

  std::cout << "vector sum: " << vector_size << " elements" << std::endl;

  // Empty input: nothing to do, and a 0-block launch would be an error.
  if (vector_size == 0)
    return;

  // Report a CUDA runtime failure; true on success.
  auto check = [](cudaError_t status, const char* what) {
    if (status == cudaSuccess)
      return true;
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
  };

  //gpu relevant code
  float* device_a = nullptr;
  float* device_b = nullptr;
  const std::size_t vector_size_byte = vector_size * sizeof(float);

  if (!check(cudaMalloc(&device_a, vector_size_byte), "cudaMalloc(device_a)") ||
      !check(cudaMalloc(&device_b, vector_size_byte), "cudaMalloc(device_b)") ||
      !check(cudaMemcpy(device_a, &host_a[0], vector_size_byte,
                        cudaMemcpyHostToDevice), "cudaMemcpy(device_a)") ||
      !check(cudaMemcpy(device_b, &host_b[0], vector_size_byte,
                        cudaMemcpyHostToDevice), "cudaMemcpy(device_b)")) {
    // cudaFree(nullptr) is a no-op, so partial allocation is safe to unwind.
    cudaFree(device_a);
    cudaFree(device_b);
    return;
  }

  // Ceil-divide into 256-thread blocks; the kernel bounds-checks the tail.
  vector_sum<<<(vector_size + 255) / 256, 256>>>(vector_size, device_a, device_b);
  // Kernel launches return no status directly — query it explicitly.
  check(cudaGetLastError(), "vector_sum launch");

  // Blocking device-to-host copy; also synchronizes with the kernel above.
  check(cudaMemcpy(&host_a[0], device_a, vector_size_byte,
                   cudaMemcpyDeviceToHost), "cudaMemcpy(host_a)");

  cudaFree(device_a);
  cudaFree(device_b);
}
cuda-java/.helpmeout/vector_sum.h
0 → 100644
View file @
9eb2f7e6
#pragma once
extern
"C"
void
native_vector_sum
(
float
*
host_a
,
float
*
host_b
,
unsigned
n_elements
);
cuda-java/Makefile
0 → 100644
View file @
9eb2f7e6
#NVCC specific flags
CUDA_FLAGS += --std=c++11 -m64

#adapt to your architecture
# NOTE(review): compute_20/sm_2x targets are rejected by CUDA 9+ toolkits;
# override ARCH_FLAGS on the command line for newer GPUs.
ARCH_FLAGS ?= -gencode arch=compute_20,code=sm_20 -gencode arch=compute_20,code=sm_21 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35

NVCC ?= $(shell which nvcc)

# every host .cpp becomes one executable of the same name
SRC_FILES = $(wildcard *.cpp)
DST_FILES = $(SRC_FILES:%.cpp=%)

all : $(DST_FILES)

# build the CUDA shared library from the matching .cu file
lib%.so : %.cu
	$(NVCC) $(CUDA_FLAGS) $(ARCH_FLAGS) $< --shared -Xcompiler '-fPIC' -o $@

# link each host .cpp against the shared library; the library must come
# AFTER the translation unit on the command line — GNU ld resolves -l
# flags left to right, so listing -lvector_sum before $< left its symbols
# undefined at link time.
% : %.cpp libvector_sum.so
	$(CXX) -std=c++11 -I. -o $@ $< -L. -lvector_sum
cuda/.helpmeout/vector_sum.cu
0 → 100644
View file @
9eb2f7e6
#include <vector>
#include <iostream>
#include <cstdint>
#include <cmath>
// SAXPY-style in-place update: _a[i] = _scale * _a[i] + _b[i].
// Expects a 1D launch; threads whose global id falls beyond _size
// (the padded tail of the last block) exit immediately.
__global__ void vector_sum(std::size_t _size, float _scale, float* _a, float* _b)
{
  const std::size_t gid = blockIdx.x * blockDim.x + threadIdx.x;

  if (gid >= _size)
    return;

  _a[gid] = _scale * _a[gid] + _b[gid];
}
// Self-contained SAXPY demo: computes host_a = host_d * host_a + host_b on
// the GPU and verifies the result on the host.
//
// Usage: ./vector_sum [multiplier]
//   multiplier scales the default problem size of 2^20 elements
//   (std::stoi throws on non-numeric input, terminating the program).
//
// Returns 0 on success, 1 if device setup fails; numerical correctness is
// reported via "Max error".
int main(int argc, char* argv[])
{
  std::size_t vector_size = (1 << 20);

  if (argc > 1)
    vector_size *= std::stoi(argv[1]);

  std::cout << "vector sum: " << vector_size << " elements" << std::endl;

  std::vector<float> host_a(vector_size, 1.f);
  std::vector<float> host_b(vector_size, 2.f);
  const float host_d = 42.f; // saxpy scale factor

  // Report a CUDA runtime failure; true on success. Previously every
  // runtime call was unchecked, so failures produced silent garbage.
  auto check = [](cudaError_t status, const char* what) {
    if (status == cudaSuccess)
      return true;
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
  };

  //gpu relevant code
  float* device_a = nullptr;
  float* device_b = nullptr;
  const std::size_t vector_size_byte = vector_size * sizeof(float);

  if (!check(cudaMalloc(&device_a, vector_size_byte), "cudaMalloc(device_a)") ||
      !check(cudaMalloc(&device_b, vector_size_byte), "cudaMalloc(device_b)") ||
      !check(cudaMemcpy(device_a, &host_a[0], vector_size_byte,
                        cudaMemcpyHostToDevice), "cudaMemcpy(device_a)") ||
      !check(cudaMemcpy(device_b, &host_b[0], vector_size_byte,
                        cudaMemcpyHostToDevice), "cudaMemcpy(device_b)")) {
    // cudaFree(nullptr) is a no-op, so partial allocation unwinds safely.
    cudaFree(device_a);
    cudaFree(device_b);
    return 1;
  }

  // Ceil-divide into 256-thread blocks; the kernel bounds-checks the tail.
  vector_sum<<<(vector_size + 255) / 256, 256>>>(vector_size, host_d,
                                                 device_a, device_b);
  // Kernel launches return no status directly — query it explicitly.
  check(cudaGetLastError(), "vector_sum launch");

  // Blocking device-to-host copy; also synchronizes with the kernel above.
  check(cudaMemcpy(&host_a[0], device_a, vector_size_byte,
                   cudaMemcpyDeviceToHost), "cudaMemcpy(host_a)");

  // Expected value per element: 42.f * 1.f + 2.f == 44.f.
  float max_error = 0.0f;
  for (const float& item : host_a)
    max_error = std::max(max_error, std::abs(item - 44.0f));

  std::cout << "Max error: " << max_error << std::endl;

  cudaFree(device_a);
  cudaFree(device_b);
  return 0;
}
cuda/Makefile
0 → 100644
View file @
9eb2f7e6
#NVCC specific flags
CUDA_FLAGS += --std=c++11 -m64

#adapt to your architecture
# NOTE(review): compute_20/sm_2x targets are rejected by CUDA 9+ toolkits;
# override ARCH_FLAGS on the command line for newer GPUs.
ARCH_FLAGS ?= -gencode arch=compute_20,code=sm_20 -gencode arch=compute_20,code=sm_21 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35

# locate nvcc on PATH unless the caller provides one
NVCC ?= $(shell which nvcc)

# every .cu file in this directory becomes one executable of the same name
SRC_FILES = $(wildcard *.cu)
DST_FILES = $(SRC_FILES:%.cu=%)

all : $(DST_FILES)

# compile each CUDA source straight to a binary
% : %.cu
	$(NVCC) $(CUDA_FLAGS) $(ARCH_FLAGS) $< -o $@
thrust/.helpmeout/vector_sum.cu
0 → 100644
View file @
9eb2f7e6
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
//#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/transform.h>
#include <iostream>
#include <cstdint>
// Binary functor computing a * x + y (SAXPY) for thrust::transform,
// callable on both host and device.
//
// No longer derives from thrust::binary_function: that adaptor (mirroring
// std::binary_function) is deprecated in C++11 and removed in current
// Thrust releases, and thrust::transform never required its typedefs.
struct saxpy_functor
{
  const float a; // scale factor, fixed at construction

  saxpy_functor(float _a) : a(_a) {}

  __host__ __device__ float operator()(const float& x, const float& y) const
  {
    return a * x + y;
  }
};
// Thrust SAXPY demo: computes a = scale * a + b entirely through
// thrust primitives and verifies the result on the host.
// Command-line arguments are accepted but ignored. Returns 0; numerical
// correctness is reported via "Max error".
int main(int argc, char* argv[])
{
  std::size_t num_elements = 1 << 20;

  // initialise the inputs on the host
  thrust::host_vector<float> host_a(num_elements, 1.f);
  thrust::host_vector<float> host_b(num_elements, 2.f);
  const float scale = 42.f;

  // copying a host_vector into a device_vector performs the H2D transfer
  thrust::device_vector<float> dev_a = host_a;
  thrust::device_vector<float> dev_b = host_b;

  // dev_a[i] = scale * dev_a[i] + dev_b[i]
  thrust::transform(dev_a.begin(), dev_a.end(), // input range #1
                    dev_b.begin(),              // input range #2
                    dev_a.begin(),              // output range
                    saxpy_functor(scale));
  // (an explicit execution policy such as thrust::system::cuda::par could
  //  be passed as a leading argument instead of relying on the default)

  // bring the result back to the host (D2H transfer via assignment)
  host_a = dev_a;

  // expected value per element: 42 * 1 + 2 == 44
  float max_error = 0.0f;
  for (const float& item : host_a)
    max_error = std::max(max_error, std::abs(item - 44.0f));

  std::cout << "Max error: " << max_error << std::endl;
  return 0;
}
thrust/Makefile
0 → 100644
View file @
9eb2f7e6
#NVCC specific flags
CUDA_FLAGS += --std=c++11 -m64

#adapt to your architecture
# NOTE(review): compute_20/sm_2x targets are rejected by CUDA 9+ toolkits;
# override ARCH_FLAGS on the command line for newer GPUs.
ARCH_FLAGS ?= -gencode arch=compute_20,code=sm_20 -gencode arch=compute_20,code=sm_21 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35

# locate nvcc on PATH unless the caller provides one
NVCC ?= $(shell which nvcc)

# every .cu file in this directory becomes one executable of the same name
SRC_FILES = $(wildcard *.cu)
DST_FILES = $(SRC_FILES:%.cu=%)

all : $(DST_FILES)

# compile each CUDA source straight to a binary
% : %.cu
	$(NVCC) $(CUDA_FLAGS) $(ARCH_FLAGS) $< -o $@
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment