Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#pragma once
#include <cstdarg>
#include <cstdio>
#include <exception>
#include <string>
#include "gpu_types.hpp"
namespace gpu {
// printf-style formatting into a std::string.
//
// format: printf format string; the variadic arguments must match it.
// Returns the formatted text. Returns an empty string when vsnprintf reports
// a formatting/encoding error (negative result) instead of attempting to
// resize the string to a negative length.
inline std::string stringprintf(const char* format, ...) {
  va_list args;
  va_start(args, format);

  // Pass 1: measure. vsnprintf consumes the va_list it is given, so the
  // measurement runs on a copy and `args` stays usable for pass 2.
  va_list measure;
  va_copy(measure, args);
  int len = vsnprintf(nullptr, 0, format, measure);
  va_end(measure);

  std::string text;
  if(len > 0) {
    // Pass 2: format into the string's own storage. The extra byte passed in
    // the size covers the terminator that vsnprintf writes at text[len].
    text.resize(len);
    vsnprintf(&text[0], (size_t)len + 1, format, args);
  }
  va_end(args);
  return text;
}
// Tag passed to context_t::alloc / context_t::free to select the memory space
// an allocation belongs to. The numeric values are explicit and fixed.
enum memory_space_t {
  memory_space_device = 0,   // device-side allocation
  memory_space_host   = 1,   // host-side allocation
};
// Builds a multi-line, human-readable summary of a CUDA device.
//
// prop: device properties to describe (name, clocks, SM count, compute
//       capability, memory bus width, ECC state).
//
// The ordinal and the free/total memory figures are queried from the runtime
// (cudaGetDevice / cudaMemGetInfo), i.e. they describe the device that is
// current on the calling thread, not necessarily the device `prop` came from.
//
// Returns the formatted description.
// Throws cuda_exception_t if either runtime query fails.
inline std::string device_prop_string(cudaDeviceProp prop) {
  // Check the ordinal query too; otherwise a failure would leave `ordinal`
  // uninitialized and print garbage.
  int ordinal;
  cudaError_t result = cudaGetDevice(&ordinal);
  if(cudaSuccess != result) throw cuda_exception_t(result);

  size_t freeMem, totalMem;
  result = cudaMemGetInfo(&freeMem, &totalMem);
  if(cudaSuccess != result) throw cuda_exception_t(result);

  // memoryClockRate is scaled by 1000 and memoryBusWidth divided by 8 (bits to
  // bytes), then the product is reported in units of 1e9 bytes/s.
  // NOTE(review): the factor of 2 presumably accounts for double-data-rate
  // memory - confirm.
  double memBandwidth = (prop.memoryClockRate * 1000.0) *
    (prop.memoryBusWidth / 8 * 2) / 1.0e9;

  // Every argument must match its conversion exactly: sizeof yields size_t,
  // so it is cast to int for the %2d pointer-width field.
  int freeMB = (int)(freeMem / (1<< 20));
  int totalMB = (int)(totalMem / (1<< 20));
  int pointerBits = (int)(8 * sizeof(int*));

  std::string s = stringprintf(
    "%s : %8.3lf Mhz (Ordinal %d)\n"
    "%d SMs enabled. Compute Capability sm_%d%d\n"
    "FreeMem: %6dMB TotalMem: %6dMB %2d-bit pointers.\n"
    "Mem Clock: %8.3lf Mhz x %d bits (%5.1lf GB/s)\n"
    "ECC %s\n\n",
    prop.name, prop.clockRate / 1000.0, ordinal,
    prop.multiProcessorCount, prop.major, prop.minor,
    freeMB, totalMB, pointerBits,
    prop.memoryClockRate / 1000.0, prop.memoryBusWidth, memBandwidth,
    prop.ECCEnabled ? "Enabled" : "Disabled");
  return s;
}
////////////////////////////////////////////////////////////////////////////////
// context_t
// Abstract interface through which callers obtain device properties, a
// stream, memory, synchronization and timing.
// Derive context_t to add support for streams and a custom allocator.

struct context_t {
  context_t() = default;

  // Implementations are used through this base type, so destruction through
  // a context_t pointer or reference must run the derived destructor.
  virtual ~context_t() = default;

  // Disable copy ctor and assignment operator. We don't want to let the
  // user copy only a slice.
  context_t(const context_t& rhs) = delete;
  context_t& operator=(const context_t& rhs) = delete;

  // Device properties associated with this context.
  virtual const cudaDeviceProp& props() const = 0;

  // PTX version reported by the implementation.
  // NOTE(review): encoding of the integer is not visible here - confirm
  // against the implementation.
  virtual int ptx_version() const = 0;

  // Stream associated with this context.
  virtual cudaStream_t stream() = 0;

  // Allocate `size` bytes in the requested memory space.
  virtual void* alloc(size_t size, memory_space_t space) = 0;

  // Release a pointer obtained from alloc(); `space` should match the space
  // it was allocated in.
  virtual void free(void* p, memory_space_t space) = 0;

  // cudaStreamSynchronize or cudaDeviceSynchronize for stream 0.
  virtual void synchronize() = 0;

  // Event associated with this context.
  virtual cudaEvent_t event() = 0;

  // Start a timed region; timer_end() closes it and returns the measurement.
  // NOTE(review): the unit of the returned double is defined by the
  // implementation - confirm.
  virtual void timer_begin() = 0;
  virtual double timer_end() = 0;
};
// Dummy kernel for retrieving PTX version.
// The body is intentionally empty and the int template parameter is unused by
// it; the kernel only exists so that there is a compiled __global__ function
// to refer to.
// NOTE(review): presumably an instantiation of this kernel is passed to a
// function-attribute query (e.g. cudaFuncGetAttributes) by code outside this
// view - confirm.
template<int no_arg>
__global__ void empty_f() { }
}