/*
 * ofp_context.hpp
 *
 *  Created on: Nov 15, 2018
 *      Author: i-bird
 */

#ifndef OFP_CONTEXT_HXX_
#define OFP_CONTEXT_HXX_

#include <iostream>

#ifdef CUDA_ON_CPU

namespace gpu
{
	enum gpu_context_opt
	{
		no_print_props,//!< no_print_props
		print_props,   //!< print_props
		dummy          //!< dummy
	};

	struct context_t {};

	class ofp_context_t : public context_t
	{
		protected:

			std::string _props;

			openfpm::vector<aggregate<unsigned char>> tmem;

			template<int no_arg = 0>
			void init(int dev_num, gpu_context_opt opt)
			{}

		public:

			/*! \brief gpu context constructor
				*
				* \param opt options for this gpu context
				*
				*/
			ofp_context_t(gpu_context_opt opt = gpu_context_opt::no_print_props , int dev_num = 0, int stream_ = 0)
			{}

			~ofp_context_t()
			{}

			virtual const std::string& props() const
			{
				return _props;
			}

			virtual int ptx_version() const
			{
				return 0;
			}

			virtual int stream() 
			{
				std::cout << __FILE__ << ":" << __LINE__ << " Not implemented" << std::endl;
				return 0; 
			}

			// Alloc GPU memory.
			virtual void* alloc(size_t size, int space)
			{
				std::cout << __FILE__ << ":" << __LINE__ << " Not implemented" << std::endl;
				return NULL;
			}

			virtual void free(void* p, int space)
			{
				std::cout << __FILE__ << ":" << __LINE__ << " Not implemented"  << std::endl;
			}

			virtual void synchronize()
			{
				std::cout << __FILE__ << ":" << __LINE__ << " Not implemented"  << std::endl;
			}

			virtual int event()
			{
				std::cout << __FILE__ << ":" << __LINE__ << " Not implemented"  << std::endl;
				return 0;
			}

			virtual void timer_begin()
			{
				std::cout << __FILE__ << ":" << __LINE__ << " Not implemented"  << std::endl;
			}

			virtual double timer_end()
			{
				std::cout << __FILE__ << ":" << __LINE__ << " Not implemented"  << std::endl;
				return 0.0;
			}

			virtual int getDevice()
			{
				std::cout << __FILE__ << ":" << __LINE__ << " Not implemented"  << std::endl;
				return 0;
			}
	};

}

#else
	#ifdef CUDA_GPU

		#include "util/gpu_context.hpp"

		namespace gpu
		{
			enum gpu_context_opt
			{
				no_print_props,//!< no_print_props
				print_props,   //!< print_props
				dummy          //!< dummy
			};

			////////////////////////////////////////////////////////////////////////////////
			// ofp_context_t is a trivial implementation of context_t. Users can
			// derive this type to provide a custom allocator.

			class ofp_context_t : public context_t
			{
				protected:
					cudaDeviceProp _props;
					int _ptx_version;
					cudaStream_t _stream;

					cudaEvent_t _timer[2];
					cudaEvent_t _event;

					openfpm::vector_gpu<aggregate<unsigned char>> tmem;
					openfpm::vector_gpu<aggregate<unsigned char>> tmem2;
					openfpm::vector_gpu<aggregate<unsigned char>> tmem3;

					// Making this a template argument means we won't generate an instance
					// of empty_f for each translation unit.
					template<int no_arg = 0>
					void init(int dev_num, gpu_context_opt opt)
					{
						cudaFuncAttributes attr;
						#ifdef __NVCC__
						cudaError_t result = cudaFuncGetAttributes(&attr, (void *)empty_f<0>);
						if(cudaSuccess != result) throw cuda_exception_t(result);
						_ptx_version = attr.ptxVersion;
						#else
						_ptx_version = 60;
						//std::cout << __FILE__ << ":" << __LINE__ << " Warning initialization of GPU context has been done from a standard Cpp file, rather than a CUDA or HIP file" << std::endl;
						#endif

						int num_dev;
						cudaGetDeviceCount(&num_dev);

						if (num_dev == 0) {return;}

						if (opt != gpu_context_opt::dummy)
						{
							cudaSetDevice(dev_num % num_dev);
						}

						int ord;
						cudaGetDevice(&ord);
						cudaGetDeviceProperties(&_props, ord);

						cudaEventCreate(&_timer[0]);
						cudaEventCreate(&_timer[1]);
						cudaEventCreate(&_event);
					}

				public:


					/*! \brief gpu context constructor
					*
					* \param opt options for this gpu context
					*
					*/
					ofp_context_t(gpu_context_opt opt = gpu_context_opt::no_print_props , int dev_num = 0, cudaStream_t stream_ = 0)
					:context_t(), _stream(stream_)
					{
						init(dev_num,opt);
						if(opt == gpu_context_opt::print_props)
						{
							printf("%s\n", device_prop_string(_props).c_str());
						}
					}

					~ofp_context_t()
					{
						cudaEventDestroy(_timer[0]);
						cudaEventDestroy(_timer[1]);
						cudaEventDestroy(_event);
					}

					virtual const cudaDeviceProp& props() const { return _props; }
					virtual int ptx_version() const { return _ptx_version; }
					virtual cudaStream_t stream() { return _stream; }

					// Alloc GPU memory.
					virtual void* alloc(size_t size, memory_space_t space)
					{
						void* p = nullptr;
						if(size)
						{
							cudaError_t result = (memory_space_device == space) ?cudaMalloc(&p, size) : cudaMallocHost(&p, size);
							if(cudaSuccess != result) throw cuda_exception_t(result);
						}
						return p;
					}

					virtual void free(void* p, memory_space_t space)
					{
						if(p)
						{
							cudaError_t result = (memory_space_device == space) ? cudaFree(p) : cudaFreeHost(p);
							if(cudaSuccess != result) throw cuda_exception_t(result);
						}
					}

					virtual void synchronize()
					{
						cudaError_t result = _stream ?
						cudaStreamSynchronize(_stream) :
						cudaDeviceSynchronize();
						if(cudaSuccess != result) throw cuda_exception_t(result);
					}

					virtual cudaEvent_t event()
					{
						return _event;
					}

					virtual void timer_begin()
					{
						cudaEventRecord(_timer[0], _stream);
					}

					virtual double timer_end()
					{
						cudaEventRecord(_timer[1], _stream);
						cudaEventSynchronize(_timer[1]);
						float ms;
						cudaEventElapsedTime(&ms, _timer[0], _timer[1]);
						return ms / 1.0e3;
					}

					virtual int getDevice()
					{
						int dev = 0;

						cudaGetDevice(&dev);

						return dev;
					}

					virtual int getNDevice()
					{
						int num_dev;
						cudaGetDeviceCount(&num_dev);

						return num_dev;
					}

					openfpm::vector_gpu<aggregate<unsigned char>> & getTemporalCUB()
					{
						return tmem;
					}

					openfpm::vector_gpu<aggregate<unsigned char>> & getTemporalCUB2()
					{
						return tmem2;
					}

					openfpm::vector_gpu<aggregate<unsigned char>> & getTemporalCUB3()
					{
						return tmem3;
					}
			};

		}

	#else

		namespace gpu
		{

			enum gpu_context_opt
			{
				no_print_props,//!< no_print_props
				print_props,   //!< print_props
				dummy          //!< dummy
			};

			// Stub class for modern gpu

			struct ofp_context_t
			{
				ofp_context_t(gpu_context_opt opt = gpu_context_opt::no_print_props , int dev_num = 0)
				{}
			};
		}

	#endif

#endif


#endif /* OFP_CONTEXT_HXX_ */