11 #include <Base/Common/BIASpragma.hh>
12 #include <OpenCLFramework/Algorithm/clfRadixSort.hh>
14 #define TRANSPOSE 0 // flag: transpose the initial vector (faster memory access); currently 0 - presumably disabled, confirm at use sites
19 #define _RADIX (1 << 5) // radix = 2^5 = 32
20 #define _PASS (30/5) // number of passes needed to sort the list: 30/5 = 6 (presumably 30 key bits at 5 bits per pass - confirm)
21 #define _HISTOSIZE (16 * 16 * _RADIX ) // size of the histogram: 16 * 16 * 32 = 8192
23 #define _MAXINT (1 << (30-1)) // 2^29; _MAXINT-1 is the value written into the pad[] array further down
29 clfRadixSort::clfRadixSort(
clfContext *ctx,
bool sharedGL,
unsigned int device)
35 assert((1<<23) % (16 * 16) == 0);
36 assert( (16 * 16 * _RADIX) % 512 == 0);
37 assert(pow(
float(2),(
int) log2(
float(16))) == 16);
38 assert(pow(
float(2),(
int) log2(
float(16))) == 16);
94 int maxmemcache=max(512,16 * 16 * _RADIX / 512);
102 for(
int i = 0; i < (1<<23); i++){
109 for (
int i=0;i<num;i++) {
110 h_Keys[i] = (int)(data[i]*scale);
113 for(
int i = 0; i < num; i++){
129 assert(nn <= (1<<23));
134 int reste=
nkeys % (16 * 16);
136 unsigned int pad[16 * 16];
137 for(
int ii=0;ii<16 * 16;ii++){
138 pad[ii]=_MAXINT-(
unsigned int)1;
155 if (nbrow%tilesize != 0) tilesize=1;
156 if (nbcol%tilesize != 0) tilesize=1;
159 cout <<
"Warning, small list, avoiding cache..."<<endl;
172 size_t global_work_size[2];
173 size_t local_work_size[2];
175 assert(nbrow%tilesize == 0);
176 assert(nbcol%tilesize == 0);
178 global_work_size[0]=nbrow/tilesize;
179 global_work_size[1]=nbcol;
181 local_work_size[0]=1;
182 local_work_size[1]=tilesize;
215 for(
unsigned int pass=0;pass<_PASS;pass++){
230 string kname =
"applypermutation";
231 if (hightolow) kname =
"applypermutationHighToLow";
243 cout <<
"Get the data from the GPU"<<endl;
247 cout <<
"Test order"<<endl;
250 for(
unsigned int i=0;i<
nkeys-1;i++){
252 cout <<
"erreur tri "<< i<<
" "<<
h_Keys[i]<<
" ,"<<i+1<<
" "<<
h_Keys[i+1]<<endl;
257 cout <<
"test OK !"<<endl;
301 size_t nblocitems=16;
302 size_t nbitems=16*16;
304 assert(_RADIX == pow(
float(2),
int(5)));
323 size_t nbitems=_RADIX* 16*16 / 2;
326 size_t nblocitems= nbitems/512 ;
349 nbitems = _RADIX* 16*16/2;
350 nblocitems=nbitems/512;
359 size_t nblocitems=16;
360 size_t nbitems=16*16;
369 assert(_RADIX == pow(
float(2),
int(5)));
unsigned int h_Histograms[5 *16 *16]
clfBuffer * CreateBuffer()
create buffer object
std::string AddSource(std::string filename)
adds source code from a file
unsigned int h_Permut[(1<< 23)]
void ApplyPermutation(clfBuffer *in, clfBuffer *out, int elemsize, int numElems, bool hightolow=false)
void KernelSetLocalArgument(std::string kernelname, unsigned int argnumber, int size)
void Histogram(unsigned int pass)
void SetData(int num, float *data, int scale=1)
void Reorder(unsigned int pass)
void Build(int deviceNr=0, std::string options="")
builds the sources added by AddSource and AddSourceFromString
const std::string & GetDetailedString() const
detailed combination of all info available
void ReadFromBuffer(void *data, unsigned int offset=0, unsigned int size=0)
read from buffer object to host memory
void Transpose(int nbrow, int nbcol)
clf exception wrapper; thrown in case of most clf errors
void Finish()
force finishing the command queue
void Allocate(unsigned int bufsize, bool readonly=false, bool writeonly=false, void *hostptr=NULL, bool copy=false)
Allocation of a memory buffer. A memory buffer can be created on device or host; it can be initialized...
unsigned int h_globsum[512]
void KernelSetArgument(std::string kernelname, unsigned int argnumber, clfBuffer &buffer)
set kernel argument
void AddKernel(std::string kernelname)
adds a kernel to the program.
unsigned int nkeys_rounded
void RunOn2DRange(clfProgram &program, std::string kernelname, unsigned int globalrangeX, unsigned int globalrangeY, unsigned int localrangeX=0, unsigned int localrangeY=0)
run a kernel on a 2D memory range
unsigned int * GetPermutation(unsigned int num=0)
void CopyBuffer(clfBuffer &outputbuffer, unsigned int srcoffset=0, unsigned int dstoffset=0, unsigned int size=0)
copy from one buffer to another
void WriteToBuffer(const void *data, unsigned int offset=0, unsigned int size=0)
write from host memory to buffer object
unsigned int h_initialPermut[(1<< 23)]
void RunOn1DRange(clfProgram &program, std::string kernelname, unsigned int globalrange, unsigned int localrange=0)
run a kernel on a 1D memory range
unsigned int h_Keys[(1<< 23)]
int DivUp(const int mod, int val)