Basic Image AlgorithmS Library  2.8.0
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends Groups Pages
clfRadixSort.cpp
1 // C++ class for sorting integer list in OpenCL
2 // copyright Philippe Helluy, University de Strasbourg, France, 2011, helluy@math.unistra.fr
3 // licensed under the GNU Lesser General Public License see http://www.gnu.org/copyleft/lesser.html
4 // if you find this software usefull you can cite the following work in your reports or articles:
5 // Philippe HELLUY, A portable implementation of the radix sort algorithm in OpenCL, HAL 2011.
6 
7 // members of the class clfRadixSort
8 // see a description in the hpp...
9 
10 #include <fstream>
11 #include <Base/Common/BIASpragma.hh>
12 #include <OpenCLFramework/Algorithm/clfRadixSort.hh>
13 
14 #define TRANSPOSE 0 // transpose the initial vector (faster memory access)
15 ////////////////////////////////////////////////////////
16 
17 
18 // the following parameters are computed from the previous
19 #define _RADIX (1 << 5) // radix = 2^5
20 #define _PASS (30/5) // number of needed passes to sort the list
21 #define _HISTOSIZE (16 * 16 * _RADIX ) // size of the histogram
22 // maximal value of integers for the sort to be correct
23 #define _MAXINT (1 << (30-1))
24 
25 
26 using namespace std;
27 using namespace BIAS;
28 
29 clfRadixSort::clfRadixSort(clfContext *ctx, bool sharedGL, unsigned int device)
30 : clfAlgorithm(ctx, sharedGL, device), nkeys((1<<23))
31 {
33  // check some conditions
34  assert(30 % 5 == 0);
35  assert((1<<23) % (16 * 16) == 0);
36  assert( (16 * 16 * _RADIX) % 512 == 0);
37  assert(pow(float(2),(int) log2(float(16))) == 16);
38  assert(pow(float(2),(int) log2(float(16))) == 16);
39 
40  // init the timers
41  histo_time=0;
42  scan_time=0;
43  reorder_time=0;
45 
46  try {
47  programCL_->AddSource("/OpenCLFramework/Algorithm/cl/radixSort.cl");
48  programCL_->Build();
49 
50  programCL_->AddKernel("histogram");
51  programCL_->AddKernel("scanhistograms");
52  programCL_->AddKernel("pastehistograms");
53  programCL_->AddKernel("reorder");
54  programCL_->AddKernel("transpose");
55  programCL_->AddKernel("applypermutationHighToLow");
56  programCL_->AddKernel("applypermutation");
57 
59  d_inKeys->Allocate(sizeof(unsigned int)* (1<<23), false,false);
60 
62  d_outKeys->Allocate(sizeof(unsigned int)* (1<<23), false,false);
63 
65  d_inPermut->Allocate(sizeof(unsigned int)* (1<<23), false,false);
66 
68  d_outPermut->Allocate(sizeof(unsigned int)* (1<<23), false,false);
69 
70  // allocate the histogram on the GPU
72  d_Histograms->Allocate(sizeof(unsigned int)* _RADIX * 16 * 16, false, false);
73 
75  d_globsum->Allocate(sizeof(unsigned int)* 512, false, false);
76  // allocate the auxiliary histogram on GPU
77 
79  d_temp->Allocate(sizeof(unsigned int)* 512, false, false);
80  // temporary vector when the sum is not needed
81 
82  // we set here the fixed arguments of the OpenCL kernels
83  // the changing arguments are modified elsewhere in the class
84 
85  programCL_->KernelSetArgument("histogram", 1, *d_Histograms);
86  programCL_->KernelSetLocalArgument("histogram", 3, sizeof(unsigned int)*_RADIX*16);
87 
88  programCL_->KernelSetArgument("pastehistograms", 0, *d_Histograms);
89  programCL_->KernelSetArgument("pastehistograms", 1, *d_globsum);
90 
91  programCL_->KernelSetArgument("reorder", 2, *d_Histograms);
92  programCL_->KernelSetLocalArgument("reorder", 6, sizeof(unsigned int)* _RADIX * 16);
93 
94  int maxmemcache=max(512,16 * 16 * _RADIX / 512);
95  programCL_->KernelSetLocalArgument("scanhistograms", 1, sizeof(unsigned int)* maxmemcache);
96 
97  } catch (clfException &e) {
98  cout << e.GetDetailedString() << endl;
99  }
100 
101  // construction of the initial permutation
102  for(int i = 0; i < (1<<23); i++){
103  h_initialPermut[i] = i;
104  }
105 }
106 
107 void clfRadixSort::SetData(int num, float *data, int scale) {
108  Resize(num);
109  for (int i=0;i<num;i++) {
110  h_Keys[i] = (int)(data[i]*scale);
111  }
112  // construction of the initial permutation
113  for(int i = 0; i < num; i++){
114  h_Permut[i] = i;
115  }
116  Host2GPU();
117 }
118 
119 void clfRadixSort::SetData(int num, clfBuffer *data) {
120  Resize(num);
121  data->CopyBuffer(*d_inKeys, 0,0, num*sizeof(unsigned int));
122  d_inPermut->WriteToBuffer( h_initialPermut, 0, sizeof(unsigned int) * num);
123  context_->Finish();
124 }
125 
126 // resize the sorted vector
127 void clfRadixSort::Resize(int nn){
128 
129  assert(nn <= (1<<23));
130 
131  nkeys=nn;
132 
133  // length of the vector has to be divisible by (16 * 16)
134  int reste=nkeys % (16 * 16);
136  unsigned int pad[16 * 16];
137  for(int ii=0;ii<16 * 16;ii++){
138  pad[ii]=_MAXINT-(unsigned int)1;
139  }
140  if (reste !=0) {
141  nkeys_rounded=nkeys-reste+(16 * 16);
142  // pad the vector with big values
143  assert(nkeys_rounded <= (1<<23));
144  d_inKeys->WriteToBuffer( pad, sizeof(unsigned int)*nkeys, sizeof(unsigned int) *(16 * 16 - reste));
145  }
146 
147 }
148 
149 // transpose the list for faster memory access
150 void clfRadixSort::Transpose(int nbrow,int nbcol){
151 
152  int tilesize=32;
153 
154  // if the matrix is too small, avoid using local memory
155  if (nbrow%tilesize != 0) tilesize=1;
156  if (nbcol%tilesize != 0) tilesize=1;
157 
158  if (tilesize == 1) {
159  cout << "Warning, small list, avoiding cache..."<<endl;
160  }
161 
162  programCL_->KernelSetArgument("transpose", 0, *d_inKeys);
163  programCL_->KernelSetArgument("transpose", 1, *d_outKeys);
164  programCL_->KernelSetArgument("transpose", 2, nbcol);
165  programCL_->KernelSetArgument("transpose", 3, nbrow);
166  programCL_->KernelSetArgument("transpose", 4, *d_inPermut);
167  programCL_->KernelSetArgument("transpose", 5, *d_outPermut);
168  programCL_->KernelSetLocalArgument("transpose", 6, sizeof(unsigned int)*tilesize*tilesize);
169  programCL_->KernelSetLocalArgument("transpose", 7, sizeof(unsigned int)*tilesize*tilesize);
170  programCL_->KernelSetArgument("transpose", 8, tilesize);
171 
172  size_t global_work_size[2];
173  size_t local_work_size[2];
174 
175  assert(nbrow%tilesize == 0);
176  assert(nbcol%tilesize == 0);
177 
178  global_work_size[0]=nbrow/tilesize;
179  global_work_size[1]=nbcol;
180 
181  local_work_size[0]=1;
182  local_work_size[1]=tilesize;
183 
184  context_->RunOn2DRange(*programCL_, "transpose", global_work_size[0], global_work_size[1], local_work_size[0], local_work_size[1]);
185  context_->Finish();
186  //exchange the pointers
187 
188  // swap the old and new vectors of keys
189  // swap the old and new vectors of keys
191  d_temp=d_inKeys;
194 
195  // swap the old and new permutations
196  d_temp=d_inPermut;
199 
200 }
201 
202 // global sorting algorithm
203 
205 
206  assert(nkeys_rounded <= (1<<23));
207  assert(nkeys <= nkeys_rounded);
208  int nbcol=nkeys_rounded/(16 * 16);
209  int nbrow= 16 * 16;
210 
211  if (TRANSPOSE){
212  Transpose(nbrow,nbcol);
213  }
214 
215  for(unsigned int pass=0;pass<_PASS;pass++){
216  //for(unsigned int pass=0;pass<1;pass++){
217  Histogram(pass);
218  ScanHistogram();
219  Reorder(pass);
220  }
221 
222  if (TRANSPOSE){
223  Transpose(nbcol,nbrow);
224  }
225 
227 }
228 
229 void clfRadixSort::ApplyPermutation(clfBuffer *in, clfBuffer *out, int elemsize, int numElems, bool hightolow) {
230  string kname = "applypermutation";
231  if (hightolow) kname = "applypermutationHighToLow";
232  programCL_->KernelSetArgument(kname, 0, *in);
233  programCL_->KernelSetArgument(kname, 1, *out);
235  programCL_->KernelSetArgument(kname, 3, elemsize);
236  programCL_->KernelSetArgument(kname, 4, numElems);
237  context_->RunOn1DRange(*programCL_, kname, context_->DivUp(32, numElems), 32);
238 }
239 
240 // check the computation at the end
242 
243  cout << "Get the data from the GPU"<<endl;
244 
245  RecupGPU();
246 
247  cout << "Test order"<<endl;
248 
249  // first see if the final list is ordered
250  for(unsigned int i=0;i<nkeys-1;i++){
251  if (!(h_Keys[i] <= h_Keys[i+1])) {
252  cout <<"erreur tri "<< i<<" "<<h_Keys[i]<<" ,"<<i+1<<" "<<h_Keys[i+1]<<endl;
253  }
254  assert(h_Keys[i] <= h_Keys[i+1]);
255  }
256 
257  cout << "test OK !"<<endl;
258 
259 }
260 
262 {
263  delete d_inKeys;
264  delete d_outKeys;
265  delete d_Histograms;
266  delete d_globsum;
267  delete d_inPermut;
268  delete d_outPermut;
269 }
270 
271 
272 // get the data from the GPU
274  d_inKeys->ReadFromBuffer( h_Keys, 0, sizeof(unsigned int)*(1<<23));
275  d_inPermut->ReadFromBuffer( h_Permut, 0, sizeof(unsigned int)*(1<<23));
276  d_Histograms->ReadFromBuffer( h_Histograms, 0, sizeof(unsigned int) * _RADIX * 16 * 16);
277  d_globsum->ReadFromBuffer( h_globsum, 0, sizeof(unsigned int) * 512);
278  context_->Finish();
279 }
280 
281 // put the data to the GPU
283  d_inKeys->WriteToBuffer( h_Keys, 0, sizeof(unsigned int) * (1<<23));
284  d_inPermut->WriteToBuffer( h_Permut, 0, sizeof(unsigned int) * (1<<23));
285  context_->Finish();
286 }
287 
288 unsigned int *clfRadixSort::GetPermutation(unsigned int num) {
289  if (num == 0)
290  d_inPermut->ReadFromBuffer( h_Permut, 0, sizeof(unsigned int)*(1<<23));
291  else {
292  d_inPermut->ReadFromBuffer( h_Permut, 0, sizeof(unsigned int)*num);
293  }
294  return h_Permut;
295 }
296 
297 
298 // compute the histograms
299 void clfRadixSort::Histogram(unsigned int pass){
300 
301  size_t nblocitems=16;
302  size_t nbitems=16*16;
303 
304  assert(_RADIX == pow(float(2),int(5)));
305 
306  programCL_->KernelSetArgument("histogram", 0, *d_inKeys);
307  programCL_->KernelSetArgument("histogram", 2, (int)pass);
308 
309  assert( nkeys_rounded%(16 * 16) == 0);
310  assert( nkeys_rounded <= (1<<23));
311 
312  programCL_->KernelSetArgument("histogram", 4, (int)nkeys_rounded);
313 
314  context_->RunOn1DRange(*programCL_, "histogram", nbitems, nblocitems);
315  context_->Finish();
316 }
317 
318 // scan the histograms
320 
321  // numbers of processors for the local scan
322  // half the size of the local histograms
323  size_t nbitems=_RADIX* 16*16 / 2;
324 
325 
326  size_t nblocitems= nbitems/512 ;
327 
328 
329  // scan locally the histogram (the histogram is split into several
330  // parts that fit into the local memory)
331 
332  programCL_->KernelSetArgument("scanhistograms", 0, *d_Histograms);
333  programCL_->KernelSetArgument("scanhistograms", 2, *d_globsum);
334 
335  context_->RunOn1DRange(*programCL_, "scanhistograms", nbitems, nblocitems);
336  context_->Finish();
337 
338  programCL_->KernelSetArgument("scanhistograms", 0, *d_globsum);
339  programCL_->KernelSetArgument("scanhistograms", 2, *d_temp);
340 
341  nbitems= 512 / 2;
342  nblocitems=nbitems;
343  //nblocitems=1;
344 
345  context_->RunOn1DRange(*programCL_, "scanhistograms", nbitems, nblocitems);
346  context_->Finish();
347 
348  // loops again in order to paste together the local histograms
349  nbitems = _RADIX* 16*16/2;
350  nblocitems=nbitems/512;
351 
352  context_->RunOn1DRange(*programCL_, "pastehistograms", nbitems, nblocitems);
353  context_->Finish();
354 }
355 
356 // reorder the data from the scanned histogram
357 void clfRadixSort::Reorder(unsigned int pass){
358 
359  size_t nblocitems=16;
360  size_t nbitems=16*16;
361 
362  programCL_->KernelSetArgument("reorder", 0, *d_inKeys);
363  programCL_->KernelSetArgument("reorder", 1, *d_outKeys);
364  programCL_->KernelSetArgument("reorder", 3, (int)pass);
365  programCL_->KernelSetArgument("reorder", 4, *d_inPermut);
366  programCL_->KernelSetArgument("reorder", 5, *d_outPermut);
367  programCL_->KernelSetArgument("reorder", 7, (int)nkeys_rounded);
368 
369  assert(_RADIX == pow(float(2),int(5)));
370 
371  context_->RunOn1DRange(*programCL_, "reorder", nbitems, nblocitems);
372  context_->Finish();
373 
374  // swap the old and new vectors of keys
376  d_temp=d_inKeys;
379 
380  // swap the old and new permutations
381  d_temp=d_inPermut;
384 
385 }
unsigned int h_Histograms[5 *16 *16]
Definition: clfRadixSort.hh:68
clfProgram * programCL_
Definition: clfAlgorithm.hh:35
clfBuffer * d_temp
Definition: clfRadixSort.hh:75
clfBuffer * CreateBuffer()
create buffer object
Definition: clfContext.hh:79
std::string AddSource(std::string filename)
adds source code from a file
Definition: clfProgram.cpp:64
unsigned int h_Permut[(1<< 23)]
Definition: clfRadixSort.hh:86
void ApplyPermutation(clfBuffer *in, clfBuffer *out, int elemsize, int numElems, bool hightolow=false)
void KernelSetLocalArgument(std::string kernelname, unsigned int argnumber, int size)
Definition: clfProgram.cpp:244
OpenCL Buffer wrapper.
Definition: clfBuffer.hh:43
void Histogram(unsigned int pass)
void SetData(int num, float *data, int scale=1)
clfBuffer * d_Histograms
Definition: clfRadixSort.hh:69
void Reorder(unsigned int pass)
clfBuffer * d_inPermut
Definition: clfRadixSort.hh:88
void Build(int deviceNr=0, std::string options="")
builds the sources added by AddSource and AddSourceFromString
Definition: clfProgram.cpp:115
const std::string & GetDetailedString() const
detailed combination of all info available
clfBuffer * d_outPermut
Definition: clfRadixSort.hh:89
OpenCL Context wrapper.
Definition: clfContext.hh:49
void ReadFromBuffer(void *data, unsigned int offset=0, unsigned int size=0)
read from buffer object to host memory
Definition: clfBuffer.cpp:128
void Transpose(int nbrow, int nbcol)
clf Exception wrapper, is thrown in case of most clf errors
Definition: clfException.hh:48
void Finish()
force finishing the command queue
Definition: clfContext.cpp:289
void Allocate(unsigned int bufsize, bool readonly=false, bool writeonly=false, void *hostptr=NULL, bool copy=false)
Allocation of a memory buffer A memory buffer can be created on device or host, it can be initialized...
Definition: clfBuffer.cpp:45
unsigned int h_globsum[512]
Definition: clfRadixSort.hh:73
void KernelSetArgument(std::string kernelname, unsigned int argnumber, clfBuffer &buffer)
set kernel argument
Definition: clfProgram.cpp:177
void Resize(int nn)
void AddKernel(std::string kernelname)
adds a kernel to the program.
Definition: clfProgram.cpp:158
unsigned int nkeys_rounded
Definition: clfRadixSort.hh:79
void RunOn2DRange(clfProgram &program, std::string kernelname, unsigned int globalrangeX, unsigned int globalrangeY, unsigned int localrangeX=0, unsigned int localrangeY=0)
run a kernel on a 2D memory range
Definition: clfContext.cpp:234
unsigned int * GetPermutation(unsigned int num=0)
clfBuffer * d_outKeys
Definition: clfRadixSort.hh:83
clfContext * context_
Definition: clfAlgorithm.hh:34
void CopyBuffer(clfBuffer &outputbuffer, unsigned int srcoffset=0, unsigned int dstoffset=0, unsigned int size=0)
copy from one buffer to another
Definition: clfBuffer.cpp:139
void WriteToBuffer(const void *data, unsigned int offset=0, unsigned int size=0)
write from host memory to buffer object
Definition: clfBuffer.cpp:117
unsigned int nkeys
Definition: clfRadixSort.hh:78
unsigned int h_initialPermut[(1<< 23)]
Definition: clfRadixSort.hh:87
void RunOn1DRange(clfProgram &program, std::string kernelname, unsigned int globalrange, unsigned int localrange=0)
run a kernel on a 1D memory range
Definition: clfContext.cpp:220
void ScanHistogram(void)
clfBuffer * d_globsum
Definition: clfRadixSort.hh:74
clfBuffer * d_inKeys
Definition: clfRadixSort.hh:82
unsigned int h_Keys[(1<< 23)]
Definition: clfRadixSort.hh:81
int DivUp(const int mod, int val)
Definition: clfContext.cpp:416