/*
 * Software License Agreement (BSD License)
 *
 * Point Cloud Library (PCL) - www.pointclouds.org
 * Copyright (c) 2011, Willow Garage, Inc.
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials provided
 *    with the distribution.
 *  * Neither the name of Willow Garage, Inc. nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifndef PCL_GPU_KINFU_CUDA_UTILS_HPP_
#define PCL_GPU_KINFU_CUDA_UTILS_HPP_

#include <cuda.h>
#include <limits.h>   // for SHRT_MAX, used by numeric_limits<short>

namespace pcl
{
  namespace device
  {
    template <class T>
    __device__ __host__ __forceinline__ void swap (T& a, T& b)
    {
      T c(a); a = b; b = c;
    }

    template<typename T> struct numeric_limits;

    template<> struct numeric_limits<float>
    {
      __device__ __forceinline__ static float
      quiet_NaN () { return __int_as_float (0x7fffffff); /*CUDART_NAN_F*/ }
      __device__ __forceinline__ static float
      epsilon () { return 1.192092896e-07f; /*FLT_EPSILON*/ }

      __device__ __forceinline__ static float
      min () { return 1.175494351e-38f; /*FLT_MIN*/ }
      __device__ __forceinline__ static float
      max () { return 3.402823466e+38f; /*FLT_MAX*/ }
    };

    template<> struct numeric_limits<short>
    {
      __device__ __forceinline__ static short
      max () { return SHRT_MAX; }
    };

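    // Minimal float3 helpers (dot, cross, norm, arithmetic operators) used
    // throughout the KinFu device code; CUDA's built-in vector types do not
    // provide these operators themselves.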
    __device__ __forceinline__ float
    dot (const float3& v1, const float3& v2)
    {
      return v1.x * v2.x + v1.y * v2.y + v1.z * v2.z;
    }

    __device__ __forceinline__ float3&
    operator+= (float3& vec, const float& v)
    {
      vec.x += v; vec.y += v; vec.z += v; return vec;
    }

    __device__ __forceinline__ float3
    operator+ (const float3& v1, const float3& v2)
    {
      return make_float3 (v1.x + v2.x, v1.y + v2.y, v1.z + v2.z);
    }

    __device__ __forceinline__ float3&
    operator*= (float3& vec, const float& v)
    {
      vec.x *= v; vec.y *= v; vec.z *= v; return vec;
    }

    __device__ __forceinline__ float3
    operator- (const float3& v1, const float3& v2)
    {
      return make_float3 (v1.x - v2.x, v1.y - v2.y, v1.z - v2.z);
    }

    __device__ __forceinline__ float3
    operator* (const float3& v1, const float& v)
    {
      return make_float3 (v1.x * v, v1.y * v, v1.z * v);
    }

    __device__ __forceinline__ float
    norm (const float3& v)
    {
      return sqrtf (dot (v, v));
    }

    __device__ __forceinline__ float3
    normalized (const float3& v)
    {
      return v * rsqrtf (dot (v, v));
    }

    __device__ __host__ __forceinline__ float3
    cross (const float3& v1, const float3& v2)
    {
      return make_float3 (v1.y * v2.z - v1.z * v2.y,
                          v1.z * v2.x - v1.x * v2.z,
                          v1.x * v2.y - v1.y * v2.x);
    }

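    /** \brief Computes the roots of x^2 - b*x + c = 0, i.e. the two remaining
      * eigenvalues once a zero root has been factored out of the cubic
      * characteristic polynomial. roots.x is pinned to 0; the quadratic roots
      * are returned as roots.y <= roots.z. */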
    __device__ __forceinline__ void
    computeRoots2 (const float& b, const float& c, float3& roots)
    {
      roots.x = 0.f;
      float d = b * b - 4.f * c;
      if (d < 0.f)  // no real roots; should not happen for a symmetric matrix
        d = 0.f;

      float sd = sqrtf (d);

      roots.z = 0.5f * (b + sd);
      roots.y = 0.5f * (b - sd);
    }

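    /** \brief Computes the three real roots of x^3 - c2*x^2 + c1*x - c0 = 0,
      * the characteristic polynomial of a symmetric 3x3 matrix, using the
      * trigonometric (Cardano) closed form. The roots are returned in
      * increasing order: roots.x <= roots.y <= roots.z. */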
    __device__ __forceinline__ void
    computeRoots3 (float c0, float c1, float c2, float3& roots)
    {
      if (fabsf (c0) < numeric_limits<float>::epsilon ())  // one root is 0 -> quadratic equation
      {
        computeRoots2 (c2, c1, roots);
      }
      else
      {
        const float s_inv3 = 1.f / 3.f;
        const float s_sqrt3 = sqrtf (3.f);
        // Construct the parameters used in classifying the roots of the equation
        // and in solving the equation for the roots in closed form.
        float c2_over_3 = c2 * s_inv3;
        float a_over_3 = (c1 - c2 * c2_over_3) * s_inv3;
        if (a_over_3 > 0.f)
          a_over_3 = 0.f;

        float half_b = 0.5f * (c0 + c2_over_3 * (2.f * c2_over_3 * c2_over_3 - c1));

        float q = half_b * half_b + a_over_3 * a_over_3 * a_over_3;
        if (q > 0.f)
          q = 0.f;

        // Compute the eigenvalues by solving for the roots of the polynomial.
        float rho = sqrtf (-a_over_3);
        float theta = atan2f (sqrtf (-q), half_b) * s_inv3;
        float cos_theta = __cosf (theta);
        float sin_theta = __sinf (theta);
        roots.x = c2_over_3 + 2.f * rho * cos_theta;
        roots.y = c2_over_3 - rho * (cos_theta + s_sqrt3 * sin_theta);
        roots.z = c2_over_3 - rho * (cos_theta - s_sqrt3 * sin_theta);

        // Sort in increasing order.
        if (roots.x >= roots.y)
          swap (roots.x, roots.y);

        if (roots.y >= roots.z)
        {
          swap (roots.y, roots.z);

          if (roots.x >= roots.y)
            swap (roots.x, roots.y);
        }
        // Eigenvalues of a symmetric positive semi-definite matrix cannot be
        // negative: set the smallest to zero and recompute the other two.
        if (roots.x <= 0)
          computeRoots2 (c2, c1, roots);
      }
    }

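    /** \brief Eigen decomposition of a symmetric, positive semi-definite 3x3
      * matrix (e.g. a covariance matrix), passed as the packed upper triangle
      * {m00, m01, m02, m11, m12, m22}. compute() returns the eigenvalues in
      * increasing order in evals and the matching eigenvectors as the rows of
      * evecs; note that the input package is scaled in place. A hypothetical
      * usage sketch (the variable names are illustrative, not from this file):
      * \code
      * float cov[6];                     // packed covariance, filled elsewhere
      * Eigen33 eigen33 (cov);
      * Eigen33::Mat33 tmp, vec_tmp, evecs;
      * float3 evals;
      * eigen33.compute (tmp, vec_tmp, evecs, evals);
      * float3 normal = evecs[0];         // eigenvector of the smallest eigenvalue
      * \endcode
      */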
    struct Eigen33
    {
    public:
      template<int Rows>
      struct MiniMat
      {
        float3 data[Rows];
        __device__ __host__ __forceinline__ float3& operator[] (int i) { return data[i]; }
        __device__ __host__ __forceinline__ const float3& operator[] (int i) const { return data[i]; }
      };
      typedef MiniMat<3> Mat33;
      typedef MiniMat<4> Mat43;

      static __forceinline__ __device__ float3
      unitOrthogonal (const float3& src)
      {
        float3 perp;
        /* Compute the cross product of *this with a vector that is not too
         * close to being collinear to *this.
         */

        /* Unless the x and y coords are both close to zero, we can simply
         * take (-y, x, 0) and normalize it.
         */
        if (!isMuchSmallerThan (src.x, src.z) || !isMuchSmallerThan (src.y, src.z))
        {
          float invnm = rsqrtf (src.x * src.x + src.y * src.y);
          perp.x = -src.y * invnm;
          perp.y =  src.x * invnm;
          perp.z = 0.0f;
        }
        /* If both x and y are close to zero, then the vector is close to the
         * z-axis, so it is far from collinear to the x-axis, for instance.
         * So we take the cross product with (1, 0, 0) and normalize it.
         */
        else
        {
          float invnm = rsqrtf (src.z * src.z + src.y * src.y);
          perp.x = 0.0f;
          perp.y = -src.z * invnm;
          perp.z =  src.y * invnm;
        }

        return perp;
      }

      __device__ __forceinline__
      Eigen33 (volatile float* mat_pkg_arg) : mat_pkg (mat_pkg_arg) {}

      __device__ __forceinline__ void
      compute (Mat33& tmp, Mat33& vec_tmp, Mat33& evecs, float3& evals)
      {
        // Scale the matrix so its entries are in [-1,1]. The scaling is applied
        // only when at least one matrix entry has magnitude larger than 1.
        float max01 = fmaxf (fabsf (mat_pkg[0]), fabsf (mat_pkg[1]));
        float max23 = fmaxf (fabsf (mat_pkg[2]), fabsf (mat_pkg[3]));
        float max45 = fmaxf (fabsf (mat_pkg[4]), fabsf (mat_pkg[5]));
        float m0123 = fmaxf (max01, max23);
        float scale = fmaxf (max45, m0123);

        if (scale <= numeric_limits<float>::min ())
          scale = 1.f;

        mat_pkg[0] /= scale;
        mat_pkg[1] /= scale;
        mat_pkg[2] /= scale;
        mat_pkg[3] /= scale;
        mat_pkg[4] /= scale;
        mat_pkg[5] /= scale;

        // The characteristic equation is x^3 - c2*x^2 + c1*x - c0 = 0. The
        // eigenvalues are the roots of this equation, all guaranteed to be
        // real-valued, because the matrix is symmetric.
        float c0 = m00 () * m11 () * m22 ()
            + 2.f * m01 () * m02 () * m12 ()
            - m00 () * m12 () * m12 ()
            - m11 () * m02 () * m02 ()
            - m22 () * m01 () * m01 ();
        float c1 = m00 () * m11 () - m01 () * m01 ()
            + m00 () * m22 () - m02 () * m02 ()
            + m11 () * m22 () - m12 () * m12 ();
        float c2 = m00 () + m11 () + m22 ();

        computeRoots3 (c0, c1, c2, evals);

        if (evals.z - evals.x <= numeric_limits<float>::epsilon ())
        {
          // All three eigenvalues are (numerically) equal.
          evecs[0] = make_float3 (1.f, 0.f, 0.f);
          evecs[1] = make_float3 (0.f, 1.f, 0.f);
          evecs[2] = make_float3 (0.f, 0.f, 1.f);
        }
        else if (evals.y - evals.x <= numeric_limits<float>::epsilon ())
        {
          // First and second eigenvalues are equal.
          tmp[0] = row0 (); tmp[1] = row1 (); tmp[2] = row2 ();
          tmp[0].x -= evals.z; tmp[1].y -= evals.z; tmp[2].z -= evals.z;

          vec_tmp[0] = cross (tmp[0], tmp[1]);
          vec_tmp[1] = cross (tmp[0], tmp[2]);
          vec_tmp[2] = cross (tmp[1], tmp[2]);

          float len1 = dot (vec_tmp[0], vec_tmp[0]);
          float len2 = dot (vec_tmp[1], vec_tmp[1]);
          float len3 = dot (vec_tmp[2], vec_tmp[2]);

          if (len1 >= len2 && len1 >= len3)
            evecs[2] = vec_tmp[0] * rsqrtf (len1);
          else if (len2 >= len1 && len2 >= len3)
            evecs[2] = vec_tmp[1] * rsqrtf (len2);
          else
            evecs[2] = vec_tmp[2] * rsqrtf (len3);

          evecs[1] = unitOrthogonal (evecs[2]);
          evecs[0] = cross (evecs[1], evecs[2]);
        }
        else if (evals.z - evals.y <= numeric_limits<float>::epsilon ())
        {
          // Second and third eigenvalues are equal.
          tmp[0] = row0 (); tmp[1] = row1 (); tmp[2] = row2 ();
          tmp[0].x -= evals.x; tmp[1].y -= evals.x; tmp[2].z -= evals.x;

          vec_tmp[0] = cross (tmp[0], tmp[1]);
          vec_tmp[1] = cross (tmp[0], tmp[2]);
          vec_tmp[2] = cross (tmp[1], tmp[2]);

          float len1 = dot (vec_tmp[0], vec_tmp[0]);
          float len2 = dot (vec_tmp[1], vec_tmp[1]);
          float len3 = dot (vec_tmp[2], vec_tmp[2]);

          if (len1 >= len2 && len1 >= len3)
            evecs[0] = vec_tmp[0] * rsqrtf (len1);
          else if (len2 >= len1 && len2 >= len3)
            evecs[0] = vec_tmp[1] * rsqrtf (len2);
          else
            evecs[0] = vec_tmp[2] * rsqrtf (len3);

          evecs[1] = unitOrthogonal (evecs[0]);
          evecs[2] = cross (evecs[0], evecs[1]);
        }
        else
        {
          // All three eigenvalues are distinct.
          tmp[0] = row0 (); tmp[1] = row1 (); tmp[2] = row2 ();
          tmp[0].x -= evals.z; tmp[1].y -= evals.z; tmp[2].z -= evals.z;

          vec_tmp[0] = cross (tmp[0], tmp[1]);
          vec_tmp[1] = cross (tmp[0], tmp[2]);
          vec_tmp[2] = cross (tmp[1], tmp[2]);

          float len1 = dot (vec_tmp[0], vec_tmp[0]);
          float len2 = dot (vec_tmp[1], vec_tmp[1]);
          float len3 = dot (vec_tmp[2], vec_tmp[2]);

          float mmax[3];

          unsigned int min_el = 2;
          unsigned int max_el = 2;
          if (len1 >= len2 && len1 >= len3)
          {
            mmax[2] = len1;
            evecs[2] = vec_tmp[0] * rsqrtf (len1);
          }
          else if (len2 >= len1 && len2 >= len3)
          {
            mmax[2] = len2;
            evecs[2] = vec_tmp[1] * rsqrtf (len2);
          }
          else
          {
            mmax[2] = len3;
            evecs[2] = vec_tmp[2] * rsqrtf (len3);
          }

          tmp[0] = row0 (); tmp[1] = row1 (); tmp[2] = row2 ();
          tmp[0].x -= evals.y; tmp[1].y -= evals.y; tmp[2].z -= evals.y;

          vec_tmp[0] = cross (tmp[0], tmp[1]);
          vec_tmp[1] = cross (tmp[0], tmp[2]);
          vec_tmp[2] = cross (tmp[1], tmp[2]);

          len1 = dot (vec_tmp[0], vec_tmp[0]);
          len2 = dot (vec_tmp[1], vec_tmp[1]);
          len3 = dot (vec_tmp[2], vec_tmp[2]);

          if (len1 >= len2 && len1 >= len3)
          {
            mmax[1] = len1;
            evecs[1] = vec_tmp[0] * rsqrtf (len1);
            min_el = len1 <= mmax[min_el] ? 1 : min_el;
            max_el = len1 >  mmax[max_el] ? 1 : max_el;
          }
          else if (len2 >= len1 && len2 >= len3)
          {
            mmax[1] = len2;
            evecs[1] = vec_tmp[1] * rsqrtf (len2);
            min_el = len2 <= mmax[min_el] ? 1 : min_el;
            max_el = len2 >  mmax[max_el] ? 1 : max_el;
          }
          else
          {
            mmax[1] = len3;
            evecs[1] = vec_tmp[2] * rsqrtf (len3);
            min_el = len3 <= mmax[min_el] ? 1 : min_el;
            max_el = len3 >  mmax[max_el] ? 1 : max_el;
          }

          tmp[0] = row0 (); tmp[1] = row1 (); tmp[2] = row2 ();
          tmp[0].x -= evals.x; tmp[1].y -= evals.x; tmp[2].z -= evals.x;

          vec_tmp[0] = cross (tmp[0], tmp[1]);
          vec_tmp[1] = cross (tmp[0], tmp[2]);
          vec_tmp[2] = cross (tmp[1], tmp[2]);

          len1 = dot (vec_tmp[0], vec_tmp[0]);
          len2 = dot (vec_tmp[1], vec_tmp[1]);
          len3 = dot (vec_tmp[2], vec_tmp[2]);

          if (len1 >= len2 && len1 >= len3)
          {
            mmax[0] = len1;
            evecs[0] = vec_tmp[0] * rsqrtf (len1);
            min_el = len1 <= mmax[min_el] ? 0 : min_el;
            max_el = len1 >  mmax[max_el] ? 0 : max_el;
          }
          else if (len2 >= len1 && len2 >= len3)
          {
            mmax[0] = len2;
            evecs[0] = vec_tmp[1] * rsqrtf (len2);
            min_el = len2 <= mmax[min_el] ? 0 : min_el;
            max_el = len2 >  mmax[max_el] ? 0 : max_el;
          }
          else
          {
            mmax[0] = len3;
            evecs[0] = vec_tmp[2] * rsqrtf (len3);
            min_el = len3 <= mmax[min_el] ? 0 : min_el;
            max_el = len3 >  mmax[max_el] ? 0 : max_el;
          }

          unsigned mid_el = 3 - min_el - max_el;
          evecs[min_el] = normalized (cross (evecs[(min_el + 1) % 3], evecs[(min_el + 2) % 3]));
          evecs[mid_el] = normalized (cross (evecs[(mid_el + 1) % 3], evecs[(mid_el + 2) % 3]));
        }
        // Rescale back to the original size.
        evals *= scale;
      }
    private:
      volatile float* mat_pkg;

      __device__ __forceinline__ float m00 () const { return mat_pkg[0]; }
      __device__ __forceinline__ float m01 () const { return mat_pkg[1]; }
      __device__ __forceinline__ float m02 () const { return mat_pkg[2]; }
      __device__ __forceinline__ float m10 () const { return mat_pkg[1]; }
      __device__ __forceinline__ float m11 () const { return mat_pkg[3]; }
      __device__ __forceinline__ float m12 () const { return mat_pkg[4]; }
      __device__ __forceinline__ float m20 () const { return mat_pkg[2]; }
      __device__ __forceinline__ float m21 () const { return mat_pkg[4]; }
      __device__ __forceinline__ float m22 () const { return mat_pkg[5]; }

      __device__ __forceinline__ float3 row0 () const { return make_float3 (m00 (), m01 (), m02 ()); }
      __device__ __forceinline__ float3 row1 () const { return make_float3 (m10 (), m11 (), m12 ()); }
      __device__ __forceinline__ float3 row2 () const { return make_float3 (m20 (), m21 (), m22 ()); }

      __device__ __forceinline__ static bool isMuchSmallerThan (float x, float y)
      {
        // copied from <eigen>/include/Eigen/src/Core/NumTraits.h
        const float prec_sqr = numeric_limits<float>::epsilon () * numeric_limits<float>::epsilon ();
        return x * x <= prec_sqr * y * y;
      }
    };

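    /** \brief Thread-block helpers: flat thread indexing and shared-memory
      * tree reductions over the whole CTA. */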
    struct Block
    {
      static __device__ __forceinline__ unsigned int stride ()
      {
        return blockDim.x * blockDim.y * blockDim.z;
      }

      static __device__ __forceinline__ int
      flattenedThreadId ()
      {
        return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
      }

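      /** \brief In-place binary-tree reduction over a shared-memory buffer of
        * CTA_SIZE elements; the result ends up in buffer[0]. CTA_SIZE must be
        * a power of two equal to the block size, and the buffer must be fully
        * populated and synchronized before the call. The last 32 steps run
        * without __syncthreads() and rely on implicit warp-synchronous
        * execution over the volatile buffer, which independent thread
        * scheduling on Volta and newer no longer guarantees. A hypothetical
        * usage sketch (the names here are illustrative, not from this file):
        * \code
        * __shared__ volatile float smem[256];
        * smem[Block::flattenedThreadId ()] = partial;  // per-thread value
        * __syncthreads ();
        * Block::reduce<256>(smem, AddOp ());           // AddOp: any T(T,T) functor
        * // smem[0] now holds the block-wide result; sync before other threads read it
        * \endcode
        */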
      template<int CTA_SIZE, typename T, class BinOp>
      static __device__ __forceinline__ void reduce (volatile T* buffer, BinOp op)
      {
        int tid = flattenedThreadId ();
        T val = buffer[tid];

        if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op (val, buffer[tid + 512]); __syncthreads (); }
        if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op (val, buffer[tid + 256]); __syncthreads (); }
        if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op (val, buffer[tid + 128]); __syncthreads (); }
        if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op (val, buffer[tid +  64]); __syncthreads (); }

        if (tid < 32)
        {
          if (CTA_SIZE >= 64) { buffer[tid] = val = op (val, buffer[tid + 32]); }
          if (CTA_SIZE >= 32) { buffer[tid] = val = op (val, buffer[tid + 16]); }
          if (CTA_SIZE >= 16) { buffer[tid] = val = op (val, buffer[tid +  8]); }
          if (CTA_SIZE >=  8) { buffer[tid] = val = op (val, buffer[tid +  4]); }
          if (CTA_SIZE >=  4) { buffer[tid] = val = op (val, buffer[tid +  2]); }
          if (CTA_SIZE >=  2) { buffer[tid] = val = op (val, buffer[tid +  1]); }
        }
      }

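      /** \brief Same tree reduction, but seeds the buffer from init, performs
        * a final barrier, and returns buffer[0] to every thread. */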
      template<int CTA_SIZE, typename T, class BinOp>
      static __device__ __forceinline__ T reduce (volatile T* buffer, T init, BinOp op)
      {
        int tid = flattenedThreadId ();
        T val = buffer[tid] = init;
        __syncthreads ();

        if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op (val, buffer[tid + 512]); __syncthreads (); }
        if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op (val, buffer[tid + 256]); __syncthreads (); }
        if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op (val, buffer[tid + 128]); __syncthreads (); }
        if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op (val, buffer[tid +  64]); __syncthreads (); }

        if (tid < 32)
        {
          if (CTA_SIZE >= 64) { buffer[tid] = val = op (val, buffer[tid + 32]); }
          if (CTA_SIZE >= 32) { buffer[tid] = val = op (val, buffer[tid + 16]); }
          if (CTA_SIZE >= 16) { buffer[tid] = val = op (val, buffer[tid +  8]); }
          if (CTA_SIZE >=  8) { buffer[tid] = val = op (val, buffer[tid +  4]); }
          if (CTA_SIZE >=  4) { buffer[tid] = val = op (val, buffer[tid +  2]); }
          if (CTA_SIZE >=  2) { buffer[tid] = val = op (val, buffer[tid +  1]); }
        }
        __syncthreads ();
        return buffer[0];
      }
    };

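    /** \brief Warp-level helpers. WARP_SIZE is hard-coded to 32
      * (LOG_WARP_SIZE = 5), which holds for all CUDA architectures to date. */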
    struct Warp
    {
      enum
      {
        LOG_WARP_SIZE = 5,
        WARP_SIZE = 1 << LOG_WARP_SIZE,
        STRIDE = WARP_SIZE
      };

      /** \brief Returns the warp lane ID of the calling thread. */
      static __device__ __forceinline__ unsigned int
      laneId ()
      {
        unsigned int ret;
        asm ("mov.u32 %0, %laneid;" : "=r"(ret));
        return ret;
      }

      /** \brief Returns the index of the calling thread's warp within the block. */
      static __device__ __forceinline__ unsigned int id ()
      {
        int tid = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
        return tid >> LOG_WARP_SIZE;
      }

      /** \brief Returns a mask with the bits of all lanes below the calling lane set. */
      static __device__ __forceinline__ int
      laneMaskLt ()
      {
#if (__CUDA_ARCH__ >= 200)
        unsigned int ret;
        asm ("mov.u32 %0, %lanemask_lt;" : "=r"(ret));
        return ret;
#else
        return 0xFFFFFFFF >> (32 - laneId ());
#endif
      }

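      /** \brief Given a warp ballot mask, returns the number of lower-numbered
        * lanes whose predicate bit is set, i.e. an exclusive prefix count that
        * can drive intra-warp stream compaction. A hypothetical usage sketch
        * (the names here are illustrative, not from this file):
        * \code
        * int mask = Emulation::Ballot (keep, cta_buffer);
        * if (keep)
        *   out[base + Warp::binaryExclScan (mask)] = value;
        * \endcode
        */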
      static __device__ __forceinline__ int binaryExclScan (int ballot_mask)
      {
        return __popc (Warp::laneMaskLt () & ballot_mask);
      }
    };

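    /** \brief Compatibility wrappers for the warp-vote intrinsics: CUDA 9+
      * uses the *_sync variants, sm_20 and newer use the legacy intrinsics,
      * and older devices fall back to an explicit shared-memory warp
      * reduction. */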
    struct Emulation
    {
      static __device__ __forceinline__ int
      warp_reduce (volatile int* ptr, const unsigned int tid)
      {
        const unsigned int lane = tid & 31;  // index of thread in warp (0..31)

        if (lane < 16)
        {
          int partial = ptr[tid];

          ptr[tid] = partial = partial + ptr[tid + 16];
          ptr[tid] = partial = partial + ptr[tid +  8];
          ptr[tid] = partial = partial + ptr[tid +  4];
          ptr[tid] = partial = partial + ptr[tid +  2];
          ptr[tid] = partial = partial + ptr[tid +  1];
        }
        return ptr[tid - lane];
      }

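      /** \brief Warp-wide ballot: returns a mask with one bit per lane whose
        * predicate is non-zero. cta_buffer is scratch shared memory, used only
        * by the pre-sm_20 fallback. */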
      static __forceinline__ __device__ int
      Ballot (int predicate, volatile int* cta_buffer)
      {
#if CUDA_VERSION >= 9000
        (void)cta_buffer;
        return __ballot_sync (__activemask (), predicate);
#elif __CUDA_ARCH__ >= 200
        (void)cta_buffer;
        return __ballot (predicate);
#else
        int tid = Block::flattenedThreadId ();
        cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
        return warp_reduce (cta_buffer, tid);
#endif
      }

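      /** \brief Warp-wide vote: true iff the predicate is non-zero on all 32
        * lanes of the calling warp. */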
      static __forceinline__ __device__ bool
      All (int predicate, volatile int* cta_buffer)
      {
#if CUDA_VERSION >= 9000
        (void)cta_buffer;
        return __all_sync (__activemask (), predicate);
#elif __CUDA_ARCH__ >= 200
        (void)cta_buffer;
        return __all (predicate);
#else
        int tid = Block::flattenedThreadId ();
        cta_buffer[tid] = predicate ? 1 : 0;
        return warp_reduce (cta_buffer, tid) == 32;
#endif
      }
    };
  }
}

#endif /* PCL_GPU_KINFU_CUDA_UTILS_HPP_ */