Point Cloud Library (PCL)  1.9.1-dev
utils.hpp
1 /*
2  * Software License Agreement (BSD License)
3  *
4  * Point Cloud Library (PCL) - www.pointclouds.org
5  * Copyright (c) 2011, Willow Garage, Inc.
6  *
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * * Redistributions of source code must retain the above copyright
14  * notice, this list of conditions and the following disclaimer.
15  * * Redistributions in binary form must reproduce the above
16  * copyright notice, this list of conditions and the following
17  * disclaimer in the documentation and/or other materials provided
18  * with the distribution.
19  * * Neither the name of Willow Garage, Inc. nor the names of its
20  * contributors may be used to endorse or promote products derived
21  * from this software without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34  * POSSIBILITY OF SUCH DAMAGE.
35  *
36  */
37 
38 
39 #ifndef PCL_GPU_KINFU_CUDA_UTILS_HPP_
40 #define PCL_GPU_KINFU_CUDA_UTILS_HPP_
41 
42 #include <limits>
43 
44 #include <cuda.h>
45 
46 namespace pcl
47 {
48  namespace device
49  {
50  namespace kinfuLS
51  {
template <class T>
__device__ __host__ __forceinline__ void swap ( T& a, T& b )
{
  // Classic three-step exchange through a temporary copy.
  T tmp = a;
  a = b;
  b = tmp;
}
57 
/** \brief Scalar (dot) product of two 3-vectors. */
__device__ __forceinline__ float
dot(const float3& v1, const float3& v2)
{
  // Accumulate component-wise products left to right.
  float s = v1.x * v2.x;
  s += v1.y * v2.y;
  s += v1.z * v2.z;
  return s;
}
63 
/** \brief Adds the scalar \a v to every component of \a vec, in place. */
__device__ __forceinline__ float3&
operator+=(float3& vec, const float& v)
{
  vec.x += v;
  vec.y += v;
  vec.z += v;
  return vec;
}
69 
/** \brief Component-wise sum of two 3-vectors. */
__device__ __forceinline__ float3
operator+(const float3& v1, const float3& v2)
{
  float3 r;
  r.x = v1.x + v2.x;
  r.y = v1.y + v2.y;
  r.z = v1.z + v2.z;
  return r;
}
75 
/** \brief Scales every component of \a vec by the scalar \a v, in place. */
__device__ __forceinline__ float3&
operator*=(float3& vec, const float& v)
{
  vec.x *= v;
  vec.y *= v;
  vec.z *= v;
  return vec;
}
81 
/** \brief Component-wise difference v1 - v2. */
__device__ __forceinline__ float3
operator-(const float3& v1, const float3& v2)
{
  float3 r;
  r.x = v1.x - v2.x;
  r.y = v1.y - v2.y;
  r.z = v1.z - v2.z;
  return r;
}
87 
/** \brief Returns \a v1 with every component scaled by the scalar \a v. */
__device__ __forceinline__ float3
operator*(const float3& v1, const float& v)
{
  // Scalar multiplication is commutative, so the operand order is free.
  return make_float3(v * v1.x, v * v1.y, v * v1.z);
}
93 
/** \brief Euclidean (L2) length of \a v. */
__device__ __forceinline__ float
norm(const float3& v)
{
  // Use the explicit single-precision intrinsic, matching the sqrtf
  // usage elsewhere in this file, instead of the bare `sqrt` name whose
  // resolution depends on which overloads are in scope.
  return sqrtf(dot(v, v));
}
99 
/** \brief Returns \a v scaled to unit length.
  * \note No guard for the zero vector: dot(v,v) == 0 produces non-finite
  * components, same as the original behavior. */
__device__ __forceinline__ float3
normalized(const float3& v)
{
  // Explicit single-precision rsqrtf, consistent with unitOrthogonal()
  // and Eigen33::compute() below, rather than the bare `rsqrt` name.
  return v * rsqrtf(dot(v, v));
}
105 
/** \brief Right-handed cross product v1 x v2. */
__device__ __host__ __forceinline__ float3
cross(const float3& v1, const float3& v2)
{
  float3 r;
  r.x = v1.y * v2.z - v1.z * v2.y;
  r.y = v1.z * v2.x - v1.x * v2.z;
  r.z = v1.x * v2.y - v1.y * v2.x;
  return r;
}
111 
/** \brief Solves x^2 - b*x + c = 0 (sign convention: \a b is the SUM of the
  * roots, \a c their product) and stores the roots in increasing order in
  * roots.y / roots.z. roots.x is fixed to 0 because this helper is only
  * invoked for a cubic whose constant term vanished (one root is zero). */
__device__ __forceinline__ void computeRoots2(const float& b, const float& c, float3& roots)
{
  roots.x = 0.f;               // the known zero root (see above)
  float d = b * b - 4.f * c;   // discriminant
  if (d < 0.f) // no real roots!!!! THIS SHOULD NOT HAPPEN!
    d = 0.f;   // clamp so sqrtf below stays defined under rounding error

  float sd = sqrtf(d);

  roots.z = 0.5f * (b + sd);   // larger root
  roots.y = 0.5f * (b - sd);   // smaller root
}
124 
/** \brief Closed-form real roots of x^3 - c2*x^2 + c1*x - c0 = 0, stored in
  * \a roots in increasing order (roots.x <= roots.y <= roots.z).
  * Used for the characteristic polynomial of a symmetric 3x3 matrix, so all
  * three roots are real. */
__device__ __forceinline__ void
computeRoots3(float c0, float c1, float c2, float3& roots)
{
  if ( std::abs(c0) < std::numeric_limits<float>::epsilon())// one root is 0 -> quadratic equation
  {
    computeRoots2 (c2, c1, roots);
  }
  else
  {
    const float s_inv3 = 1.f/3.f;
    const float s_sqrt3 = sqrtf(3.f);
    // Construct the parameters used in classifying the roots of the equation
    // and in solving the equation for the roots in closed form.
    float c2_over_3 = c2 * s_inv3;
    float a_over_3 = (c1 - c2*c2_over_3)*s_inv3;
    if (a_over_3 > 0.f)
      a_over_3 = 0.f;  // clamp: keeps sqrtf(-a_over_3) below well-defined

    float half_b = 0.5f * (c0 + c2_over_3 * (2.f * c2_over_3 * c2_over_3 - c1));

    float q = half_b * half_b + a_over_3 * a_over_3 * a_over_3;
    if (q > 0.f)
      q = 0.f;         // clamp: q <= 0 is required for three real roots

    // Compute the eigenvalues by solving for the roots of the polynomial.
    float rho = sqrtf(-a_over_3);
    float theta = std::atan2 (sqrtf (-q), half_b)*s_inv3;
    // __cosf/__sinf are fast reduced-precision intrinsics -- a deliberate
    // speed/accuracy trade-off in this hot path.
    float cos_theta = __cosf (theta);
    float sin_theta = __sinf (theta);
    roots.x = c2_over_3 + 2.f * rho * cos_theta;
    roots.y = c2_over_3 - rho * (cos_theta + s_sqrt3 * sin_theta);
    roots.z = c2_over_3 - rho * (cos_theta - s_sqrt3 * sin_theta);

    // Sort in increasing order.
    if (roots.x >= roots.y)
      swap(roots.x, roots.y);

    if (roots.y >= roots.z)
    {
      swap(roots.y, roots.z);

      if (roots.x >= roots.y)
        swap (roots.x, roots.y);
    }
    if (roots.x <= 0) // eigenval for symmetric positive semi-definite matrix can not be negative! Set it to 0
      computeRoots2 (c2, c1, roots);
  }
}
173 
/** \brief Closed-form eigen decomposition of a symmetric 3x3 matrix stored as
  * a packed upper triangle [m00 m01 m02 m11 m12 m22] in \a mat_pkg.
  * \note compute() scales mat_pkg IN PLACE before solving. */
struct Eigen33
{
  public:
    /** \brief Tiny fixed-size matrix of float3 rows usable on host and device. */
    template<int Rows>
    struct MiniMat
    {
      float3 data[Rows];
      __device__ __host__ __forceinline__ float3& operator[](int i) { return data[i]; }
      __device__ __host__ __forceinline__ const float3& operator[](int i) const { return data[i]; }
    };
    using Mat33 = MiniMat<3>;
    using Mat43 = MiniMat<4>;

    /** \brief Returns a unit vector orthogonal to \a src (assumed non-zero). */
    static __forceinline__ __device__ float3
    unitOrthogonal (const float3& src)
    {
      float3 perp;
      /* Let us compute the crossed product of *this with a vector
       * that is not too close to being colinear to *this.
       */

      /* unless the x and y coords are both close to zero, we can
       * simply take ( -y, x, 0 ) and normalize it.
       */
      if(!isMuchSmallerThan(src.x, src.z) || !isMuchSmallerThan(src.y, src.z))
      {
        float invnm = rsqrtf(src.x*src.x + src.y*src.y);
        perp.x = -src.y * invnm;
        perp.y =  src.x * invnm;
        perp.z = 0.0f;
      }
      /* if both x and y are close to zero, then the vector is close
       * to the z-axis, so it's far from colinear to the x-axis for instance.
       * So we take the crossed product with (1,0,0) and normalize it.
       */
      else
      {
        float invnm = rsqrtf(src.z * src.z + src.y * src.y);
        perp.x = 0.0f;
        perp.y = -src.z * invnm;
        perp.z =  src.y * invnm;
      }

      return perp;
    }

    /** \brief \a mat_pkg_arg: packed upper triangle of the symmetric matrix;
      * the pointed-to storage must stay alive for the lifetime of this object
      * and will be modified by compute(). */
    __device__ __forceinline__
    Eigen33(volatile float* mat_pkg_arg) : mat_pkg(mat_pkg_arg) {}

    /** \brief Computes eigenvalues (ascending, in \a evals) and the matching
      * eigenvectors (rows of \a evecs). \a tmp and \a vec_tmp are caller-provided
      * scratch. Side effect: mat_pkg entries are divided by the scale factor. */
    __device__ __forceinline__ void
    compute(Mat33& tmp, Mat33& vec_tmp, Mat33& evecs, float3& evals)
    {
      // Scale the matrix so its entries are in [-1,1]. The scaling is applied
      // only when at least one matrix entry has magnitude larger than 1.
      float max01 = fmaxf( std::abs(mat_pkg[0]), std::abs(mat_pkg[1]) );
      float max23 = fmaxf( std::abs(mat_pkg[2]), std::abs(mat_pkg[3]) );
      float max45 = fmaxf( std::abs(mat_pkg[4]), std::abs(mat_pkg[5]) );
      float m0123 = fmaxf( max01, max23);
      float scale = fmaxf( max45, m0123);

      if (scale <= std::numeric_limits<float>::min())
        scale = 1.f;  // near-zero matrix: skip scaling to avoid division blow-up

      mat_pkg[0] /= scale;
      mat_pkg[1] /= scale;
      mat_pkg[2] /= scale;
      mat_pkg[3] /= scale;
      mat_pkg[4] /= scale;
      mat_pkg[5] /= scale;

      // The characteristic equation is x^3 - c2*x^2 + c1*x - c0 = 0. The
      // eigenvalues are the roots to this equation, all guaranteed to be
      // real-valued, because the matrix is symmetric.
      float c0 = m00() * m11() * m22()
          + 2.f * m01() * m02() * m12()
          - m00() * m12() * m12()
          - m11() * m02() * m02()
          - m22() * m01() * m01();
      float c1 = m00() * m11() -
          m01() * m01() +
          m00() * m22() -
          m02() * m02() +
          m11() * m22() -
          m12() * m12();
      float c2 = m00() + m11() + m22();

      computeRoots3(c0, c1, c2, evals);

      if(evals.z - evals.x <= std::numeric_limits<float>::epsilon())
      {
        // All three eigenvalues (numerically) equal: any orthonormal basis works.
        evecs[0] = make_float3(1.f, 0.f, 0.f);
        evecs[1] = make_float3(0.f, 1.f, 0.f);
        evecs[2] = make_float3(0.f, 0.f, 1.f);
      }
      else if (evals.y - evals.x <= std::numeric_limits<float>::epsilon() )
      {
        // first and second equal: the third eigenvector is unique, pick the
        // best-conditioned cross product of rows of (A - evals.z * I).
        tmp[0] = row0();  tmp[1] = row1();  tmp[2] = row2();
        tmp[0].x -= evals.z; tmp[1].y -= evals.z; tmp[2].z -= evals.z;

        vec_tmp[0] = cross(tmp[0], tmp[1]);
        vec_tmp[1] = cross(tmp[0], tmp[2]);
        vec_tmp[2] = cross(tmp[1], tmp[2]);

        float len1 = dot (vec_tmp[0], vec_tmp[0]);
        float len2 = dot (vec_tmp[1], vec_tmp[1]);
        float len3 = dot (vec_tmp[2], vec_tmp[2]);

        if (len1 >= len2 && len1 >= len3)
        {
          evecs[2] = vec_tmp[0] * rsqrtf (len1);
        }
        else if (len2 >= len1 && len2 >= len3)
        {
          evecs[2] = vec_tmp[1] * rsqrtf (len2);
        }
        else
        {
          evecs[2] = vec_tmp[2] * rsqrtf (len3);
        }

        evecs[1] = unitOrthogonal(evecs[2]);
        evecs[0] = cross(evecs[1], evecs[2]);
      }
      else if (evals.z - evals.y <= std::numeric_limits<float>::epsilon() )
      {
        // second and third equal: the first eigenvector is unique.
        tmp[0] = row0();  tmp[1] = row1();  tmp[2] = row2();
        tmp[0].x -= evals.x; tmp[1].y -= evals.x; tmp[2].z -= evals.x;

        vec_tmp[0] = cross(tmp[0], tmp[1]);
        vec_tmp[1] = cross(tmp[0], tmp[2]);
        vec_tmp[2] = cross(tmp[1], tmp[2]);

        float len1 = dot(vec_tmp[0], vec_tmp[0]);
        float len2 = dot(vec_tmp[1], vec_tmp[1]);
        float len3 = dot(vec_tmp[2], vec_tmp[2]);

        if (len1 >= len2 && len1 >= len3)
        {
          evecs[0] = vec_tmp[0] * rsqrtf(len1);
        }
        else if (len2 >= len1 && len2 >= len3)
        {
          evecs[0] = vec_tmp[1] * rsqrtf(len2);
        }
        else
        {
          evecs[0] = vec_tmp[2] * rsqrtf(len3);
        }

        evecs[1] = unitOrthogonal( evecs[0] );
        evecs[2] = cross(evecs[0], evecs[1]);
      }
      else
      {
        // All eigenvalues distinct: solve for evecs[2] and evecs[1] directly,
        // tracking which solve was best/worst conditioned via mmax/min_el/max_el,
        // then rebuild the two least reliable vectors by orthogonality.
        tmp[0] = row0();  tmp[1] = row1();  tmp[2] = row2();
        tmp[0].x -= evals.z; tmp[1].y -= evals.z; tmp[2].z -= evals.z;

        vec_tmp[0] = cross(tmp[0], tmp[1]);
        vec_tmp[1] = cross(tmp[0], tmp[2]);
        vec_tmp[2] = cross(tmp[1], tmp[2]);

        float len1 = dot(vec_tmp[0], vec_tmp[0]);
        float len2 = dot(vec_tmp[1], vec_tmp[1]);
        float len3 = dot(vec_tmp[2], vec_tmp[2]);

        float mmax[3];

        unsigned int min_el = 2;
        unsigned int max_el = 2;
        if (len1 >= len2 && len1 >= len3)
        {
          mmax[2] = len1;
          evecs[2] = vec_tmp[0] * rsqrtf (len1);
        }
        else if (len2 >= len1 && len2 >= len3)
        {
          mmax[2] = len2;
          evecs[2] = vec_tmp[1] * rsqrtf (len2);
        }
        else
        {
          mmax[2] = len3;
          evecs[2] = vec_tmp[2] * rsqrtf (len3);
        }

        tmp[0] = row0();  tmp[1] = row1();  tmp[2] = row2();
        tmp[0].x -= evals.y; tmp[1].y -= evals.y; tmp[2].z -= evals.y;

        vec_tmp[0] = cross(tmp[0], tmp[1]);
        vec_tmp[1] = cross(tmp[0], tmp[2]);
        vec_tmp[2] = cross(tmp[1], tmp[2]);

        len1 = dot(vec_tmp[0], vec_tmp[0]);
        len2 = dot(vec_tmp[1], vec_tmp[1]);
        len3 = dot(vec_tmp[2], vec_tmp[2]);

        if (len1 >= len2 && len1 >= len3)
        {
          mmax[1] = len1;
          evecs[1] = vec_tmp[0] * rsqrtf (len1);
          min_el = len1 <= mmax[min_el] ? 1 : min_el;
          max_el = len1  > mmax[max_el] ? 1 : max_el;
        }
        else if (len2 >= len1 && len2 >= len3)
        {
          mmax[1] = len2;
          evecs[1] = vec_tmp[1] * rsqrtf (len2);
          min_el = len2 <= mmax[min_el] ? 1 : min_el;
          max_el = len2  > mmax[max_el] ? 1 : max_el;
        }
        else
        {
          mmax[1] = len3;
          evecs[1] = vec_tmp[2] * rsqrtf (len3);
          min_el = len3 <= mmax[min_el] ? 1 : min_el;
          max_el = len3  > mmax[max_el] ? 1 : max_el;
        }

        tmp[0] = row0();  tmp[1] = row1();  tmp[2] = row2();
        tmp[0].x -= evals.x; tmp[1].y -= evals.x; tmp[2].z -= evals.x;

        vec_tmp[0] = cross(tmp[0], tmp[1]);
        vec_tmp[1] = cross(tmp[0], tmp[2]);
        vec_tmp[2] = cross(tmp[1], tmp[2]);

        len1 = dot (vec_tmp[0], vec_tmp[0]);
        len2 = dot (vec_tmp[1], vec_tmp[1]);
        len3 = dot (vec_tmp[2], vec_tmp[2]);

        // BUGFIX: the first two branches previously compared len3 instead of
        // the length actually selected (len1/len2), so min_el/max_el could be
        // updated from the wrong conditioning measure.
        if (len1 >= len2 && len1 >= len3)
        {
          mmax[0] = len1;
          evecs[0] = vec_tmp[0] * rsqrtf (len1);
          min_el = len1 <= mmax[min_el] ? 0 : min_el;
          max_el = len1  > mmax[max_el] ? 0 : max_el;
        }
        else if (len2 >= len1 && len2 >= len3)
        {
          mmax[0] = len2;
          evecs[0] = vec_tmp[1] * rsqrtf (len2);
          min_el = len2 <= mmax[min_el] ? 0 : min_el;
          max_el = len2  > mmax[max_el] ? 0 : max_el;
        }
        else
        {
          mmax[0] = len3;
          evecs[0] = vec_tmp[2] * rsqrtf (len3);
          min_el = len3 <= mmax[min_el] ? 0 : min_el;
          max_el = len3  > mmax[max_el] ? 0 : max_el;
        }

        // Replace the two least reliable eigenvectors with cross products of
        // the others so the basis stays orthonormal.
        unsigned mid_el = 3 - min_el - max_el;
        evecs[min_el] = normalized( cross( evecs[(min_el+1) % 3], evecs[(min_el+2) % 3] ) );
        evecs[mid_el] = normalized( cross( evecs[(mid_el+1) % 3], evecs[(mid_el+2) % 3] ) );
      }
      // Rescale back to the original size.
      evals *= scale;
    }
  private:
    // Packed symmetric matrix: [m00 m01 m02 m11 m12 m22].
    volatile float* mat_pkg;

    __device__ __forceinline__ float m00() const { return mat_pkg[0]; }
    __device__ __forceinline__ float m01() const { return mat_pkg[1]; }
    __device__ __forceinline__ float m02() const { return mat_pkg[2]; }
    __device__ __forceinline__ float m10() const { return mat_pkg[1]; }
    __device__ __forceinline__ float m11() const { return mat_pkg[3]; }
    __device__ __forceinline__ float m12() const { return mat_pkg[4]; }
    __device__ __forceinline__ float m20() const { return mat_pkg[2]; }
    __device__ __forceinline__ float m21() const { return mat_pkg[4]; }
    __device__ __forceinline__ float m22() const { return mat_pkg[5]; }

    __device__ __forceinline__ float3 row0() const { return make_float3( m00(), m01(), m02() ); }
    __device__ __forceinline__ float3 row1() const { return make_float3( m10(), m11(), m12() ); }
    __device__ __forceinline__ float3 row2() const { return make_float3( m20(), m21(), m22() ); }

    /** \brief True when |x| is negligible relative to |y| (Eigen's criterion). */
    __device__ __forceinline__ static bool isMuchSmallerThan (float x, float y)
    {
      // copied from <eigen>/include/Eigen/src/Core/NumTraits.h
      constexpr float prec_sqr = std::numeric_limits<float>::epsilon() * std::numeric_limits<float>::epsilon();
      return x * x <= prec_sqr * y * y;
    }
};
461 
/** \brief Block-wide helpers: thread indexing and shared-memory tree reductions.
  * Preconditions for both reduce() overloads: CTA_SIZE equals the number of
  * threads in the block and is a power of two; \a buffer holds at least
  * CTA_SIZE elements; every thread of the block calls the function. */
struct Block
{
  /** \brief Total number of threads in the block. */
  static __device__ __forceinline__ unsigned int stride()
  {
    return blockDim.x * blockDim.y * blockDim.z;
  }

  /** \brief Linear index of the calling thread within its (up to 3D) block.
    * (Name restored: it was garbled in this copy of the file; the call sites
    * in both reduce() overloads fix the spelling.) */
  static __device__ __forceinline__ int
  flattenedThreadId()
  {
    return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
  }

  /** \brief In-place tree reduction of buffer[0..CTA_SIZE) with \a op; the
    * result ends up in buffer[0]. The final 32-lane stage has no barriers and
    * relies on warp-synchronous execution over the volatile buffer.
    * NOTE(review): on Volta+ independent thread scheduling this stage would
    * need __syncwarp() to be strictly safe -- confirm target architectures. */
  template<int CTA_SIZE, typename T, class BinOp>
  static __device__ __forceinline__ void reduce(volatile T* buffer, BinOp op)
  {
    int tid = flattenedThreadId();
    T val = buffer[tid];

    if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
    if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
    if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
    if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

    if (tid < 32)
    {
      if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
      if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
      if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid +  8]); }
      if (CTA_SIZE >=  8) { buffer[tid] = val = op(val, buffer[tid +  4]); }
      if (CTA_SIZE >=  4) { buffer[tid] = val = op(val, buffer[tid +  2]); }
      if (CTA_SIZE >=  2) { buffer[tid] = val = op(val, buffer[tid +  1]); }
    }
  }

  /** \brief Same reduction, but each thread first contributes \a init and the
    * final value buffer[0] is broadcast back to every caller. */
  template<int CTA_SIZE, typename T, class BinOp>
  static __device__ __forceinline__ T reduce(volatile T* buffer, T init, BinOp op)
  {
    int tid = flattenedThreadId();
    T val = buffer[tid] = init;
    __syncthreads();  // ensure all contributions are visible before reducing

    if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
    if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
    if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
    if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

    if (tid < 32)
    {
      if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
      if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
      if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid +  8]); }
      if (CTA_SIZE >=  8) { buffer[tid] = val = op(val, buffer[tid +  4]); }
      if (CTA_SIZE >=  4) { buffer[tid] = val = op(val, buffer[tid +  2]); }
      if (CTA_SIZE >=  2) { buffer[tid] = val = op(val, buffer[tid +  1]); }
    }
    __syncthreads();  // make buffer[0] safe to read by all threads
    return buffer[0];
  }
};
522 
/** \brief Warp-level helpers built on PTX special registers.
  * (Two function names -- laneId and laneMaskLt -- were garbled in this copy
  * of the file; they are restored from the binaryExclScan() call site.) */
struct Warp
{
  enum
  {
    LOG_WARP_SIZE = 5,
    WARP_SIZE     = 1 << LOG_WARP_SIZE,  // 32
    STRIDE        = WARP_SIZE
  };

  /** \brief Returns the warp lane ID of the calling thread. */
  static __device__ __forceinline__ unsigned int
  laneId()
  {
    unsigned int ret;
    asm("mov.u32 %0, %laneid;" : "=r"(ret) );
    return ret;
  }

  /** \brief Index of the calling thread's warp within its block. */
  static __device__ __forceinline__ unsigned int id()
  {
    int tid = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
    return tid >> LOG_WARP_SIZE;
  }

  /** \brief Bitmask of all lanes whose ID is less than the caller's
    * (PTX %lanemask_lt special register). */
  static __device__ __forceinline__ int
  laneMaskLt()
  {
    unsigned int ret;
    asm("mov.u32 %0, %lanemask_lt;" : "=r"(ret) );
    return ret;
  }

  /** \brief Exclusive scan over a warp ballot: number of set bits in
    * \a ballot_mask belonging to lanes below the calling lane. */
  static __device__ __forceinline__ int binaryExclScan(int ballot_mask)
  {
    return __popc(Warp::laneMaskLt() & ballot_mask);
  }
};
560 
561 
/** \brief Portability shims for warp vote/reduce operations across CUDA versions. */
struct Emulation
{
  /** \brief Sums 32 consecutive entries of \a ptr starting at the warp base
    * (tid - lane) and returns the warp total to every caller.
    * Preconditions: ptr has valid data at [tid-lane, tid-lane+32); all 32
    * lanes of the warp call this. The volatile pointer plus absence of
    * barriers relies on warp-synchronous execution.
    * NOTE(review): on Volta+ independent thread scheduling this pattern
    * needs __syncwarp() between steps to be strictly safe -- confirm the
    * architectures this is built for. */
  static __device__ __forceinline__ int
  warp_reduce ( volatile int *ptr , const unsigned int tid)
  {
    const unsigned int lane = tid & 31; // index of thread in warp (0..31)

    if (lane < 16)
    {
      int partial = ptr[tid];

      // Halving strides 16..1: each step folds the upper half into the lower.
      ptr[tid] = partial = partial + ptr[tid + 16];
      ptr[tid] = partial = partial + ptr[tid +  8];
      ptr[tid] = partial = partial + ptr[tid +  4];
      ptr[tid] = partial = partial + ptr[tid +  2];
      ptr[tid] = partial = partial + ptr[tid +  1];
    }
    return ptr[tid - lane];  // warp total accumulated at the warp base slot
  }

  /** \brief Warp ballot: bitmask of lanes with a non-zero \a predicate.
    * cta_buffer is unused (kept for the signature of older emulation paths).
    * NOTE(review): uses __activemask(), i.e. whichever lanes happen to be
    * converged -- callers must ensure the intended lanes are active. */
  static __forceinline__ __device__ int
  Ballot(int predicate, volatile int* cta_buffer)
  {
#if CUDA_VERSION >= 9000
    (void)cta_buffer;
    return __ballot_sync (__activemask (), predicate);
#else
    (void)cta_buffer;
    return __ballot (predicate);
#endif
  }

  /** \brief Warp vote: true iff \a predicate is non-zero on every active lane.
    * cta_buffer is unused (legacy signature). Same __activemask() caveat as
    * Ballot() above. */
  static __forceinline__ __device__ bool
  All(int predicate, volatile int* cta_buffer)
  {
#if CUDA_VERSION >= 9000
    (void)cta_buffer;
    return __all_sync (__activemask (), predicate);
#else
    (void)cta_buffer;
    return __all (predicate);
#endif
  }
};
606  }
607  }
608 }
609 
610 #endif /* PCL_GPU_KINFU_CUDA_UTILS_HPP_ */
__device__ __forceinline__ float3 normalized(const float3 &v)
Definition: utils.hpp:101
__device__ __forceinline__ float3 operator+(const float3 &v1, const float3 &v2)
Definition: utils.hpp:71
static __device__ __forceinline__ int laneMaskLt()
Definition: utils.hpp:548
static __device__ __forceinline__ int binaryExclScan(int ballot_mask)
Definition: utils.hpp:555
__device__ __forceinline__ float norm(const float3 &v)
Definition: utils.hpp:95
This file defines compatibility wrappers for low level I/O functions.
Definition: convolution.h:45
__device__ __host__ __forceinline__ const float3 & operator[](int i) const
Definition: utils.hpp:182
__device__ __forceinline__ float3 & operator*=(float3 &vec, const float &v)
Definition: utils.hpp:77
__device__ __forceinline__ float3 operator*(const Mat33 &m, const float3 &vec)
Definition: device.hpp:78
static __forceinline__ __device__ bool All(int predicate, volatile int *cta_buffer)
Definition: utils.hpp:595
__device__ __forceinline__ float dot(const float3 &v1, const float3 &v2)
Definition: utils.hpp:59
static __device__ __forceinline__ T reduce(volatile T *buffer, T init, BinOp op)
Definition: utils.hpp:498
__device__ __host__ __forceinline__ void swap(T &a, T &b)
Definition: utils.hpp:53
__device__ __host__ __forceinline__ float3 cross(const float3 &v1, const float3 &v2)
Definition: utils.hpp:107
static __device__ __forceinline__ unsigned int id()
Definition: utils.hpp:541
__device__ __forceinline__ float3 operator-(const float3 &v1, const float3 &v2)
Definition: utils.hpp:83
__device__ __forceinline__ void computeRoots3(float c0, float c1, float c2, float3 &roots)
Definition: utils.hpp:126
static __device__ __forceinline__ int warp_reduce(volatile int *ptr, const unsigned int tid)
Definition: utils.hpp:565
static __forceinline__ __device__ int Ballot(int predicate, volatile int *cta_buffer)
Definition: utils.hpp:583
__device__ __forceinline__ void computeRoots2(const float &b, const float &c, float3 &roots)
Definition: utils.hpp:112
static __forceinline__ __device__ float3 unitOrthogonal(const float3 &src)
Definition: utils.hpp:189
static __device__ __forceinline__ void reduce(volatile T *buffer, BinOp op)
Definition: utils.hpp:476
__device__ __host__ __forceinline__ float3 & operator[](int i)
Definition: utils.hpp:181
static __device__ __forceinline__ unsigned int laneId()
Returns the warp lane ID of the calling thread.
Definition: utils.hpp:534
static __device__ __forceinline__ int flattenedThreadId()
Definition: utils.hpp:470
static __device__ __forceinline__ unsigned int stride()
Definition: utils.hpp:464
__device__ __forceinline__ void compute(Mat33 &tmp, Mat33 &vec_tmp, Mat33 &evecs, float3 &evals)
Definition: utils.hpp:224
__device__ __forceinline__ float3 & operator+=(float3 &vec, const float &v)
Definition: utils.hpp:65
__device__ __forceinline__ Eigen33(volatile float *mat_pkg_arg)
Definition: utils.hpp:222