Kernels for building and accessing the Random Ball Cover data structure. More...
Classes  
struct  dist_id 
Struct holding a value and a key. More...  
Functions  
void  l1NormMetric (float4 *x, float4 *r, float4 *dists, uint n) 
Computes \( \ell_1 \)norm based distances, \( d(x,y)=\xy\_1=\sum_{i}x_iy_i \). More...  
void  euclideanSquaredMetric (float4 *x, float4 *r, float4 *dists, uint n) 
Computes \( \ell_2 \)norm based squared distances, \( d(x,y)=\xy\^2_2=\sum_{i}(x_iy_i)^{2} \). More...  
void  l1NormMetric8 (float8 *x, float8 *r, float4 *dists, float a, uint n) 
Computes \( \ell_1 \)norm based distances, \( d(x,y)=\frac{1}{1+\alpha}\sum_{i=0}^{3}x_iy_i + \alpha\sum_{i=4}^{7}x_iy_i \). More...  
void  euclideanSquaredMetric8 (float8 *x, float8 *r, float4 *dists, float a, uint n) 
Computes \( \ell_2 \)norm based squared distances, \( d(x,y)=\frac{1}{1+\alpha}\sum_{i=0}^{3}(x_iy_i)^{2} + \alpha\sum_{i=4}^{7}(x_iy_i)^{2} \). More...  
kernel void  rbcComputeDists_SharedNone (global float4 *X, global float4 *R, global float4 *D, uint d) 
Computes the distances between two sets of points in a brute force way. More...  
kernel void  rbcComputeDists_SharedR (global float4 *X, global float4 *R, global float4 *D, local float4 *data, uint d) 
Computes the distances between two sets of points in a brute force way. More...  
kernel void  rbcComputeDists_SharedXR (global float4 *X, global float4 *R, global float4 *D, local float4 *dataX, local float4 *dataR, uint d) 
Computes the distances between two sets of points in a brute force way. More...  
kernel void  rbcComputeDists_Kinect (global float8 *X, global float8 *R, global float4 *D, float a) 
Computes the distances between two sets of points in a brute force way. More...  
kernel void  rbcComputeDists_Kinect_R (global float8 *X, global float8 *R, global float4 *D, local float8 *dataR, float a) 
Computes the distances between two sets of points in a brute force way. More...  
kernel void  rbcComputeDists_Kinect_XR (global float8 *X, global float8 *R, global float4 *D, local float8 *dataX, local float8 *dataR, float a) 
Computes the distances between two sets of points in a brute force way. More...  
kernel void  rbcNInit (global uint4 *N, uint val) 
Performs an array initialization. More...  
kernel void  rbcMinDists (global float4 *D, global dist_id *ID, global uint *N, global uint *Rnk, local dist_id *data, uint n, int accCounters) 
Performs a reduce operation on the columns of an array. More...  
kernel void  rbcGroupMinDists (global dist_id *GM, global dist_id *ID, global uint *N, global uint *Rnk, local dist_id *data, uint n, int accCounters) 
Performs a reduce operation on the columns of an array. More...  
kernel void  rbcPermute (global float4 *X, global dist_id *ID, global float4 *Xp, global dist_id *IDp, global uint *O, global uint *Rnk, int permID) 
Performs a permutation of the RBC database. More...  
kernel void  rbcPermute_Kinect (global float8 *X, global dist_id *ID, global float8 *Xp, global dist_id *IDp, global uint *O, global uint *Rnk, int permID) 
Performs a permutation of the RBC database for the case of Kinect data in \( \mathbb{R}^8 \). More...  
kernel void  rbcComputeQXDists (global float4 *Qp, global float4 *Xp, global float4 *D, global uint *XO, global uint *XN, global dist_id *RID, uint d) 
Computes the distances between two sets of points in a brute force way. More...  
kernel void  rbcComputeQXDists_Kinect (global float8 *Qp, global float8 *Xp, global float4 *D, global uint *XO, global uint *XN, global dist_id *RID, float a) 
Computes the distances between two sets of points in a brute force way. More...  
kernel void  rbcGetNNs (global float4 *Xp, global float4 *NN, global uint *XO, global dist_id *RID, global dist_id *NNID) 
Collects the query NNs into an array. More...  
Kernels for building and accessing the Random Ball Cover data structure.

inline 
Computes \( \ell_2 \)norm based squared distances, \( d(x,y)=\xy\^2_2=\sum_{i}(x_iy_i)^{2} \).
Computes a nx4
block of distances between the two input sets.
[in]  x  array of database points holding only 4 of their dimensions. 
[in]  r  array of 4 representative points holding only 4 of their dimensions. 
[out]  dists  array of distances. Each row contains the distances of a database point from all 4 of the representative points. 
[in]  n  number of database points. 

inline 
Computes \( \ell_2 \)norm based squared distances, \( d(x,y)=\frac{1}{1+\alpha}\sum_{i=0}^{3}(x_iy_i)^{2} + \alpha\sum_{i=4}^{7}(x_iy_i)^{2} \).
Computes a nx4
block of distances between the two input sets in \( \mathbb{R}^8 \).
INFINITY
, NAN
values will be produced that will cause problems in later steps.[in]  x  array of database points. 
[in]  r  array of 4 representative points. 
[out]  dists  array of distances. Each row contains the distances of a database point from all 4 of the representative points. 
[in]  a  scaling factor for the result from the high part of the vectors. 
[in]  n  number of database points. 

inline 
Computes \( \ell_1 \)norm based distances, \( d(x,y)=\xy\_1=\sum_{i}x_iy_i \).
Computes a nx4
block of distances between the two input sets.
[in]  x  array of database points holding only 4 of their dimensions. 
[in]  r  array of 4 representative points holding only 4 of their dimensions. 
[out]  dists  array of distances. Each row contains the distances of a database point from all 4 of the representative points. 
[in]  n  number of database points. 

inline 
Computes \( \ell_1 \)norm based distances, \( d(x,y)=\frac{1}{1+\alpha}\sum_{i=0}^{3}x_iy_i + \alpha\sum_{i=4}^{7}x_iy_i \).
Computes a nx4
block of distances between the two input sets in \( \mathbb{R}^8 \).
INFINITY
, NAN
values will be produced that will cause problems in later steps.[in]  x  array of database points. 
[in]  r  array of 4 representative points. 
[out]  dists  array of distances. Each row contains the distances of a database point from all 4 of the representative points. 
[in]  a  scaling factor for the result from the high part of the vectors. 
[in]  n  number of database points. 
kernel void rbcComputeDists_Kinect  (  global float8 *  X, 
global float8 *  R,  
global float4 *  D,  
float  a  
) 
Computes the distances between two sets of points in a brute force way.
For every point in the first set, the distances from that point to all points in the second set are computed.
float
. rbcComputeDists_SharedNone
for the case of Kinect 8D data [geometric (Homogeneous coordinates) and photometric (RGBA values) information]
.[in]  X  array of database points (each row contains a point), \( X_{n_x \times d} \). 
[in]  R  array of representative points (each row contains a point), \( R_{n_r \times d} \). 
[out]  D  array of distances of the database points from the representative points (each row contains the distances of a database point from all the representative points), \( D_{n_x \times n_r} \). 
[in]  a  scaling factor multiplying the result of the distance calculation for the high part of the points (float8 vectors). 
kernel void rbcComputeDists_Kinect_R  (  global float8 *  X, 
global float8 *  R,  
global float4 *  D,  
local float8 *  dataR,  
float  a  
) 
Computes the distances between two sets of points in a brute force way.
For every point in the first set, the distances from that point to all points in the second set are computed.
float
. rbcComputeDists_SharedR
for the case of Kinect 8D data [geometric (Homogeneous coordinates) and photometric (RGBA values) information]
. The kernel uses shared memory for staging blocks of data from R.[in]  X  array of the database points (each row contains a point), \( X_{n_x \times d} \). 
[in]  R  array of the representative points (each row contains a point), \( R_{n_r \times d} \). 
[out]  D  array of distances of the database points from the representative points (each row contains the distances of a database point from all the representative points), \( D_{n_x \times n_r} \). 
[in]  dataR  local buffer. Its size should be 16 float elements for each workitem in a workgroup. That is, \( 4*lXdim*sizeof\ (float8) \). 
[in]  a  scaling factor multiplying the result of the distance calculation for the high part of the points (float8 vectors). 
kernel void rbcComputeDists_Kinect_XR  (  global float8 *  X, 
global float8 *  R,  
global float4 *  D,  
local float8 *  dataX,  
local float8 *  dataR,  
float  a  
) 
Computes the distances between two sets of points in a brute force way.
For every point in the first set, the distances from that point to all points in the second set are computed.
float
. rbcComputeDists_SharedXR
for the case of Kinect 8D data [geometric (Homogeneous coordinates) and photometric (RGBA values) information]
. The kernel uses shared memory for staging blocks of data from both X and R.[in]  X  array of the database points (each row contains a point), \( X_{n_x \times d} \). 
[in]  R  array of the representative points (each row contains a point), \( R_{n_r \times d} \). 
[out]  D  array of distances of the database points from the representative points (each row contains the distances of a database point from all the representative points), \( D_{n_x \times n_r} \). 
[in]  dataX  local buffer. Its size should be 16 float elements for each workitem in a workgroup. That is, \( 4*lYdim*sizeof\ (float8) \). 
[in]  dataR  local buffer. Its size should be 16 float elements for each workitem in a workgroup. That is, \( 4*lXdim*sizeof\ (float8) \). 
[in]  a  scaling factor multiplying the result of the distance calculation for the high part of the points (float8 vectors). 
kernel void rbcComputeDists_SharedNone  (  global float4 *  X, 
global float4 *  R,  
global float4 *  D,  
uint  d  
) 
Computes the distances between two sets of points in a brute force way.
For every point in the first set, the distances from that point to all points in the second set are computed.
float
. rbcComputeDists_SharedR
and rbcComputeDists_SharedXR
. The access pattern for both arrays is strided, but the kernel's performance might actually be better than the other two cases', due to higher kernel occupancy.[in]  X  array of database points (each row contains a point), \( X_{n_x \times d} \). 
[in]  R  array of representative points (each row contains a point), \( R_{n_r \times d} \). 
[out]  D  array of distances of the database points from the representative points (each row contains the distances of a database point from all the representative points), \( D_{n_x \times n_r} \). 
[in]  d  dimensionality of the associated points. 
kernel void rbcComputeDists_SharedR  (  global float4 *  X, 
global float4 *  R,  
global float4 *  D,  
local float4 *  data,  
uint  d  
) 
Computes the distances between two sets of points in a brute force way.
For every point in the first set, the distances from that point to all points in the second set are computed.
float
. rbcComputeDists_SharedNone
and rbcComputeDists_SharedXR
. The kernel might have better performance than the last case, due to higher kernel occupancy.[in]  X  array of database points (each row contains a point), \( X_{n_x \times d} \). 
[in]  R  array of representative points (each row contains a point), \( R_{n_r \times d} \). 
[out]  D  array of distances of the database points from the representative points (each row contains the distances of a database point from all the representative points), \( D_{n_x \times n_r} \). 
[in]  data  local buffer. Its size should be 16 float elements for each workitem in a workgroup. That is, \( (4*lXdim)*(4*lYdim)*sizeof\ (float) \). 
[in]  d  dimensionality of the associated points. 
kernel void rbcComputeDists_SharedXR  (  global float4 *  X, 
global float4 *  R,  
global float4 *  D,  
local float4 *  dataX,  
local float4 *  dataR,  
uint  d  
) 
Computes the distances between two sets of points in a brute force way.
For every point in the first set, the distances from that point to all points in the second set are computed.
float
. rbcComputeDists_SharedNone
and rbcComputeDists_SharedR
. The kernel might have worse performance than those cases, due to high use of VGPRs and LDS, resulting in low kernel occupancy.[in]  X  array of database points (each row contains a point), \( X_{n_x \times d} \). 
[in]  R  array of representative points (each row contains a point), \( R_{n_r \times d} \). 
[out]  D  array of distances of the database points from the representative points (each row contains the distances of a database point from all the representative points), \( D_{n_x \times n_r} \). 
[in]  dataX  local buffer. Its size should be 16 float elements for each workitem in a workgroup. That is, \( (4*lXdim)*(4*lYdim)*sizeof\ (float) \). 
[in]  dataR  local buffer. Its size should be 16 float elements for each workitem in a workgroup. That is, \( (4*lXdim)*(4*lYdim)*sizeof\ (float) \). 
[in]  d  dimensionality of the associated points. 
kernel void rbcComputeQXDists  (  global float4 *  Qp, 
global float4 *  Xp,  
global float4 *  D,  
global uint *  XO,  
global uint *  XN,  
global dist_id *  RID,  
uint  d  
) 
Computes the distances between two sets of points in a brute force way.
Computes the distances from the queries to the points in their representative's list.
float
. [in]  Qp  array of query points (each row contains a point), \( Q_{n_q \times d} \). 
[in]  Xp  array of database points (each row contains a point), \( Xp_{n_x \times d} \). 
[out]  D  array of distances from the query points to the database points in each query's representative list (each row contains the distances of a query point from all the points), \( D_{n_q \times max_{r_{id}}{N[r_{id}]}} \). 
[in]  XO  array containing the offsets of the representative lists within the permuted database. Its size should be \( n_r*sizeof\ (uint) \). 
[in]  XN  array containing the cardinalities of the representative lists. Its size should be \( n_r*sizeof\ (uint) \). 
[in]  RID  array with the representative ids for each query in Qp. Its size should be \( n_q*sizeof\ (dist\_id) \). 
[in]  d  dimensionality of the associated points. 
kernel void rbcComputeQXDists_Kinect  (  global float8 *  Qp, 
global float8 *  Xp,  
global float4 *  D,  
global uint *  XO,  
global uint *  XN,  
global dist_id *  RID,  
float  a  
) 
Computes the distances between two sets of points in a brute force way.
Computes the distances from the queries to the points in their representative's list.
float
. [in]  Qp  array of query points (each row contains a point), \( Q_{n_q \times d} \). 
[in]  Xp  array of database points (each row contains a point), \( Xp_{n_x \times d} \). 
[out]  D  array of distances from the query points to the database points in each query's representative list (each row contains the distances of a query point from all the points), \( D_{n_q \times max_{r_{id}}{N[r_{id}]}} \). 
[in]  XO  array containing the offsets of the representative lists within the permuted database. Its size should be \( n_r*sizeof\ (uint) \). 
[in]  XN  array containing the cardinalities of the representative lists. Its size should be \( n_r*sizeof\ (uint) \). 
[in]  RID  array with the representative ids for each query in Qp. Its size should be \( n_q*sizeof\ (dist\_id) \). 
[in]  a  scaling factor multiplying the result of the distance calculation for the high part of the points (float8 vectors). 
kernel void rbcGetNNs  (  global float4 *  Xp, 
global float4 *  NN,  
global uint *  XO,  
global dist_id *  RID,  
global dist_id *  NNID  
) 
Collects the query NNs into an array.
d
, should be a multiple of 4 (the data are handled as float4
). The x dimension of the global workspace, \( gXdim \), should be equal to the dimensionality of the points divided by 4. That is, \( \ gXdim=d/4 \). The y dimension of the global workspace, \( gYdim \), should be equal to the number of queries, \( n_q \). That is, \( \ gYdim = n_q \). There is no requirement for the local workspace.[in]  Xp  permuted array of database points (each row contains a point), \( X_{n_x \times d} \). 
[out]  NN  array of queries' nearest neighbors (each row contains a point), \( X_{n_q \times d} \). 
[in]  XO  array containing the offsets of the representative lists within the permuted database. Its size should be \( n_r*sizeof\ (uint) \). 
[in]  RID  array with the representative ids for each query in Qp. Its size should be \( n_q*sizeof\ (dist\_id) \). 
[in]  NNID  array with the minimum distances and NN ids per query. Its size should be \( n_q*sizeof\ (dist\_id) \). The NN ids index the points in the associated representative list. 
kernel void rbcGroupMinDists  (  global dist_id *  GM, 
global dist_id *  ID,  
global uint *  N,  
global uint *  Rnk,  
local dist_id *  data,  
uint  n,  
int  accCounters  
) 
Performs a reduce operation on the columns of an array.
Computes the minimum element, and its corresponding column id, for each row in an array. It also builds a histogram of the id values. And lastly, it stores the rank (order of insert) of each minimum element within its corresponding histogram bin.
2 dist_id
elements in a row of the array. The y dimension of the global workspace, \( gYdim \), should be equal to the number of rows, m
, in the array. That is, \( \ gYdim = m \). The local workspace should be 1
in the y dimension, and a power of 2 in the x dimension. It is recommended to use one wavefront/warp
per workgroup. N
counters. rbcNListInit
should be called, before rbcGroupMinDists
is dispatched, in order to initialize the counters.[in]  GM  input array of dist_id elements. 
[out]  ID  (reduced) output array of dist_id elements. When the kernel is dispatched with one workgroup per row, the array contains the final results, and its size should be \( m*sizeof\ (dist\_id) \). When the kernel is dispatched with more than one workgroup per row, the array contains the results from each block reduction, and its size should be \( wgXdim*m*sizeof\ (dist\_id) \). 
[in]  data  local buffer. Its size should be 2 dist_id elements for each workitem in a workgroup. That is, \( 2*lXdim*sizeof\ (dist\_id) \). 
[out]  N  array containing the cardinalities of the representative lists. Its size should be \( n*sizeof\ (uint) \). 
[out]  Rnk  array containing the rank (aka order, index) of each database point within the associated representative list. Its size should be \( m*sizeof\ (uint) \). 
[in]  n  number of elements in a row of the array. 
[in]  accCounters  flag to indicate whether or not to involve in the computation the list element counters, N , and element ranks, Rnk . 
kernel void rbcMinDists  (  global float4 *  D, 
global dist_id *  ID,  
global uint *  N,  
global uint *  Rnk,  
local dist_id *  data,  
uint  n,  
int  accCounters  
) 
Performs a reduce operation on the columns of an array.
Computes the minimum element, and its corresponding column id, for each row in an array. It also builds a histogram of the id values. And lastly, it stores the rank (order of insert) of each minimum element within its corresponding histogram bin.
n
, in a row of the array should be a multiple of 4 (the data are handled as float4
). The x dimension of the global workspace, \( gXdim \), should be greater or equal to the number of elements in a row of the array divided by 8. That is, \( \ gXdim \geq n/8 \). Each workitem handles 8 float
(= 2 float4
) elements in a row of the array. The y dimension of the global workspace, \( gYdim \), should be equal to the number of rows, m
, in the array. That is, \( \ gYdim = m \). The local workspace should be 1
in the y dimension, and a power of 2 in the x dimension. It is recommended to use one wavefront/warp
per workgroup. ID
, will contain the true minimums. When the elements are more than that, they are partitioned into blocks and reduced independently. In this case, the kernel outputs the minimums from each block reduction. A reduction should then be made on those minimums for the final results by dispatching the rbcGroupMinDists
kernel. N
counters. rbcNListInit
should be called, before rbcMinDists
is dispatched, in order to initialize the counters.[in]  D  input array of float elements. 
[out]  ID  (reduced) output array of dist_id elements. When the kernel is dispatched with one workgroup per row, the array contains the final results, and its size should be \( m*sizeof\ (dist\_id) \). When the kernel is dispatched with more than one workgroup per row, the array contains the results from each block reduction, and its size should be \( wgXdim*m*sizeof\ (dist\_id) \). 
[in]  data  local buffer. Its size should be 2 dist_id elements for each workitem in a workgroup. That is, \( 2*lXdim*sizeof\ (dist\_id) \). 
[out]  N  array containing the cardinalities of the representative lists. Its size should be \( n*sizeof\ (uint) \). 
[out]  Rnk  array containing the rank (aka order, index) of each database point within the associated representative list. Its size should be \( m*sizeof\ (uint) \). 
[in]  n  number of elements in a row of the array divided by 4. 
[in]  accCounters  flag to indicate whether or not to involve in the computation the list element counters, N , and element ranks, Rnk . 
kernel void rbcNInit  (  global uint4 *  N, 
uint  val  
) 
Performs an array initialization.
Initializes an 1D array with a provided value.
n
, in the array should be a multiple of 4 (the data are handled as uint4
). The global workspace should be one dimensional. The x dimension of the global workspace, \( gXdim \), should be equal to the number of elements in the array divided by 4. That is, \( \ gXdim=n/4 \). There is no requirement for the local workspace.[out]  N  array that is going to contain the cardinalities of the representative lists. Its size should be \( n*sizeof\ (uint) \). 
[in]  val  initialization value. 
kernel void rbcPermute  (  global float4 *  X, 
global dist_id *  ID,  
global float4 *  Xp,  
global dist_id *  IDp,  
global uint *  O,  
global uint *  Rnk,  
int  permID  
) 
Performs a permutation of the RBC
database.
Permutes the database points to form the representative lists and allow for coalesced access pattern during the search operation.
d
, should be a multiple of 4 (the data are handled as float4
). The x dimension of the global workspace, \( gXdim \), should be equal to the dimensionality of the points divided by 4. That is, \( \ gXdim=d/4 \). The y dimension of the global workspace, \( gYdim \), should be equal to the number of database points, \( n_x \). That is, \( \ gYdim = n_x \). There is no requirement for the local workspace.[in]  X  array of database points (each row contains a point), \( X_{n_x \times d} \). 
[in]  ID  array with the minimum distances and representative ids per database point in X. Its size should be \( n_x*sizeof\ (dist\_id) \). 
[out]  Xp  permuted array of database points (each row contains a point), \( X_{n_x \times d} \). 
[out]  IDp  array with the minimum distances and representative ids per database point in Xp. Its size should be \( n_x*sizeof\ (dist\_id) \). 
[in]  O  array containing the offsets of the representative lists within the permuted database. Its size should be \( n_r*sizeof\ (uint) \). 
[in]  Rnk  array containing the rank (aka order, index) of each database point within the associated representative list. Its size should be \( n_x*sizeof\ (uint) \). 
[in]  permID  flag to indicate whether or not to also permute the ID array. 
kernel void rbcPermute_Kinect  (  global float8 *  X, 
global dist_id *  ID,  
global float8 *  Xp,  
global dist_id *  IDp,  
global uint *  O,  
global uint *  Rnk,  
int  permID  
) 
Performs a permutation of the RBC
database for the case of Kinect data in \( \mathbb{R}^8 \).
Permutes the database points to form the representative lists and allow for coalesced access pattern during the search operation.
[in]  X  array of database points (each row contains a point), \( X_{n_x \times d} \). 
[in]  ID  array with the minimum distances and representative ids per database point in X. Its size should be \( n_x*sizeof\ (dist\_id) \). 
[out]  Xp  permuted array of database points (each row contains a point), \( X_{n_x \times d} \). 
[out]  IDp  array with the minimum distances and representative ids per database point in Xp. Its size should be \( n_x*sizeof\ (dist\_id) \). 
[in]  O  array containing the offsets of the representative lists within the permuted database. Its size should be \( n_r*sizeof\ (uint) \). 
[in]  Rnk  array containing the rank (aka order, index) of each database point within the associated representative list. Its size should be \( n_x*sizeof\ (uint) \). 
[in]  permID  flag to indicate whether or not to also permute the ID array. 