diff options
author | Luca Deri <lucaderi@users.noreply.github.com> | 2024-09-10 16:22:06 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-09-10 16:22:06 +0200 |
commit | 7fdc4b2472baec0ba0927f861a286ed39ac1c684 (patch) | |
tree | fcba54c19d799559433b48b1edbe2a68e366281e /src/lib | |
parent | f4d2002ce93f1129d5ebf844bad55edfb72216b7 (diff) |
Implemented algorithms for K-Nearest Neighbor Search (KNN) (#2554)
* Extended API with functions for vector similarity based on KD-trees https://en.wikipedia.org/wiki/K-d_tree
ndpi_kd_tree* ndpi_kd_create(u_int num_dimensions);
void ndpi_kd_free(ndpi_kd_tree *tree);
void ndpi_kd_clear(ndpi_kd_tree *tree);
bool ndpi_kd_insert(ndpi_kd_tree *tree, const double *data_vector, void *user_data);
ndpi_kd_tree_result *ndpi_kd_nearest(ndpi_kd_tree *tree, const double *data_vector);
u_int32_t ndpi_kd_num_results(ndpi_kd_tree_result *res);
bool ndpi_kd_result_end(ndpi_kd_tree_result *res);
double* ndpi_kd_result_get_item(ndpi_kd_tree_result *res, double **user_data);
bool ndpi_kd_result_next(ndpi_kd_tree_result *res);
void ndpi_kd_result_free(ndpi_kd_tree_result *res);
double ndpi_kd_distance(double *a1, double *b2, u_int num_dimensions);
Diffstat (limited to 'src/lib')
-rw-r--r-- | src/lib/ndpi_analyze.c | 67 | ||||
-rw-r--r-- | src/lib/third_party/include/ball.h | 107 | ||||
-rw-r--r-- | src/lib/third_party/include/kdtree.h | 121 | ||||
-rw-r--r-- | src/lib/third_party/src/ball.c | 573 | ||||
-rw-r--r-- | src/lib/third_party/src/kdtree.c | 664 |
5 files changed, 1530 insertions, 2 deletions
diff --git a/src/lib/ndpi_analyze.c b/src/lib/ndpi_analyze.c index 519058a54..c6132e429 100644 --- a/src/lib/ndpi_analyze.c +++ b/src/lib/ndpi_analyze.c @@ -31,7 +31,8 @@ #include "ndpi_api.h" #include "ndpi_config.h" #include "third_party/include/hll.h" - +#include "third_party/include/kdtree.h" +#include "third_party/include/ball.h" #include "ndpi_replace_printf.h" /* ********************************************************************************* */ @@ -1916,7 +1917,7 @@ u_int32_t ndpi_crc32(const void *data, size_t length, u_int32_t crc) { const u_int8_t *p = (const u_int8_t*)data; crc = ~crc; - + while (length--) { crc = crc32_ieee_table[(crc ^ *p++) & 0xFF] ^ (crc >> 8); @@ -2078,3 +2079,65 @@ void ndpi_popcount_count(struct ndpi_popcount *h, const u_int8_t *buf, u_int32_t h->tot_bytes_count += buf_len; } + +/* ********************************************************************************* */ +/* ********************************************************************************* */ + +ndpi_kd_tree* ndpi_kd_create(u_int num_dimensions) { return(kd_create((int)num_dimensions)); } + +void ndpi_kd_free(ndpi_kd_tree *tree) { kd_free((struct kdtree *)tree); } + +void ndpi_kd_clear(ndpi_kd_tree *tree) { kd_clear((struct kdtree *)tree); } + +bool ndpi_kd_insert(ndpi_kd_tree *tree, const double *data_vector, void *user_data) { + return(kd_insert((struct kdtree *)tree, data_vector, user_data) == 0 ? true : false); +} + +ndpi_kd_tree_result *ndpi_kd_nearest(ndpi_kd_tree *tree, const double *data_vector) { + return(kd_nearest((struct kdtree *)tree, data_vector)); +} + +u_int32_t ndpi_kd_num_results(ndpi_kd_tree_result *res) { return((u_int32_t)kd_res_size((struct kdres*)res)); } + +double* ndpi_kd_result_get_item(ndpi_kd_tree_result *res, double **user_data) { + return(kd_res_item((struct kdres*)res, user_data)); +} + +void ndpi_kd_result_free(ndpi_kd_tree_result *res) { kd_res_free((struct kdres *)res); } + +double ndpi_kd_distance(double *a1, double *a2, u_int num_dimensions) { + double dist_sq = 0, diff; + u_int i; + + for(i=0; i<num_dimensions; i++) { + diff = a1[i] - a2[i]; + +#if 0 + if(diff != 0) { + printf("Difference %.3f at position %u\n", diff, pos); + } +#endif + dist_sq += diff*diff; + } + + return(dist_sq); +} + +/* ********************************************************************************* */ +/* ********************************************************************************* */ + +ndpi_btree* ndpi_btree_init(double **data, u_int32_t n_rows, u_int32_t n_columns) { + return((ndpi_btree*)btree_init(data, (int)n_rows, (int)n_columns, 30)); +} + +ndpi_knn ndpi_btree_query(ndpi_btree *b, double **query_data, + u_int32_t query_data_num_rows, u_int32_t query_data_num_columns, + u_int32_t max_num_results) { + return(btree_query((t_btree*)b, query_data, (int)query_data_num_rows, + (int)query_data_num_columns, (int)max_num_results)); +} + +void ndpi_free_knn(ndpi_knn knn) { free_knn(knn, knn.n_samples); } + +void ndpi_free_btree(ndpi_btree *b) { free_tree((t_btree*)b); } + diff --git a/src/lib/third_party/include/ball.h b/src/lib/third_party/include/ball.h new file mode 100644 index 000000000..72d9793e3 --- /dev/null +++ b/src/lib/third_party/include/ball.h @@ -0,0 +1,107 @@ +/* ************************************************************************** */ +/* */ +/* ::: :::::::: */ +/* ball.h :+: :+: :+: */ +/* +:+ +:+ +:+ */ +/* By: elee <elee@student.42.us.org> +#+ +:+ +#+ */ +/* +#+#+#+#+#+ +#+ */ +/* Created: 2017/06/28 10:55:27 by elee #+# #+# */ +/* Updated: 2017/06/28 20:56:18 by elee ### ########.fr */ +/* */ +/* ************************************************************************** */ + +#ifndef BALL_H +# define BALL_H + +# include <stdio.h> +# include <stdlib.h> +# include <string.h> +# include <limits.h> +# include <math.h> +# include <float.h> +# include <sys/stat.h> +# include <time.h> + +# define TRUE 1 +# define FALSE 0 + +typedef struct s_data +{ + int n_neighbors; + int leaf_size; + double **data; + int n_samples; + int n_features; +} t_data; + +typedef struct s_nheap +{ + double **distances; + int **indices; + int n_pts; + int n_nbrs; +} t_nheap; + +#if 0 +/* See ndpi_typedefs.h */ +typedef struct s_knn +{ + double **distances; + int **indices; + int n_samples; + int n_neighbors; +} ndpi_knn; +#endif + +typedef struct s_nodedata +{ + int idx_start; + int idx_end; + int is_leaf; + double radius; +} t_nodedata; + +typedef struct s_btree +{ + double **data; + int *idx_array; + t_nodedata *node_data; + double ***node_bounds; + + int n_samples; + int n_features; + + int leaf_size; + int n_levels; + int n_nodes; +} t_btree; + + +/* +** metrics.c +*/ + +double manhattan_dist(double *x1, double *x2, int size); +double min_dist(t_btree *tree, int i_node, double *pt); + +/* +** neighbors_heap.c +*/ + +t_nheap *nheap_init(int n_pts, int n_nbrs); +double nheap_largest(t_nheap *h, int row); +int nheap_push(t_nheap *h, int row, double val, int i_val); +ndpi_knn nheap_get_arrays(t_nheap *h); + +/* +** ball.c +*/ + +t_btree *btree_init(double **data, int n_samples, int n_features, int leaf_size); +ndpi_knn btree_query(t_btree *b, double **x, int n_samples, int n_features, int k); +void free_2d_double(double **arr, int row); +void free_2d_int(int **arr, int row); +void free_tree(t_btree *tree); +void free_knn(ndpi_knn knn, int row); + +#endif diff --git a/src/lib/third_party/include/kdtree.h b/src/lib/third_party/include/kdtree.h new file mode 100644 index 000000000..e51e4ed2e --- /dev/null +++ b/src/lib/third_party/include/kdtree.h @@ -0,0 +1,121 @@ +/* + This file is part of ``kdtree'', a library for working with kd-trees. + Copyright (C) 2007-2011 John Tsiombikas <nuclear@member.fsf.org> + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + 3. The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY + OF SUCH DAMAGE. +*/ +#ifndef _KDTREE_H_ +#define _KDTREE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + + struct kdtree; + struct kdres; + + + /* create a kd-tree for "k"-dimensional data */ + struct kdtree *kd_create(int k); + + /* free the struct kdtree */ + void kd_free(struct kdtree *tree); + + /* remove all the elements from the tree */ + void kd_clear(struct kdtree *tree); + + /* if called with non-null 2nd argument, the function provided + * will be called on data pointers (see kd_insert) when nodes + * are to be removed from the tree. + */ + void kd_data_destructor(struct kdtree *tree, void (*destr)(void*)); + + /* insert a node, specifying its position, and optional data */ + int kd_insert(struct kdtree *tree, const double *pos, void *data); + int kd_insertf(struct kdtree *tree, const float *pos, void *data); + int kd_insert3(struct kdtree *tree, double x, double y, double z, void *data); + int kd_insert3f(struct kdtree *tree, float x, float y, float z, void *data); + + /* Find the nearest node from a given point. + * + * This function returns a pointer to a result set with at most one element. + */ + struct kdres *kd_nearest(struct kdtree *tree, const double *pos); + struct kdres *kd_nearestf(struct kdtree *tree, const float *pos); + struct kdres *kd_nearest3(struct kdtree *tree, double x, double y, double z); + struct kdres *kd_nearest3f(struct kdtree *tree, float x, float y, float z); + + /* Find the N nearest nodes from a given point. + * + * This function returns a pointer to a result set, with at most N elements, + * which can be manipulated with the kd_res_* functions. + * The returned pointer can be null as an indication of an error. Otherwise + * a valid result set is always returned which may contain 0 or more elements. + * The result set must be deallocated with kd_res_free after use. + */ + /* + struct kdres *kd_nearest_n(struct kdtree *tree, const double *pos, int num); + struct kdres *kd_nearest_nf(struct kdtree *tree, const float *pos, int num); + struct kdres *kd_nearest_n3(struct kdtree *tree, double x, double y, double z); + struct kdres *kd_nearest_n3f(struct kdtree *tree, float x, float y, float z); + */ + + /* Find any nearest nodes from a given point within a range. + * + * This function returns a pointer to a result set, which can be manipulated + * by the kd_res_* functions. + * The returned pointer can be null as an indication of an error. Otherwise + * a valid result set is always returned which may contain 0 or more elements. + * The result set must be deallocated with kd_res_free after use. + */ + struct kdres *kd_nearest_range(struct kdtree *tree, const double *pos, double range); + + /* frees a result set returned by kd_nearest_range() */ + void kd_res_free(struct kdres *set); + + /* returns the size of the result set (in elements) */ + int kd_res_size(struct kdres *set); + + /* rewinds the result set iterator */ + void kd_res_rewind(struct kdres *set); + + /* returns non-zero if the set iterator reached the end after the last element */ + int kd_res_end(struct kdres *set); + + /* advances the result set iterator, returns non-zero on success, zero if + * there are no more elements in the result set. + */ + int kd_res_next(struct kdres *set); + + /* returns the data pointer (can be null) of the current result set item + * and optionally sets its position to the pointers(s) if not null. + */ + double *kd_res_item(struct kdres *set, double **pos); /* ntop */ + + /* equivalent to kd_res_item(set, 0) */ + void *kd_res_item_data(struct kdres *set); +#ifdef __cplusplus +} +#endif + +#endif /* _KDTREE_H_ */ diff --git a/src/lib/third_party/src/ball.c b/src/lib/third_party/src/ball.c new file mode 100644 index 000000000..b80ce9d13 --- /dev/null +++ b/src/lib/third_party/src/ball.c @@ -0,0 +1,573 @@ +/* + + https://github.com/leb9212/BallTree + + MIT License + + Copyright (c) 2017 Eung Bum Lee + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + + https://varshasaini.in/kd-tree-and-ball-tree-knn-algorithm/ + +*/ +/* ************************************************************************** */ +/* */ +/* ::: :::::::: */ +/* ball.c :+: :+: :+: */ +/* +:+ +:+ +:+ */ +/* By: elee <elee@student.42.us.org> +#+ +:+ +#+ */ +/* +#+#+#+#+#+ +#+ */ +/* Created: 2017/06/28 10:45:02 by elee #+# #+# */ +/* Updated: 2017/06/28 20:56:01 by elee ### ########.fr */ +/* */ +/* ************************************************************************** */ + +#include "ndpi_api.h" +#include "ball.h" + +double **copy_double_arr(double **arr, int row, int col) +{ + double **copy; + int i, j; + + copy = (double**)ndpi_malloc(sizeof(double*) * row); + for (i = 0; i < row; i++) + { + copy[i] = (double*)ndpi_malloc(sizeof(double) * col); + for (j = 0; j < col; j++) + copy[i][j] = arr[i][j]; + } + return (copy); +} + +void swap(int *arr, int i1, int i2) +{ + int tmp; + + tmp = arr[i1]; + arr[i1] = arr[i2]; + arr[i2] = tmp; +} + +void btree_zero(t_btree *b) +{ + b->data = NULL; + b->idx_array = NULL; + b->node_data = NULL; + b->node_bounds = NULL; + + b->leaf_size = 40; + b->n_levels = 0; + b->n_nodes = 0; +} + +int init_node(t_btree *b, int i_node, int idx_start, int idx_end) +{ + int n_features; + int n_points; + int i, j; + double radius; + double *centroid; + + n_features = b->n_features; + n_points = idx_end - idx_start; + centroid = b->node_bounds[0][i_node]; + + for (j = 0; j < n_features; j++) + centroid[j] = 0.0; + + for (i = idx_start; i < idx_end; i++) + for (j = 0; j < n_features; j++) + centroid[j] += b->data[b->idx_array[i]][j]; + + for (j = 0; j < n_features; j++) + centroid[j] /= n_points; + + radius = 0.0; + for (i = idx_start; i < idx_end; i++) + radius = fmax(radius, manhattan_dist(centroid, b->data[b->idx_array[i]], n_features)); + + b->node_data[i_node].radius = radius; + b->node_data[i_node].idx_start = idx_start; + b->node_data[i_node].idx_end = idx_end; + return (0); +} + +int find_node_split_dim(double **data, int *node_indices, int n_features, int n_points) +{ + double min_val, max_val, val, spread, max_spread; + int i, j, j_max; + + j_max = 0; + max_spread = 0; + for (j = 0; j < n_features; j++) + { + max_val = data[node_indices[0]][j]; + min_val = max_val; + for (i = 1; i < n_points; i++) + { + val = data[node_indices[i]][j]; + max_val = fmax(max_val, val); + min_val = fmin(min_val, val); + } + spread = max_val - min_val; + if (spread > max_spread) + { + max_spread = spread; + j_max = j; + } + } + return (j_max); +} + +int partition_node_indices(double **data, int *node_indices, int split_dim, int split_index, + int n_features, int n_points) +{ + (void)n_features; + int left, right, midindex, i; + double d1, d2; + + left = 0; + right = n_points - 1; + + while (TRUE) + { + midindex = left; + for (i = left; i < right; i++) + { + d1 = data[node_indices[i]][split_dim]; + d2 = data[node_indices[right]][split_dim]; + if (d1 < d2) + { + swap(node_indices, i, midindex); + midindex += 1; + } + } + swap(node_indices, midindex, right); + if (midindex == split_index) + break ; + else if (midindex < split_index) + left = midindex + 1; + else + right = midindex - 1; + } + return (0); +} + +void recursive_build(t_btree *b, int i_node, int idx_start, int idx_end) +{ + int imax; + int n_features; + int n_points; + int n_mid; + + n_features = b->n_features; + n_points = idx_end - idx_start; + n_mid = n_points / 2; + + //initialize the node data + init_node(b, i_node, idx_start, idx_end); + + if (2 * i_node + 1 >= b->n_nodes) + { + b->node_data[i_node].is_leaf = TRUE; + /* + if (idx_end - idx_start > 2 * b->leaf_size) + printf("Memory layout is flawed: not enough nodes allocated"); + */ + } + else if (idx_end - idx_start < 2) + { + /* printf("Memory layout is flawed: too many nodes allocated"); */ + b->node_data[i_node].is_leaf = TRUE; + } + else + { + b->node_data[i_node].is_leaf = FALSE; + imax = find_node_split_dim(b->data, b->idx_array, n_features, n_points); + partition_node_indices(b->data, b->idx_array, imax, n_mid, n_features, n_points); + recursive_build(b, 2 * i_node + 1, idx_start, idx_start + n_mid); + recursive_build(b, 2 * i_node + 2, idx_start + n_mid, idx_end); + } +} + +t_btree *btree_init(double **data, int n_samples, int n_features, int leaf_size) +{ + t_btree *b; + int i, j; + + b = (t_btree*)ndpi_malloc(sizeof(t_btree)); + btree_zero(b); + + b->data = copy_double_arr(data, n_samples, n_features); + b->leaf_size = leaf_size; + + if (leaf_size < 1) + { + /* printf("leaf_size must be greater than or equal to 1\n"); */ + return(NULL); + } + + b->n_samples = n_samples; + b->n_features = n_features; + + b->n_levels = log2(fmax(1, (b->n_samples - 1) / b->leaf_size)) + 1; + b->n_nodes = pow(2.0, b->n_levels) - 1; + + b->idx_array = (int*)ndpi_malloc(sizeof(int) * b->n_samples); + for (i = 0; i < b->n_samples; i++) + b->idx_array[i] = i; + b->node_data = (t_nodedata*)ndpi_calloc(b->n_nodes, sizeof(t_nodedata)); + b->node_bounds = (double***)ndpi_malloc(sizeof(double**)); + b->node_bounds[0] = (double**)ndpi_malloc(sizeof(double*) * b->n_nodes); + for (i = 0; i < b->n_nodes; i++) + { + b->node_bounds[0][i] = (double*)ndpi_malloc(sizeof(double) * b->n_features); + for (j = 0; j < b->n_features; j++) + b->node_bounds[0][i][j] = 0.0; + } + recursive_build(b, 0, 0, b->n_samples); + return (b); +} + +int query_depth_first(t_btree *b, int i_node, double *pt, int i_pt, t_nheap *heap, double dist) +{ + t_nodedata node_info = b->node_data[i_node]; + double dist_pt, dist1, dist2; + int i, i1, i2; + + //case 1: query point is outside node radius: trim it from the query + if (dist > nheap_largest(heap, i_pt)) + { + ; + } + //case 2: this is a leaf node. Update set of nearby points + else if (node_info.is_leaf) + { + for (i = node_info.idx_start; i < node_info.idx_end; i++) + { + dist_pt = manhattan_dist(pt, b->data[b->idx_array[i]], b->n_features); + if (dist_pt < nheap_largest(heap, i_pt)) + nheap_push(heap, i_pt, dist_pt, b->idx_array[i]); + } + } + //case 3: Node is not a leaf, Recursively query sub-nodes starting with the closest + else + { + i1 = 2 * i_node +1; + i2 = i1 +1; + dist1 = min_dist(b, i1, pt); //implement min_rdist + dist2 = min_dist(b, i2, pt); + if (dist1 <= dist2) + { + query_depth_first(b, i1, pt, i_pt, heap, dist1); + query_depth_first(b, i2, pt, i_pt, heap, dist2); + } + else + { + query_depth_first(b, i2, pt, i_pt, heap, dist2); + query_depth_first(b, i1, pt, i_pt, heap, dist1); + } + } + return (0); +} + +ndpi_knn btree_query(t_btree *b, double **x, int n_samples, int n_features, int k) +{ + t_nheap *heap; + double dist; + int i; + ndpi_knn output; + + memset(&output, 0, sizeof(output)); + + if (n_features != b->n_features) + { + /* printf("query data dimension must match training data dimension.\n"); */ + return (output); + } + if (b->n_samples < k) + { + /* printf("k must be less than or equal to the number of training points.\n"); */ + return (output); + } + heap = nheap_init(n_samples, k); + for (i = 0; i < n_samples; i++) + { + dist = min_dist(b, 0, x[i]); + query_depth_first(b, 0, x[i], i, heap, dist); + } + output = nheap_get_arrays(heap); + return (output); +} + +void free_2d_double(double **arr, int row) +{ + int i; + + for (i = 0; i < row; i++) + ndpi_free(arr[i]); + ndpi_free(arr); +} + +void free_2d_int(int **arr, int row) +{ + int i; + + for (i = 0; i < row; i++) + ndpi_free(arr[i]); + ndpi_free(arr); +} + +void free_tree(t_btree *tree) +{ + free_2d_double(tree->data, tree->n_samples); + ndpi_free(tree->idx_array); + ndpi_free(tree->node_data); + free_2d_double(tree->node_bounds[0], tree->n_nodes); + ndpi_free(tree->node_bounds); + ndpi_free(tree); +} + +void free_knn(ndpi_knn knn, int row) +{ + free_2d_double(knn.distances, row); + free_2d_int(knn.indices, row); +} + +/* ************************************************************************** */ +/* */ +/* ::: :::::::: */ +/* metrics.c :+: :+: :+: */ +/* +:+ +:+ +:+ */ +/* By: elee <elee@student.42.us.org> +#+ +:+ +#+ */ +/* +#+#+#+#+#+ +#+ */ +/* Created: 2017/06/28 10:45:32 by elee #+# #+# */ +/* Updated: 2017/06/28 16:52:59 by elee ### ########.fr */ +/* */ +/* ************************************************************************** */ + +#include "ball.h" + +double manhattan_dist(double *x1, double *x2, int size) +{ + double d = 0; + int i; + + for (i = 0; i < size; i++) + d += fabs(x1[i] - x2[i]); + return (d); +} + +double min_dist(t_btree *tree, int i_node, double *pt) +{ + double dist_pt; + + dist_pt = manhattan_dist(pt, tree->node_bounds[0][i_node], tree->n_features); + return (fmax(0.0, dist_pt - tree->node_data[i_node].radius)); +} + +/* ************************************************************************** */ +/* */ +/* ::: :::::::: */ +/* neighbors_heap.c :+: :+: :+: */ +/* +:+ +:+ +:+ */ +/* By: elee <elee@student.42.us.org> +#+ +:+ +#+ */ +/* +#+#+#+#+#+ +#+ */ +/* Created: 2017/06/28 10:45:06 by elee #+# #+# */ +/* Updated: 2017/06/28 20:17:58 by elee ### ########.fr */ +/* */ +/* ************************************************************************** */ + +#include "ball.h" + +void dual_swap(double *darr, int *iarr, int i1, int i2) +{ + double dtmp; + int itmp; + + dtmp = darr[i1]; + darr[i1] = darr[i2]; + darr[i2] = dtmp; + itmp = iarr[i1]; + iarr[i1] = iarr[i2]; + iarr[i2] = itmp; +} + +t_nheap *nheap_init(int n_pts, int n_nbrs) +{ + t_nheap *h; + int i, j; + + h = (t_nheap*)ndpi_malloc(sizeof(t_nheap)); + h->n_pts = n_pts; + h->n_nbrs = n_nbrs; + h->distances = (double**)ndpi_malloc(sizeof(double*) * n_pts); + for (i = 0; i < n_pts; i++) + { + h->distances[i] = (double*)ndpi_malloc(sizeof(double) * n_nbrs); + for (j = 0; j < n_nbrs; j++) + h->distances[i][j] = INFINITY; + } + h->indices = (int**)ndpi_malloc(sizeof(int*) * n_pts); + for (i = 0; i < n_pts; i++) + h->indices[i] = (int*)ndpi_calloc(sizeof(int), n_nbrs); + return (h); +} + +double nheap_largest(t_nheap *h, int row) +{ + return (h->distances[row][0]); +} + +int nheap_push(t_nheap *h, int row, double val, int i_val) +{ + int i, ic1, ic2, i_swap; + int size; + double *dist_arr; + int *ind_arr; + + size = h->n_nbrs; + dist_arr = h->distances[row]; + ind_arr = h->indices[row]; + + // if distance is already greater than the furthest element, don't push + if (val > dist_arr[0]) + return (0); + + // insert the values at position 0 + dist_arr[0] = val; + ind_arr[0] = i_val; + + // descend the heap, swapping values until the max heap criterion is met + i = 0; + while (TRUE) + { + ic1 = 2 * i + 1; + ic2 = ic1 + 1; + + if (ic1 >= size) + break ; + else if (ic2 >= size) + { + if (dist_arr[ic1] > val) + i_swap = ic1; + else + break ; + } + else if (dist_arr[ic1] >= dist_arr[ic2]) + { + if (val < dist_arr[ic1]) + i_swap = ic1; + else + break ; + } + else + { + if (val < dist_arr[ic2]) + i_swap = ic2; + else + break ; + } + dist_arr[i] = dist_arr[i_swap]; + ind_arr[i] = ind_arr[i_swap]; + i = i_swap; + } + + dist_arr[i] = val; + ind_arr[i] = i_val; + + return (0); +} + +void simultaneous_sort(double *dist, int *idx, int size) +{ + int pivot_idx, i, store_idx; + double pivot_val; + + if (size <= 1) + ; + else if (size == 2) + { + if (dist[0] > dist[1]) + dual_swap(dist, idx, 0, 1); + } + else if (size == 3) + { + if (dist[0] > dist[1]) + dual_swap(dist, idx, 0, 1); + if (dist[1] > dist[2]) + { + dual_swap(dist, idx, 1, 2); + if (dist[0] > dist[1]) + dual_swap(dist, idx, 0, 1); + } + } + else + { + pivot_idx = size / 2; + if (dist[0] > dist[size - 1]) + dual_swap(dist, idx, 0, size - 1); + if (dist[size - 1] > dist[pivot_idx]) + { + dual_swap(dist, idx, size - 1, pivot_idx); + if (dist[0] > dist[size - 1]) + dual_swap(dist, idx, 0, size - 1); + } + pivot_val = dist[size - 1]; + + store_idx = 0; + for (i = 0; i < size - 1; i++) + { + if (dist[i] < pivot_val) + { + dual_swap(dist, idx, i, store_idx); + store_idx++; + } + } + dual_swap(dist, idx, store_idx, size - 1); + pivot_idx = store_idx; + if (pivot_idx > 1) + simultaneous_sort(dist, idx, pivot_idx); + if (pivot_idx * 2 < size) + simultaneous_sort(dist + pivot_idx + 1, idx + pivot_idx + 1, size - pivot_idx - 1); + } +} + +void nheap_sort(t_nheap *h) +{ + int row; + + for (row = 0; row < h->n_pts; row++) + simultaneous_sort(h->distances[row], h->indices[row], h->n_nbrs); +} + +ndpi_knn nheap_get_arrays(t_nheap *h) +{ + ndpi_knn output; + + nheap_sort(h); + output.distances = h->distances; + output.indices = h->indices; + output.n_samples = h->n_pts; + output.n_neighbors = h->n_nbrs; + ndpi_free(h); + return (output); +} diff --git a/src/lib/third_party/src/kdtree.c b/src/lib/third_party/src/kdtree.c new file mode 100644 index 000000000..a3a2823c8 --- /dev/null +++ b/src/lib/third_party/src/kdtree.c @@ -0,0 +1,664 @@ +/* + Code taken from https://github.com/jtsiomb/kdtree + + See https://www.cs.cmu.edu/~ckingsf/bioinfo-lectures/kdtrees.pdf for details + + for high cardinalities, KD-trees are outpaced by + - https://github.com/nmslib/hnswlib + - https://en.wikipedia.org/wiki/Hierarchical_navigable_small_world +*/ +/* + This file is part of ``kdtree'', a library for working with kd-trees. + Copyright (C) 2007-2011 John Tsiombikas <nuclear@member.fsf.org> + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + 3. The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY + OF SUCH DAMAGE. +*/ +/* single nearest neighbor search written by Tamas Nepusz <tamas@cs.rhul.ac.uk> */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include "kdtree.h" + +#if defined(WIN32) || defined(__WIN32__) +#include <malloc.h> +#endif + +#ifdef USE_LIST_NODE_ALLOCATOR + +#ifndef NO_PTHREADS +#include <pthread.h> +#else + +#ifndef I_WANT_THREAD_BUGS +#error "You are compiling with the fast list node allocator, with pthreads disabled! This WILL break if used from multiple threads." +#endif /* I want thread bugs */ + +#endif /* pthread support */ +#endif /* use list node allocator */ + +#include "ndpi_api.h" + +struct kdhyperrect { + int dim; + double *min, *max; /* minimum/maximum coords */ +}; + +struct kdnode { + double *pos; + int dir; + void *data; + + struct kdnode *left, *right; /* negative/positive side */ +}; + +struct res_node { + struct kdnode *item; + double dist_sq; + struct res_node *next; +}; + +struct kdtree { + int dim; + struct kdnode *root; + struct kdhyperrect *rect; + void (*destr)(void*); +}; + +struct kdres { + struct kdtree *tree; + struct res_node *rlist, *riter; + int size; +}; + +#define SQ(x) ((x) * (x)) + + +static void clear_rec(struct kdnode *node, void (*destr)(void*)); +static int insert_rec(struct kdnode **node, const double *pos, void *data, int dir, int dim); +static int rlist_insert(struct res_node *list, struct kdnode *item, double dist_sq); +static void clear_results(struct kdres *set); + +static struct kdhyperrect* hyperrect_create(int dim, const double *min, const double *max); +static void hyperrect_free(struct kdhyperrect *rect); +static struct kdhyperrect* hyperrect_duplicate(const struct kdhyperrect *rect); +static void hyperrect_extend(struct kdhyperrect *rect, const double *pos); +static double hyperrect_dist_sq(struct kdhyperrect *rect, const double *pos); + +#ifdef USE_LIST_NODE_ALLOCATOR +static struct res_node *alloc_resnode(void); +static void free_resnode(struct res_node*); +#else +#define alloc_resnode() ndpi_malloc(sizeof(struct res_node)) +#define free_resnode(n) ndpi_free(n) +#endif + + + +struct kdtree *kd_create(int k) +{ + struct kdtree *tree; + + if(!(tree = ndpi_malloc(sizeof *tree))) { + return 0; + } + + tree->dim = k; + tree->root = 0; + tree->destr = 0; + tree->rect = 0; + + return tree; +} + +void kd_free(struct kdtree *tree) +{ + if(tree) { + kd_clear(tree); + ndpi_free(tree); + } +} + +static void clear_rec(struct kdnode *node, void (*destr)(void*)) +{ + if(!node) return; + + clear_rec(node->left, destr); + clear_rec(node->right, destr); + + if(destr) { + destr(node->data); + } + ndpi_free(node->pos); + ndpi_free(node); +} + +void kd_clear(struct kdtree *tree) +{ + clear_rec(tree->root, tree->destr); + tree->root = 0; + + if (tree->rect) { + hyperrect_free(tree->rect); + tree->rect = 0; + } +} + +void kd_data_destructor(struct kdtree *tree, void (*destr)(void*)) +{ + tree->destr = destr; +} + + +static int insert_rec(struct kdnode **nptr, const double *pos, void *data, int dir, int dim) +{ + int new_dir; + struct kdnode *node; + + if(!*nptr) { + if(!(node = ndpi_malloc(sizeof *node))) { + return -1; + } + if(!(node->pos = ndpi_malloc(dim * sizeof(double)))) { + ndpi_free(node); + return -1; + } + memcpy(node->pos, pos, dim * sizeof(double)); + node->data = data; + node->dir = dir; + node->left = node->right = 0; + *nptr = node; + return 0; + } + + node = *nptr; + new_dir = (node->dir + 1) % dim; + if(pos[node->dir] < node->pos[node->dir]) { + return insert_rec(&(*nptr)->left, pos, data, new_dir, dim); + } + return insert_rec(&(*nptr)->right, pos, data, new_dir, dim); +} + +int kd_insert(struct kdtree *tree, const double *pos, void *data) +{ + if (insert_rec(&tree->root, pos, data, 0, tree->dim)) { + return -1; + } + + if (tree->rect == 0) { + tree->rect = hyperrect_create(tree->dim, pos, pos); + } else { + hyperrect_extend(tree->rect, pos); + } + + return 0; +} + +static int find_nearest(struct kdnode *node, const double *pos, double range, struct res_node *list, int ordered, int dim) +{ + double dist_sq, dx; + int i, ret, added_res = 0; + + if(!node) return 0; + + dist_sq = 0; + for(i=0; i<dim; i++) { + dist_sq += SQ(node->pos[i] - pos[i]); + } + if(dist_sq <= SQ(range)) { + if(rlist_insert(list, node, ordered ? dist_sq : -1.0) == -1) { + return -1; + } + added_res = 1; + } + + dx = pos[node->dir] - node->pos[node->dir]; + + ret = find_nearest(dx <= 0.0 ? node->left : node->right, pos, range, list, ordered, dim); + if(ret >= 0 && fabs(dx) < range) { + added_res += ret; + ret = find_nearest(dx <= 0.0 ? node->right : node->left, pos, range, list, ordered, dim); + } + if(ret == -1) { + return -1; + } + added_res += ret; + + return added_res; +} + +#if 0 +static int find_nearest_n(struct kdnode *node, const double *pos, double range, int num, struct rheap *heap, int dim) +{ + double dist_sq, dx; + int i, ret, added_res = 0; + + if(!node) return 0; + + /* if the photon is close enough, add it to the result heap */ + dist_sq = 0; + for(i=0; i<dim; i++) { + dist_sq += SQ(node->pos[i] - pos[i]); + } + if(dist_sq <= range_sq) { + if(heap->size >= num) { + /* get furthest element */ + struct res_node *maxelem = rheap_get_max(heap); + + /* and check if the new one is closer than that */ + if(maxelem->dist_sq > dist_sq) { + rheap_remove_max(heap); + + if(rheap_insert(heap, node, dist_sq) == -1) { + return -1; + } + added_res = 1; + + range_sq = dist_sq; + } + } else { + if(rheap_insert(heap, node, dist_sq) == -1) { + return =1; + } + added_res = 1; + } + } + + + /* find signed distance from the splitting plane */ + dx = pos[node->dir] - node->pos[node->dir]; + + ret = find_nearest_n(dx <= 0.0 ? node->left : node->right, pos, range, num, heap, dim); + if(ret >= 0 && fabs(dx) < range) { + added_res += ret; + ret = find_nearest_n(dx <= 0.0 ? node->right : node->left, pos, range, num, heap, dim); + } + +} +#endif + +static void kd_nearest_i(struct kdnode *node, const double *pos, struct kdnode **result, double *result_dist_sq, struct kdhyperrect* rect) +{ + int dir = node->dir; + int i; + double dummy, dist_sq; + struct kdnode *nearer_subtree, *farther_subtree; + double *nearer_hyperrect_coord, *farther_hyperrect_coord; + + /* Decide whether to go left or right in the tree */ + dummy = pos[dir] - node->pos[dir]; + if (dummy <= 0) { + nearer_subtree = node->left; + farther_subtree = node->right; + nearer_hyperrect_coord = rect->max + dir; + farther_hyperrect_coord = rect->min + dir; + } else { + nearer_subtree = node->right; + farther_subtree = node->left; + nearer_hyperrect_coord = rect->min + dir; + farther_hyperrect_coord = rect->max + dir; + } + + if (nearer_subtree) { + /* Slice the hyperrect to get the hyperrect of the nearer subtree */ + dummy = *nearer_hyperrect_coord; + *nearer_hyperrect_coord = node->pos[dir]; + /* Recurse down into nearer subtree */ + kd_nearest_i(nearer_subtree, pos, result, result_dist_sq, rect); + /* Undo the slice */ + *nearer_hyperrect_coord = dummy; + } + + /* Check the distance of the point at the current node, compare it + * with our best so far */ + dist_sq = 0; + for(i=0; i < rect->dim; i++) { + dist_sq += SQ(node->pos[i] - pos[i]); + } + if (dist_sq < *result_dist_sq) { + *result = node; + *result_dist_sq = dist_sq; + } + + if (farther_subtree) { + /* Get the hyperrect of the farther subtree */ + dummy = *farther_hyperrect_coord; + *farther_hyperrect_coord = node->pos[dir]; + /* Check if we have to recurse down by calculating the closest + * point of the hyperrect and see if it's closer than our + * minimum distance in result_dist_sq. */ + if (hyperrect_dist_sq(rect, pos) < *result_dist_sq) { + /* Recurse down into farther subtree */ + kd_nearest_i(farther_subtree, pos, result, result_dist_sq, rect); + } + /* Undo the slice on the hyperrect */ + *farther_hyperrect_coord = dummy; + } +} + +struct kdres *kd_nearest(struct kdtree *kd, const double *pos) +{ + struct kdhyperrect *rect; + struct kdnode *result; + struct kdres *rset; + double dist_sq; + int i; + + if (!kd) return 0; + if (!kd->rect) return 0; + + /* Allocate result set */ + if(!(rset = ndpi_malloc(sizeof *rset))) { + return 0; + } + if(!(rset->rlist = alloc_resnode())) { + ndpi_free(rset); + return 0; + } + rset->rlist->next = 0; + rset->tree = kd; + + /* Duplicate the bounding hyperrectangle, we will work on the copy */ + if (!(rect = hyperrect_duplicate(kd->rect))) { + kd_res_free(rset); + return 0; + } + + /* Our first guesstimate is the root node */ + result = kd->root; + dist_sq = 0; + for (i = 0; i < kd->dim; i++) + dist_sq += SQ(result->pos[i] - pos[i]); + + /* Search for the nearest neighbour recursively */ + kd_nearest_i(kd->root, pos, &result, &dist_sq, rect); + + /* Free the copy of the hyperrect */ + hyperrect_free(rect); + + /* Store the result */ + if (result) { + if (rlist_insert(rset->rlist, result, -1.0) == -1) { + kd_res_free(rset); + return 0; + } + rset->size = 1; + kd_res_rewind(rset); + return rset; + } else { + kd_res_free(rset); + return 0; + } +} + +/* ---- nearest N search ---- */ +/* + static kdres *kd_nearest_n(struct kdtree *kd, const double *pos, int num) + { + int ret; + struct kdres *rset; + + if(!(rset = ndpi_malloc(sizeof *rset))) { + return 0; + } + if(!(rset->rlist = alloc_resnode())) { + ndpi_free(rset); + return 0; + } + rset->rlist->next = 0; + rset->tree = kd; + + if((ret = find_nearest_n(kd->root, pos, range, num, rset->rlist, kd->dim)) == -1) { + kd_res_free(rset); + return 0; + } + rset->size = ret; + kd_res_rewind(rset); + return rset; + }*/ + +struct kdres *kd_nearest_range(struct kdtree *kd, const double *pos, double range) +{ + int ret; + struct kdres *rset; + + if(!(rset = ndpi_malloc(sizeof *rset))) { + return 0; + } + if(!(rset->rlist = alloc_resnode())) { + ndpi_free(rset); + return 0; + } + rset->rlist->next = 0; + rset->tree = kd; + + if((ret = find_nearest(kd->root, pos, range, rset->rlist, 0, kd->dim)) == -1) { + kd_res_free(rset); + return 0; + } + rset->size = ret; + kd_res_rewind(rset); + return rset; +} + +void kd_res_free(struct kdres *rset) +{ + clear_results(rset); + free_resnode(rset->rlist); + ndpi_free(rset); +} + +int kd_res_size(struct kdres *set) +{ + return (set->size); +} + +void kd_res_rewind(struct kdres *rset) +{ + rset->riter = rset->rlist->next; +} + +int kd_res_end(struct kdres *rset) +{ + return rset->riter == 0; +} + +int kd_res_next(struct kdres *rset) +{ + rset->riter = rset->riter->next; + return rset->riter != 0; +} + +double *kd_res_item(struct kdres *rset, double **user_data) +{ + if(rset->riter) { + if(user_data) + *user_data = rset->riter->item->data; + + return(rset->riter->item->pos); + } + + return NULL; +} + +void *kd_res_item_data(struct kdres *set) +{ + return kd_res_item(set, 0); +} + +/* ---- hyperrectangle helpers ---- */ +static struct kdhyperrect* hyperrect_create(int dim, const double *min, const double *max) +{ + size_t size = dim * sizeof(double); + struct kdhyperrect* rect = 0; + + if (!(rect = ndpi_malloc(sizeof(struct kdhyperrect)))) { + return 0; + } + + rect->dim = dim; + if (!(rect->min = ndpi_malloc(size))) { + ndpi_free(rect); + return 0; + } + if (!(rect->max = ndpi_malloc(size))) { + ndpi_free(rect->min); + ndpi_free(rect); + return 0; + } + memcpy(rect->min, min, size); + memcpy(rect->max, max, size); + + return rect; +} + +static void hyperrect_free(struct kdhyperrect *rect) +{ + ndpi_free(rect->min); + ndpi_free(rect->max); + ndpi_free(rect); +} + +static struct kdhyperrect* hyperrect_duplicate(const struct kdhyperrect *rect) +{ + return hyperrect_create(rect->dim, rect->min, rect->max); +} + +static void hyperrect_extend(struct kdhyperrect *rect, const double *pos) +{ + int i; + + for (i=0; i < rect->dim; i++) { + if (pos[i] < rect->min[i]) { + rect->min[i] = pos[i]; + } + if (pos[i] > rect->max[i]) { + rect->max[i] = pos[i]; + } + } +} + +static double hyperrect_dist_sq(struct kdhyperrect *rect, const double *pos) +{ + int i; + double result = 0; + + for (i=0; i < rect->dim; i++) { + if (pos[i] < rect->min[i]) { + result += SQ(rect->min[i] - pos[i]); + } else if (pos[i] > rect->max[i]) { + result += SQ(rect->max[i] - pos[i]); + } + } + + return result; +} + +/* ---- static helpers ---- */ + +#ifdef USE_LIST_NODE_ALLOCATOR +/* special list node allocators. */ +static struct res_node *free_nodes; + +#ifndef NO_PTHREADS +static pthread_mutex_t alloc_mutex = PTHREAD_MUTEX_INITIALIZER; +#endif + +static struct res_node *alloc_resnode(void) +{ + struct res_node *node; + +#ifndef NO_PTHREADS + pthread_mutex_lock(&alloc_mutex); +#endif + + if(!free_nodes) { + node = ndpi_malloc(sizeof *node); + } else { + node = free_nodes; + free_nodes = free_nodes->next; + node->next = 0; + } + +#ifndef NO_PTHREADS + pthread_mutex_unlock(&alloc_mutex); +#endif + + return node; +} + +static void free_resnode(struct res_node *node) +{ +#ifndef NO_PTHREADS + pthread_mutex_lock(&alloc_mutex); +#endif + + node->next = free_nodes; + free_nodes = node; + +#ifndef NO_PTHREADS + pthread_mutex_unlock(&alloc_mutex); +#endif +} +#endif /* list node allocator or not */ + + +/* inserts the item. if dist_sq is >= 0, then do an ordered insert */ +/* TODO make the ordering code use heapsort */ +static int rlist_insert(struct res_node *list, struct kdnode *item, double dist_sq) +{ + struct res_node *rnode; + + if(!(rnode = alloc_resnode())) { + return -1; + } + rnode->item = item; + rnode->dist_sq = dist_sq; + + if(dist_sq >= 0.0) { + while(list->next && list->next->dist_sq < dist_sq) { + list = list->next; + } + } + rnode->next = list->next; + list->next = rnode; + return 0; +} + +static void clear_results(struct kdres *rset) +{ + struct res_node *tmp, *node = rset->rlist->next; + + while(node) { + tmp = node; + node = node->next; + free_resnode(tmp); + } + + rset->rlist->next = 0; +} |