src/kd_split.cpp
author Dmitriy Morozov <dmitriy@mrzv.org>
Fri, 17 Jul 2009 13:54:30 -0700
changeset 3 f1e962ffa1c2
parent 0 e2bb6f169431
permissions -rw-r--r--
ANNkd_tree:annkSearch() is reentrant, in particular it works with OpenMP

//----------------------------------------------------------------------
// File:			kd_split.cpp
// Programmer:		Sunil Arya and David Mount
// Description:		Methods for splitting kd-trees
// Last modified:	01/04/05 (Version 1.0)
//----------------------------------------------------------------------
// Copyright (c) 1997-2005 University of Maryland and Sunil Arya and
// David Mount.  All Rights Reserved.
// 
// This software and related documentation is part of the Approximate
// Nearest Neighbor Library (ANN).  This software is provided under
// the provisions of the Lesser GNU Public License (LGPL).  See the
// file ../ReadMe.txt for further information.
// 
// The University of Maryland (U.M.) and the authors make no
// representations about the suitability or fitness of this software for
// any purpose.  It is provided "as is" without express or implied
// warranty.
//----------------------------------------------------------------------
// History:
//	Revision 0.1  03/04/98
//		Initial release
//	Revision 1.0  04/01/05
//----------------------------------------------------------------------

#include "kd_tree.h"					// kd-tree definitions
#include "kd_util.h"					// kd-tree utilities
#include "kd_split.h"					// splitting functions

//----------------------------------------------------------------------
//	Constants
//----------------------------------------------------------------------

const double ERR = 0.001;				// a small value
const double FS_ASPECT_RATIO = 3.0;		// maximum allowed aspect ratio
										// in fair split. Must be >= 2.

//----------------------------------------------------------------------
//	kd_split - Bentley's standard splitting routine for kd-trees
//		Find the dimension of the greatest spread, and split
//		just before the median point along this dimension.
//----------------------------------------------------------------------

void kd_split(
	ANNpointArray		pa,				// point array (permuted on return)
	ANNidxArray			pidx,			// point indices
	const ANNorthRect	&bnds,			// bounding rectangle for cell
	int					n,				// number of points
	int					dim,			// dimension of space
	int					&cut_dim,		// cutting dimension (returned)
	ANNcoord			&cut_val,		// cutting value (returned)
	int					&n_lo)			// num of points on low side (returned)
{
										// find dimension of maximum spread
	cut_dim = annMaxSpread(pa, pidx, n, dim);
	n_lo = n/2;							// median rank
										// split about median
	annMedianSplit(pa, pidx, n, cut_dim, cut_val, n_lo);
}

//----------------------------------------------------------------------
//	midpt_split - midpoint splitting rule for box-decomposition trees
//
//		This is the simplest splitting rule that guarantees boxes
//		of bounded aspect ratio.  It simply cuts the box with the
//		longest side through its midpoint.  If there are ties, it
//		selects the dimension with the maximum point spread.
//
//		WARNING: This routine (while simple) doesn't seem to work
//		well in practice in high dimensions, because it tends to
//		generate a large number of trivial and/or unbalanced splits.
//		Either kd_split(), sl_midpt_split(), or fair_split() are
//		recommended, instead.
//----------------------------------------------------------------------

void midpt_split(
	ANNpointArray		pa,				// point array
	ANNidxArray			pidx,			// point indices (permuted on return)
	const ANNorthRect	&bnds,			// bounding rectangle for cell
	int					n,				// number of points
	int					dim,			// dimension of space
	int					&cut_dim,		// cutting dimension (returned)
	ANNcoord			&cut_val,		// cutting value (returned)
	int					&n_lo)			// num of points on low side (returned)
{
	int d;

	ANNcoord max_length = bnds.hi[0] - bnds.lo[0];
	for (d = 1; d < dim; d++) {			// find length of longest box side
		ANNcoord length = bnds.hi[d] - bnds.lo[d];
		if (length > max_length) {
			max_length = length;
		}
	}
	ANNcoord max_spread = -1;			// find long side with most spread
	for (d = 0; d < dim; d++) {
										// is it among longest?
		if (double(bnds.hi[d] - bnds.lo[d]) >= (1-ERR)*max_length) {
										// compute its spread
			ANNcoord spr = annSpread(pa, pidx, n, d);
			if (spr > max_spread) {		// is it max so far?
				max_spread = spr;
				cut_dim = d;
			}
		}
	}
										// split along cut_dim at midpoint
	cut_val = (bnds.lo[cut_dim] + bnds.hi[cut_dim]) / 2;
										// permute points accordingly
	int br1, br2;
	annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
	//------------------------------------------------------------------
	//	On return:		pa[0..br1-1] < cut_val
	//					pa[br1..br2-1] == cut_val
	//					pa[br2..n-1] > cut_val
	//
	//	We can set n_lo to any value in the range [br1..br2].
	//	We choose split so that points are most evenly divided.
	//------------------------------------------------------------------
	if (br1 > n/2) n_lo = br1;
	else if (br2 < n/2) n_lo = br2;
	else n_lo = n/2;
}

//----------------------------------------------------------------------
//	sl_midpt_split - sliding midpoint splitting rule
//
//		This is a modification of midpt_split, which has the nonsensical
//		name "sliding midpoint".  The idea is that we try to use the
//		midpoint rule, by bisecting the longest side.  If there are
//		ties, the dimension with the maximum spread is selected.  If,
//		however, the midpoint split produces a trivial split (no points
//		on one side of the splitting plane) then we slide the splitting
//		(maintaining its orientation) until it produces a nontrivial
//		split. For example, if the splitting plane is along the x-axis,
//		and all the data points have x-coordinate less than the x-bisector,
//		then the split is taken along the maximum x-coordinate of the
//		data points.
//
//		Intuitively, this rule cannot generate trivial splits, and
//		hence avoids midpt_split's tendency to produce trees with
//		a very large number of nodes.
//
//----------------------------------------------------------------------

void sl_midpt_split(
	ANNpointArray		pa,				// point array
	ANNidxArray			pidx,			// point indices (permuted on return)
	const ANNorthRect	&bnds,			// bounding rectangle for cell
	int					n,				// number of points
	int					dim,			// dimension of space
	int					&cut_dim,		// cutting dimension (returned)
	ANNcoord			&cut_val,		// cutting value (returned)
	int					&n_lo)			// num of points on low side (returned)
{
	int d;

	ANNcoord max_length = bnds.hi[0] - bnds.lo[0];
	for (d = 1; d < dim; d++) {			// find length of longest box side
		ANNcoord length = bnds.hi[d] - bnds.lo[d];
		if (length > max_length) {
			max_length = length;
		}
	}
	ANNcoord max_spread = -1;			// find long side with most spread
	for (d = 0; d < dim; d++) {
										// is it among longest?
		if ((bnds.hi[d] - bnds.lo[d]) >= (1-ERR)*max_length) {
										// compute its spread
			ANNcoord spr = annSpread(pa, pidx, n, d);
			if (spr > max_spread) {		// is it max so far?
				max_spread = spr;
				cut_dim = d;
			}
		}
	}
										// ideal split at midpoint
	ANNcoord ideal_cut_val = (bnds.lo[cut_dim] + bnds.hi[cut_dim])/2;

	ANNcoord min, max;
	annMinMax(pa, pidx, n, cut_dim, min, max);	// find min/max coordinates

	if (ideal_cut_val < min)			// slide to min or max as needed
		cut_val = min;
	else if (ideal_cut_val > max)
		cut_val = max;
	else
		cut_val = ideal_cut_val;

										// permute points accordingly
	int br1, br2;
	annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
	//------------------------------------------------------------------
	//	On return:		pa[0..br1-1] < cut_val
	//					pa[br1..br2-1] == cut_val
	//					pa[br2..n-1] > cut_val
	//
	//	We can set n_lo to any value in the range [br1..br2] to satisfy
	//	the exit conditions of the procedure.
	//
	//	if ideal_cut_val < min (implying br2 >= 1),
	//			then we select n_lo = 1 (so there is one point on left) and
	//	if ideal_cut_val > max (implying br1 <= n-1),
	//			then we select n_lo = n-1 (so there is one point on right).
	//	Otherwise, we select n_lo as close to n/2 as possible within
	//			[br1..br2].
	//------------------------------------------------------------------
	if (ideal_cut_val < min) n_lo = 1;
	else if (ideal_cut_val > max) n_lo = n-1;
	else if (br1 > n/2) n_lo = br1;
	else if (br2 < n/2) n_lo = br2;
	else n_lo = n/2;
}

//----------------------------------------------------------------------
//	fair_split - fair-split splitting rule
//
//		This is a compromise between the kd-tree splitting rule (which
//		always splits data points at their median) and the midpoint
//		splitting rule (which always splits a box through its center.
//		The goal of this procedure is to achieve both nicely balanced
//		splits, and boxes of bounded aspect ratio.
//
//		A constant FS_ASPECT_RATIO is defined. Given a box, those sides
//		which can be split so that the ratio of the longest to shortest
//		side does not exceed ASPECT_RATIO are identified.  Among these
//		sides, we select the one in which the points have the largest
//		spread. We then split the points in a manner which most evenly
//		distributes the points on either side of the splitting plane,
//		subject to maintaining the bound on the ratio of long to short
//		sides. To determine that the aspect ratio will be preserved,
//		we determine the longest side (other than this side), and
//		determine how narrowly we can cut this side, without causing the
//		aspect ratio bound to be exceeded (small_piece).
//
//		This procedure is more robust than either kd_split or midpt_split,
//		but is more complicated as well.  When point distribution is
//		extremely skewed, this degenerates to midpt_split (actually
//		1/3 point split), and when the points are most evenly distributed,
//		this degenerates to kd-split.
//----------------------------------------------------------------------

void fair_split(
	ANNpointArray		pa,				// point array
	ANNidxArray			pidx,			// point indices (permuted on return)
	const ANNorthRect	&bnds,			// bounding rectangle for cell
	int					n,				// number of points
	int					dim,			// dimension of space
	int					&cut_dim,		// cutting dimension (returned)
	ANNcoord			&cut_val,		// cutting value (returned)
	int					&n_lo)			// num of points on low side (returned)
{
	int d;
	ANNcoord max_length = bnds.hi[0] - bnds.lo[0];
	cut_dim = 0;
	for (d = 1; d < dim; d++) {			// find length of longest box side
		ANNcoord length = bnds.hi[d] - bnds.lo[d];
		if (length > max_length) {
			max_length = length;
			cut_dim = d;
		}
	}

	ANNcoord max_spread = 0;			// find legal cut with max spread
	cut_dim = 0;
	for (d = 0; d < dim; d++) {
		ANNcoord length = bnds.hi[d] - bnds.lo[d];
										// is this side midpoint splitable
										// without violating aspect ratio?
		if (((double) max_length)*2.0/((double) length) <= FS_ASPECT_RATIO) {
										// compute spread along this dim
			ANNcoord spr = annSpread(pa, pidx, n, d);
			if (spr > max_spread) {		// best spread so far
				max_spread = spr;
				cut_dim = d;			// this is dimension to cut
			}
		}
	}

	max_length = 0;						// find longest side other than cut_dim
	for (d = 0; d < dim; d++) {
		ANNcoord length = bnds.hi[d] - bnds.lo[d];
		if (d != cut_dim && length > max_length)
			max_length = length;
	}
										// consider most extreme splits
	ANNcoord small_piece = max_length / FS_ASPECT_RATIO;
	ANNcoord lo_cut = bnds.lo[cut_dim] + small_piece;// lowest legal cut
	ANNcoord hi_cut = bnds.hi[cut_dim] - small_piece;// highest legal cut

	int br1, br2;
										// is median below lo_cut ?
	if (annSplitBalance(pa, pidx, n, cut_dim, lo_cut) >= 0) {
		cut_val = lo_cut;				// cut at lo_cut
		annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
		n_lo = br1;
	}
										// is median above hi_cut?
	else if (annSplitBalance(pa, pidx, n, cut_dim, hi_cut) <= 0) {
		cut_val = hi_cut;				// cut at hi_cut
		annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
		n_lo = br2;
	}
	else {								// median cut preserves asp ratio
		n_lo = n/2;						// split about median
		annMedianSplit(pa, pidx, n, cut_dim, cut_val, n_lo);
	}
}

//----------------------------------------------------------------------
//	sl_fair_split - sliding fair split splitting rule
//
//		Sliding fair split is a splitting rule that combines the
//		strengths of both fair split with sliding midpoint split.
//		Fair split tends to produce balanced splits when the points
//		are roughly uniformly distributed, but it can produce many
//		trivial splits when points are highly clustered.  Sliding
//		midpoint never produces trivial splits, and shrinks boxes
//		nicely if points are highly clustered, but it may produce
//		rather unbalanced splits when points are unclustered but not
//		quite uniform.
//
//		Sliding fair split is based on the theory that there are two
//		types of splits that are "good": balanced splits that produce
//		fat boxes, and unbalanced splits provided the cell with fewer
//		points is fat.
//
//		This splitting rule operates by first computing the longest
//		side of the current bounding box.  Then it asks which sides
//		could be split (at the midpoint) and still satisfy the aspect
//		ratio bound with respect to this side.	Among these, it selects
//		the side with the largest spread (as fair split would).	 It
//		then considers the most extreme cuts that would be allowed by
//		the aspect ratio bound.	 This is done by dividing the longest
//		side of the box by the aspect ratio bound.	If the median cut
//		lies between these extreme cuts, then we use the median cut.
//		If not, then consider the extreme cut that is closer to the
//		median.	 If all the points lie to one side of this cut, then
//		we slide the cut until it hits the first point.	 This may
//		violate the aspect ratio bound, but will never generate empty
//		cells.	However the sibling of every such skinny cell is fat,
//		and hence packing arguments still apply.
//
//----------------------------------------------------------------------

void sl_fair_split(
	ANNpointArray		pa,				// point array
	ANNidxArray			pidx,			// point indices (permuted on return)
	const ANNorthRect	&bnds,			// bounding rectangle for cell
	int					n,				// number of points
	int					dim,			// dimension of space
	int					&cut_dim,		// cutting dimension (returned)
	ANNcoord			&cut_val,		// cutting value (returned)
	int					&n_lo)			// num of points on low side (returned)
{
	int d;
	ANNcoord min, max;					// min/max coordinates
	int br1, br2;						// split break points

	ANNcoord max_length = bnds.hi[0] - bnds.lo[0];
	cut_dim = 0;
	for (d = 1; d < dim; d++) {			// find length of longest box side
		ANNcoord length = bnds.hi[d] - bnds.lo[d];
		if (length	> max_length) {
			max_length = length;
			cut_dim = d;
		}
	}

	ANNcoord max_spread = 0;			// find legal cut with max spread
	cut_dim = 0;
	for (d = 0; d < dim; d++) {
		ANNcoord length = bnds.hi[d] - bnds.lo[d];
										// is this side midpoint splitable
										// without violating aspect ratio?
		if (((double) max_length)*2.0/((double) length) <= FS_ASPECT_RATIO) {
										// compute spread along this dim
			ANNcoord spr = annSpread(pa, pidx, n, d);
			if (spr > max_spread) {		// best spread so far
				max_spread = spr;
				cut_dim = d;			// this is dimension to cut
			}
		}
	}

	max_length = 0;						// find longest side other than cut_dim
	for (d = 0; d < dim; d++) {
		ANNcoord length = bnds.hi[d] - bnds.lo[d];
		if (d != cut_dim && length > max_length)
			max_length = length;
	}
										// consider most extreme splits
	ANNcoord small_piece = max_length / FS_ASPECT_RATIO;
	ANNcoord lo_cut = bnds.lo[cut_dim] + small_piece;// lowest legal cut
	ANNcoord hi_cut = bnds.hi[cut_dim] - small_piece;// highest legal cut
										// find min and max along cut_dim
	annMinMax(pa, pidx, n, cut_dim, min, max);
										// is median below lo_cut?
	if (annSplitBalance(pa, pidx, n, cut_dim, lo_cut) >= 0) {
		if (max > lo_cut) {				// are any points above lo_cut?
			cut_val = lo_cut;			// cut at lo_cut
			annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
			n_lo = br1;					// balance if there are ties
		}
		else {							// all points below lo_cut
			cut_val = max;				// cut at max value
			annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
			n_lo = n-1;
		}
	}
										// is median above hi_cut?
	else if (annSplitBalance(pa, pidx, n, cut_dim, hi_cut) <= 0) {
		if (min < hi_cut) {				// are any points below hi_cut?
			cut_val = hi_cut;			// cut at hi_cut
			annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
			n_lo = br2;					// balance if there are ties
		}
		else {							// all points above hi_cut
			cut_val = min;				// cut at min value
			annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
			n_lo = 1;
		}
	}
	else {								// median cut is good enough
		n_lo = n/2;						// split about median
		annMedianSplit(pa, pidx, n, cut_dim, cut_val, n_lo);
	}
}