test/rand.cpp
author Dmitriy Morozov <dmitriy@mrzv.org>
Tue, 14 Jul 2009 09:23:09 -0700
changeset 0 e2bb6f169431
permissions -rw-r--r--
Initial commit: ANN 1.1.1

//----------------------------------------------------------------------
//	File:			rand.cpp
//	Programmer:		Sunil Arya and David Mount
//	Description:	Routines for random point generation
//	Last modified:	08/04/06 (Version 1.1.1)
//----------------------------------------------------------------------
// Copyright (c) 1997-2005 University of Maryland and Sunil Arya and
// David Mount.  All Rights Reserved.
// 
// This software and related documentation is part of the Approximate
// Nearest Neighbor Library (ANN).  This software is provided under
// the provisions of the Lesser GNU Public License (LGPL).  See the
// file ../ReadMe.txt for further information.
// 
// The University of Maryland (U.M.) and the authors make no
// representations about the suitability or fitness of this software for
// any purpose.  It is provided "as is" without express or implied
// warranty.
//----------------------------------------------------------------------
// History:
//	Revision 0.1  03/04/98
//		Initial release
//	Revision 0.2  03/26/98
//		Changed random/srandom declarations for SGI's.
//	Revision 1.0  04/01/05
//		annClusGauss centers distributed over [-1,1] rather than [0,1]
//		Added annClusOrthFlats distribution
//		Changed procedure names to avoid namespace conflicts
//		Added annClusFlats distribution
//		Added rand/srand option and fixed annRan0() initialization.
//	Revision 1.1.1  08/04/06
//		Added planted distribution
//----------------------------------------------------------------------

#include "rand.h"						// random generator declarations

using namespace std;					// make std:: accessible

//----------------------------------------------------------------------
//	Globals
//----------------------------------------------------------------------
int		annIdum = 0;					// used for random number generation

//------------------------------------------------------------------------
//	annRan0 - (safer) uniform random number generator
//
//	The code given here is taken from "Numerical Recipes in C" by
//	William Press, Brian Flannery, Saul Teukolsky, and William
//	Vetterling. The task of the code is to do an additional randomizing
//	shuffle on the system-supplied random number generator to make it
//	safer to use. 
//
//	Returns a uniform deviate between 0.0 and 1.0 using the
//	system-supplied routine random() or rand(). Set the global
//	annIdum to any negative value to initialise or reinitialise
//	the sequence.
//------------------------------------------------------------------------

double annRan0()
{
	const int TAB_SIZE = 97;			// table size: any large number
	int j;

	static double y, v[TAB_SIZE];
	static int iff = 0;
	const double RAN_DIVISOR = double(ANN_RAND_MAX + 1UL);
	if (RAN_DIVISOR < 0) {
		cout << "RAN_DIVISOR " << RAN_DIVISOR << endl;
		exit(0);
	}

	//--------------------------------------------------------------------
	// As a precaution against misuse, we will always initialize on the
	// first call, even if "annIdum" is not set negative. Determine
	// "maxran", the next integer after the largest representable value
	// of type int. We assume this is a factor of 2 smaller than the
	// corresponding value of type unsigned int. 
	//--------------------------------------------------------------------

	if (annIdum < 0 || iff == 0) {		// initialize
		iff = 1;
		ANN_SRAND(annIdum);				// (re)seed the generator
		annIdum = 1;

		for (j = 0; j < TAB_SIZE; j++)	// exercise the system routine
			ANN_RAND();					// (values intentionally ignored)

		for (j = 0; j < TAB_SIZE; j++)	// then save TAB_SIZE-1 values
			v[j] = ANN_RAND();
		y = ANN_RAND();					// generate starting value
	 }

	//--------------------------------------------------------------------
	// This is where we start if not initializing. Use the previously
	// saved random number y to get an index j between 1 and TAB_SIZE-1.
	// Then use the corresponding v[j] for both the next j and as the
	// output number.
	//--------------------------------------------------------------------

	j = int(TAB_SIZE * (y / RAN_DIVISOR));
	y = v[j];
	v[j] = ANN_RAND();					// refill the table entry
	return y / RAN_DIVISOR;
}

//------------------------------------------------------------------------
//	annRanInt - generate a random integer from {0,1,...,n-1}
//
//		If n == 0, then -1 is returned.
//------------------------------------------------------------------------

static int annRanInt(
	int					n)
{
	int r = (int) (annRan0()*n);
	if (r == n) r--;					// (in case annRan0() == 1 or n == 0)
	return r;
}

//------------------------------------------------------------------------
//	annRanUnif - generate a random uniform in [lo,hi]
//------------------------------------------------------------------------

static double annRanUnif(
	double				lo,
	double				hi)
{
	return annRan0()*(hi-lo) + lo;
}

//------------------------------------------------------------------------
//	annRanGauss - Gaussian random number generator
//		Returns a normally distributed deviate with zero mean and unit
//		variance, using annRan0() as the source of uniform deviates.
//------------------------------------------------------------------------

static double annRanGauss()
{
	static int iset=0;
	static double gset;

	if (iset == 0) {					// we don't have a deviate handy
		double v1, v2;
		double r = 2.0;
		while (r >= 1.0) {
			//------------------------------------------------------------
			// Pick two uniform numbers in the square extending from -1 to
			// +1 in each direction, see if they are in the circle of radius
			// 1.  If not, try again 
			//------------------------------------------------------------
			v1 = annRanUnif(-1, 1);
			v2 = annRanUnif(-1, 1);
			r = v1 * v1 + v2 * v2;
		}
		double fac = sqrt(-2.0 * log(r) / r);
		//-----------------------------------------------------------------
		// Now make the Box-Muller transformation to get two normal
		// deviates.  Return one and save the other for next time.
		//-----------------------------------------------------------------
		gset = v1 * fac;
		iset = 1;						// set flag
		return v2 * fac;
	}
	else {								// we have an extra deviate handy
		iset = 0;						// so unset the flag
		return gset;					// and return it
	}
}

//------------------------------------------------------------------------
//	annRanLaplace - Laplacian random number generator
//		Returns a Laplacian distributed deviate with zero mean and
//		unit variance, using annRan0() as the source of uniform deviates. 
//
//				prob(x) = b/2 * exp(-b * |x|).
//
//		b is chosen to be sqrt(2.0) so that the variance of the Laplacian
//		distribution [2/(b^2)] becomes 1. 
//------------------------------------------------------------------------

static double annRanLaplace() 
{
	const double b = 1.4142136;

	double laprand = -log(annRan0()) / b;
	double sign = annRan0();
	if (sign < 0.5) laprand = -laprand;
	return(laprand);
}

//----------------------------------------------------------------------
//	annUniformPts - Generate uniformly distributed points
//		A uniform distribution over [-1,1].
//----------------------------------------------------------------------

void annUniformPts(				// uniform distribution
	ANNpointArray		pa,				// point array (modified)
	int					n,				// number of points
	int					dim)			// dimension
{
	for (int i = 0; i < n; i++) {
		for (int d = 0; d < dim; d++) {
			pa[i][d] = (ANNcoord) (annRanUnif(-1,1));
		}
	}
}

//----------------------------------------------------------------------
//	annGaussPts - Generate Gaussian distributed points
//		A Gaussian distribution with zero mean and the given standard
//		deviation.
//----------------------------------------------------------------------

void annGaussPts(						// Gaussian distribution
	ANNpointArray		pa,				// point array (modified)
	int					n,				// number of points
	int					dim,			// dimension
	double				std_dev)		// standard deviation
{
	for (int i = 0; i < n; i++) {
		for (int d = 0; d < dim; d++) {
			pa[i][d] = (ANNcoord) (annRanGauss() * std_dev);
		}
	}
}

//----------------------------------------------------------------------
//	annLaplacePts - Generate Laplacian distributed points
//		Generates a Laplacian distribution (zero mean and unit variance).
//----------------------------------------------------------------------

void annLaplacePts(				// Laplacian distribution
	ANNpointArray		pa,				// point array (modified)
	int					n,				// number of points
	int					dim)			// dimension
{
	for (int i = 0; i < n; i++) {
		for (int d = 0; d < dim; d++) {
			pa[i][d] = (ANNcoord) annRanLaplace();
		}
	}
}

//----------------------------------------------------------------------
//	annCoGaussPts - Generate correlated Gaussian distributed points
//		Generates a Gauss-Markov distribution of zero mean and unit
//		variance.
//----------------------------------------------------------------------

void annCoGaussPts(				// correlated-Gaussian distribution
	ANNpointArray		pa,				// point array (modified)
	int					n,				// number of points
	int					dim,			// dimension
	double				correlation)	// correlation
{
	double std_dev_w = sqrt(1.0 - correlation * correlation);
	for (int i = 0; i < n; i++) {
		double previous = annRanGauss();
		pa[i][0] = (ANNcoord) previous;
		for (int d = 1; d < dim; d++) {
			previous = correlation*previous + std_dev_w*annRanGauss();
			pa[i][d] = (ANNcoord) previous;
		} 
	}
}

//----------------------------------------------------------------------
//	annCoLaplacePts - Generate correlated Laplacian distributed points
//		Generates a Laplacian-Markov distribution of zero mean and unit
//		variance.
//----------------------------------------------------------------------

void annCoLaplacePts(			// correlated-Laplacian distribution
	ANNpointArray		pa,				// point array (modified)
	int					n,				// number of points
	int					dim,			// dimension
	double				correlation)	// correlation
{
	double wn;
	double corr_sq = correlation * correlation;

	for (int i = 0; i < n; i++) {
		double previous = annRanLaplace();
		pa[i][0] = (ANNcoord) previous;
		for (int d = 1; d < dim; d++) {
			double temp = annRan0();
			if (temp < corr_sq)
				wn = 0.0;
			else
				wn = annRanLaplace();
			previous = correlation * previous + wn;
			pa[i][d] = (ANNcoord) previous;
		} 
	}
}

//----------------------------------------------------------------------
//	annClusGaussPts - Generate clusters of Gaussian distributed points
//		Cluster centers are uniformly distributed over [-1,1], and the
//		standard deviation within each cluster is fixed.
//
//		Note: Once cluster centers have been set, they are not changed,
//		unless new_clust = true.  This is so that subsequent calls generate
//		points from the same distribution.  It follows, of course, that any
//		attempt to change the dimension or number of clusters without
//		generating new clusters is asking for trouble.
//
//		Note: Cluster centers are not generated by a call to uniformPts().
//		Although this could be done, it has been omitted for
//		compatibility with annClusGaussPts() in the colored version,
//		rand_c.cc.
//----------------------------------------------------------------------

void annClusGaussPts(			// clustered-Gaussian distribution
	ANNpointArray		pa,				// point array (modified)
	int					n,				// number of points
	int					dim,			// dimension
	int					n_clus,			// number of colors
	ANNbool				new_clust,		// generate new clusters.
	double				std_dev)		// standard deviation within clusters
{
	static ANNpointArray clusters = NULL;// cluster storage

	if (clusters == NULL || new_clust) {// need new cluster centers
		if (clusters != NULL)			// clusters already exist
			annDeallocPts(clusters);	// get rid of them
		clusters = annAllocPts(n_clus, dim);
										// generate cluster center coords
		for (int i = 0; i < n_clus; i++) {
			for (int d = 0; d < dim; d++) {
				clusters[i][d] = (ANNcoord) annRanUnif(-1,1);
			}
		}
	}

	for (int i = 0; i < n; i++) {
		int c = annRanInt(n_clus);				// generate cluster index
		for (int d = 0; d < dim; d++) {
		  pa[i][d] = (ANNcoord) (std_dev*annRanGauss() + clusters[c][d]);
		}
	}
}

//----------------------------------------------------------------------
//	annClusOrthFlats - points clustered along orthogonal flats
//
//		This distribution consists of a collection points clustered
//		among a collection of axis-aligned low dimensional flats in
//		the hypercube [-1,1]^d.  A set of n_clus orthogonal flats are
//		generated, each whose dimension is a random number between 1
//		and max_dim.  The points are evenly distributed among the clusters.
//		For each cluster, we generate points uniformly distributed along
//		the flat within the hypercube.
//
//		This is done as follows.  Each cluster is defined by a d-element
//		control vector whose components are either:
//		
//				CO_FLAG indicating that this component is to be generated
//						uniformly in [-1,1],
//				x		a value other than CO_FLAG in the range [-1,1],
//						which indicates that this coordinate is to be
//						generated as x plus a Gaussian random deviation
//						with the given standard deviation.
//						
//		The number of zero components is the dimension of the flat, which
//		is a random integer in the range from 1 to max_dim.  The points
//		are disributed between clusters in nearly equal sized groups.
//
//		Note: Once cluster centers have been set, they are not changed,
//		unless new_clust = true.  This is so that subsequent calls generate
//		points from the same distribution.  It follows, of course, that any
//		attempt to change the dimension or number of clusters without
//		generating new clusters is asking for trouble.
//
//		To make this a bad scenario at query time, query points should be
//		selected from a different distribution, e.g. uniform or Gaussian.
//
//		We use a little programming trick to generate groups of roughly
//		equal size.  If n is the total number of points, and n_clus is
//		the number of clusters, then the c-th cluster (0 <= c < n_clus)
//		is given floor((n+c)/n_clus) points.  It can be shown that this
//		will exactly consume all n points.
//
//		This procedure makes use of the utility procedure, genOrthFlat
//		which generates points in one orthogonal flat, according to
//		the given control vector.
//
//----------------------------------------------------------------------
const double CO_FLAG = 999;						// special flag value

static void genOrthFlat(		// generate points on an orthog flat
	ANNpointArray		pa,				// point array
	int					n,				// number of points
	int					dim,			// dimension
	double				*control,		// control vector
	double				std_dev)		// standard deviation
{
	for (int i = 0; i < n; i++) {				// generate each point
		for (int d = 0; d < dim; d++) {			// generate each coord
			if (control[d] == CO_FLAG)			// dimension on flat
				pa[i][d] = (ANNcoord) annRanUnif(-1,1);
			else								// dimension off flat
				pa[i][d] = (ANNcoord) (std_dev*annRanGauss() + control[d]);
		}
	}
}

void annClusOrthFlats(			// clustered along orthogonal flats
	ANNpointArray		pa,				// point array (modified)
	int					n,				// number of points
	int					dim,			// dimension
	int					n_clus,			// number of colors
	ANNbool				new_clust,		// generate new clusters.
	double				std_dev,		// standard deviation within clusters
	int					max_dim)		// maximum dimension of the flats
{
	static ANNpointArray control = NULL;		// control vectors

	if (control == NULL || new_clust) {			// need new cluster centers
		if (control != NULL) {					// clusters already exist
			annDeallocPts(control);				// get rid of them
		}
		control = annAllocPts(n_clus, dim);

		for (int c = 0; c < n_clus; c++) {		// generate clusters
			int n_dim = 1 + annRanInt(max_dim); // number of dimensions in flat
			for (int d = 0; d < dim; d++) {		// generate side locations
												// prob. of picking next dim
				double Prob = ((double) n_dim)/((double) (dim-d));
				if (annRan0() < Prob) {			// add this one to flat
					control[c][d] = CO_FLAG;	// flag this entry
					n_dim--;					// one fewer dim to fill
				}
				else {							// don't take this one
					control[c][d] = annRanUnif(-1,1);// random value in [-1,1]
				}
			}
		}
	}
	int offset = 0;								// offset in pa array
	for (int c = 0; c < n_clus; c++) {			// generate clusters
		int pick = (n+c)/n_clus;				// number of points to pick
												// generate the points
		genOrthFlat(pa+offset, pick, dim, control[c], std_dev);
		offset += pick;							// increment offset
	}
}

//----------------------------------------------------------------------
//	annClusEllipsoids - points clustered around axis-aligned ellipsoids
//
//		This distribution consists of a collection points clustered
//		among a collection of low dimensional ellipsoids whose axes
//		are alligned with the coordinate axes in the hypercube [-1,1]^d.
//		The objective is to model distributions in which the points are
//		distributed in lower dimensional subspaces, and within this
//		lower dimensional space the points are distributed with a
//		Gaussian distribution (with no correlation between the
//		dimensions).
//
//		The distribution is given the number of clusters or "colors"
//		(n_clus), maximum number of dimensions (max_dim) of the lower
//		dimensional subspace, a "small" standard deviation
//		(std_dev_small), and a "large" standard deviation range
//		(std_dev_lo, std_dev_hi).
//
//		The algorithm generates n_clus cluster centers uniformly from
//		the hypercube [-1,1]^d.  For each cluster, it selects the
//		dimension of the subspace as a random number r between 1 and
//		max_dim.  These are the dimensions of the ellipsoid.  Then it
//		generates a d-element std dev vector whose entries are the
//		standard deviation for the coordinates of each cluster in the
//		distribution.  Among the d-element control vector, r randomly
//		chosen values are chosen uniformly from the range [std_dev_lo,
//		std_dev_hi].  The remaining values are set to std_dev_small.
//
//		Note that annClusGaussPts is a special case of this in which
//		max_dim = 0, and std_dev = std_dev_small.
//
//		If the flag new_clust is set, then new cluster centers are
//		generated.
//
//		This procedure makes use of the utility procedure genGauss
//		which generates points distributed according to a Gaussian
//		distribution.
//
//----------------------------------------------------------------------

static void genGauss(			// generate points on a general Gaussian
	ANNpointArray		pa,				// point array
	int					n,				// number of points
	int					dim,			// dimension
	double				*center,		// center vector
	double				*std_dev)		// standard deviation vector
{
	for (int i = 0; i < n; i++) {
		for (int d = 0; d < dim; d++) {
			pa[i][d] = (ANNcoord) (std_dev[d]*annRanGauss() + center[d]);
		}
	}
}

void annClusEllipsoids(			// clustered around ellipsoids
	ANNpointArray		pa,				// point array (modified)
	int					n,				// number of points
	int					dim,			// dimension
	int					n_clus,			// number of colors
	ANNbool				new_clust,		// generate new clusters.
	double				std_dev_small,	// small standard deviation
	double				std_dev_lo,		// low standard deviation for ellipses
	double				std_dev_hi,		// high standard deviation for ellipses
	int					max_dim)		// maximum dimension of the flats
{
	static ANNpointArray centers = NULL;		// cluster centers
	static ANNpointArray std_dev = NULL;		// standard deviations

	if (centers == NULL || new_clust) {			// need new cluster centers
		if (centers != NULL)					// clusters already exist
			annDeallocPts(centers);				// get rid of them
		if (std_dev != NULL)					// std deviations already exist
			annDeallocPts(std_dev);				// get rid of them

		centers = annAllocPts(n_clus, dim);		// alloc new clusters and devs
		std_dev	 = annAllocPts(n_clus, dim);

		for (int i = 0; i < n_clus; i++) {		// gen cluster center coords
			for (int d = 0; d < dim; d++) {
				centers[i][d] = (ANNcoord) annRanUnif(-1,1);
			}
		}
		for (int c = 0; c < n_clus; c++) {		// generate cluster std dev
			int n_dim = 1 + annRanInt(max_dim); // number of dimensions in flat
			for (int d = 0; d < dim; d++) {		// generate std dev's
												// prob. of picking next dim
				double Prob = ((double) n_dim)/((double) (dim-d));
				if (annRan0() < Prob) {			// add this one to ellipse
												// generate random std dev
					std_dev[c][d] = annRanUnif(std_dev_lo, std_dev_hi);
					n_dim--;					// one fewer dim to fill
				}
				else {							// don't take this one
					std_dev[c][d] = std_dev_small;// use small std dev
				}
			}
		}
	}

	int offset = 0;								// next slot to fill
	for (int c = 0; c < n_clus; c++) {			// generate clusters
		int pick = (n+c)/n_clus;				// number of points to pick
												// generate the points
		genGauss(pa+offset, pick, dim, centers[c], std_dev[c]);
		offset += pick;							// increment offset in array
	}
}

//----------------------------------------------------------------------
//	annPlanted - Generates points from a "planted" distribution
//		In high dimensional spaces, interpoint distances tend to be
//		highly clustered around the mean value.  Approximate nearest
//		neighbor searching makes little sense in this context, unless it
//		is the case that each query point is significantly closer to its
//		nearest neighbor than to other points.  Thus, the query points
//		should be planted close to the data points. Given a source data
//		set, this procedure generates a set of query points having this
//		property.
//
//		We are given a source data array and a standard deviation.  We
//		generate points as follows.  We select a random point from the
//		source data set, and we generate a Gaussian point centered about
//		this random point and perturbed by a normal distributed random
//		variable with mean zero and the given standard deviation along
//		each coordinate.
//
//		Note that this essentially the same a clustered Gaussian
//		distribution, but where the cluster centers are given by the
//		source data set.
//----------------------------------------------------------------------

void annPlanted(				// planted nearest neighbors
	ANNpointArray	pa,			// point array (modified)
	int				n,			// number of points
	int				dim,		// dimension
	ANNpointArray	src,		// source point array
	int				n_src,		// source size
	double			std_dev)	// standard deviation about source
{
	for (int i = 0; i < n; i++) {
		int c = annRanInt(n_src);			// generate source index
		for (int d = 0; d < dim; d++) {
		  pa[i][d] = (ANNcoord) (std_dev*annRanGauss() + src[c][d]);
		}
	}
}