Package topo :: Package misc :: Module distribution
[hide private]
[frames] | no frames]

Source Code for Module topo.misc.distribution

  1  """ 
  2  Distribution class 
  3   
  4  $Id: distribution.py 9609 2008-11-20 17:38:04Z jbednar $ 
  5  """ 
  6  __version__='$Revision: 9609 $' 
  7  # To do: 
  8  # 
  9  # - wrap bins for cyclic histograms 
 10  # - check use of float() in count_mag() etc 
 11  # - clarify comment about negative selectivity 
 12  # 
 13  # - function to return value in a range (like a real histogram) 
 14  # - cache values 
 15  # - assumes cyclic axes start at 0: include a shift based on range 
 16  # 
 17  # - is there a way to make this work for arrays without mentioning 
 18  #   "array" anywhere in here? 
 19  # - should this be two classes: one for the core (which would be 
 20  #   small though) and another for statistics? 
 21   
 22   
 23   
 24  # The basic functions do not have any dependencies, but these imports 
 25  # are needed for some of the statistical functions (e.g. vector sum). 
 26  from math import pi 
 27   
 28  from numpy.oldnumeric import innerproduct, array, exp, argmax 
 29   
 30  from topo.base.arrayutil import arg, wrap 
 31   
32 -class Distribution(object):
33 """ 34 Holds a distribution of the values f(x) associated with a variable x. 35 36 A Distribution is a histogram-like object that is a dictionary of 37 samples. Each sample is an x:f(x) pair, where x is called the bin 38 and f(x) is called the value(). Each bin's value is typically 39 maintained as the sum of all the values that have been placed into 40 it. 41 42 The bin axis is continuous, and can represent a continuous 43 quantity without discretization. Alternatively, this class can be 44 used as a traditional histogram by either discretizing the bin 45 number before adding each sample, or by binning the values in the 46 final Distribution. 47 48 Distributions are bounded by the specified axis_bounds, and can 49 either be cyclic (like directions or hues) or non-cyclic. For 50 cyclic distributions, samples provided outside the axis_bounds 51 will be wrapped back into the bound range, as is appropriate for 52 quantities like directions. For non-cyclic distributions, 53 providing samples outside the axis_bounds will result in a 54 ValueError. 55 56 In addition to the values, can also return the counts, i.e., the 57 number of times that a sample has been added with the given bin. 58 59 Not all instances of this class will be a true distribution in the 60 mathematical sense; e.g. the values will have to be normalized 61 before they can be considered a probability distribution. 62 63 If keep_peak=True, the value stored in each bin will be the 64 maximum of all values ever added, instead of the sum. The 65 distribution will thus be a record of the maximum value 66 seen at each bin, also known as an envelope. 67 """ 68 69 # Holds the number of times that undefined values have been 70 # returned from calculations for any instance of this class, 71 # e.g. calls to vector_direction() or vector_selectivity() when no 72 # value is non-zero. Useful for warning users when the values are 73 # not meaningful. 74 undefined_vals = 0 75
76 - def __init__(self, axis_bounds=(0.0, 2*pi), cyclic=False, keep_peak=False):
77 self._data = {} 78 self._counts = {} 79 80 # total_count and total_value hold the total number and sum 81 # (respectively) of values that have ever been provided for 82 # each bin. For a simple distribution these will be the same as 83 # sum_counts() and sum_values(). 84 self.total_count = 0 85 self.total_value = 0.0 86 87 self.axis_bounds = axis_bounds 88 self.axis_range = axis_bounds[1] - axis_bounds[0] 89 self.cyclic = cyclic 90 self.keep_peak = keep_peak
91 92 93 94 ### JABHACKALERT! The semantics of this operation are incorrect, because 95 ### an expression like x+y should not modify x, while this does. It could 96 ### be renamed __iadd__, to implement += (which has the correct semantics), 97 ### but it's not yet clear how to do that.
98 - def __add__(self,a):
99 """ 100 Allows add() method to be used via the '+' operator; i.e., 101 Distribution + dictionary does Distribution.add(dictionary). 102 """ 103 self.add(a) 104 return None
105 106
107 - def get_value(self, bin):
108 """ 109 Return the value of the specified bin. 110 111 (Return None if there is no such bin.) 112 """ 113 return self._data.get(bin)
114 115
116 - def get_count(self, bin):
117 """ 118 Return the count from the specified bin. 119 120 (Return None if there is no such bin.) 121 """ 122 return self._counts.get(bin)
123 124
125 - def values(self):
126 """ 127 Return a list of values. 128 129 Various statistics can then be calculated if desired: 130 131 sum(vals) (total of all values) 132 max(vals) (highest value in any bin) 133 134 Note that the bin-order of values returned does not necessarily 135 match that returned by counts(). 136 """ 137 return self._data.values()
138 139
140 - def counts(self):
141 """ 142 Return a list of values. 143 144 Various statistics can then be calculated if desired: 145 146 sum(counts) (total of all counts) 147 max(counts) (highest count in any bin) 148 149 Note that the bin-order of values returned does not necessarily 150 match that returned by values(). 151 """ 152 return self._counts.values()
153 154
155 - def bins(self):
156 """ 157 Return a list of bins that have been populated. 158 """ 159 return self._data.keys()
160 161
162 - def add(self, new_data):
163 """ 164 Add a set of new data in the form of a dictionary of (bin, 165 value) pairs. If the bin already exists, the value is added 166 to the current value. If the bin doesn't exist, one is created 167 with that value. 168 169 Bin numbers outside axis_bounds are allowed for cyclic=True, 170 but otherwise a ValueError is raised. 171 172 If keep_peak=True, the value of the bin is the maximum of the 173 current value and the supplied value. That is, the bin stores 174 the peak value seen so far. Note that each call will increase 175 the total_value and total_count (and thus decrease the 176 value_mag() and count_mag()) even if the value doesn't happen 177 to be the maximum seen so far, since each data point still 178 helps improve the sampling and thus the confidence. 179 """ 180 for bin in new_data.keys(): 181 182 if self.cyclic==False: 183 if not (self.axis_bounds[0] <= bin <= self.axis_bounds[1]): 184 raise ValueError("Bin outside bounds.") 185 # CEBALERT: Neet to support wrapping of bin values 186 # else: new_bin = wrap(self.axis_bounds[0], self.axis_bounds[1], bin) 187 new_bin = bin 188 189 if new_bin not in self._data: 190 self._data[new_bin] = 0.0 191 self._counts[new_bin] = 0 192 193 new_value = new_data[bin] 194 self.total_value += new_value 195 self._counts[new_bin] += 1 196 self.total_count += 1 197 198 if self.keep_peak == True: 199 if new_value > self._data[new_bin]: self._data[new_bin] = new_value 200 else: 201 self._data[new_bin] += new_value
202 203
204 - def max_value_bin(self):
205 """Return the bin with the largest value.""" 206 return self._data.keys()[argmax(self._data.values())]
207 208
209 - def weighted_average(self):
210 """ 211 Return a continuous, interpolated equivalent of the max_value_bin(). 212 213 For a cyclic distribution, this is the direction of the vector 214 sum (see vector_sum()). 215 216 For a non-cyclic distribution, this is the arithmetic average 217 of the data on the bin_axis, where each bin is weighted by its 218 value. 219 """ 220 if self.cyclic == True: 221 return self.vector_sum()[1] 222 else: 223 return self._weighted_average()
224 225
226 - def vector_sum(self):
227 """ 228 Return the vector sum of the data as a tuple (magnitude, avgbinnum). 229 230 Each bin contributes a vector of length equal to its value, at 231 a direction corresponding to the bin number. Specifically, 232 the total bin number range is mapped into a direction range 233 [0,2pi]. 234 235 For a cyclic distribution, the avgbinnum will be a continuous 236 measure analogous to the max_value_bin() of the distribution. 237 But this quantity has more precision than max_value_bin() 238 because it is computed from the entire distribution instead of 239 just the peak bin. However, it is likely to be useful only 240 for uniform or very dense sampling; with sparse, non-uniform 241 sampling the estimates will be biased significantly by the 242 particular samples chosen. 243 244 The avgbinnum is not meaningful when the magnitude is 0, 245 because a zero-length vector has no direction. To find out 246 whether such cases occurred, you can compare the value of 247 undefined_vals before and after a series of calls to this 248 function. 249 250 """ 251 # vectors are represented in polar form as complex numbers 252 r = self._data.values() 253 theta = self._bins_to_radians(array(self._data.keys())) 254 v_sum = innerproduct(r, exp(theta*1j)) 255 256 magnitude = abs(v_sum) 257 direction = arg(v_sum) 258 259 if v_sum == 0: 260 self.undefined_vals += 1 261 262 direction_radians = self._radians_to_bins(direction) 263 264 # wrap the direction because arctan2 returns principal values 265 wrapped_direction = wrap(self.axis_bounds[0], self.axis_bounds[1], direction_radians) 266 267 return (magnitude, wrapped_direction)
268 269
270 - def weighted_sum(self):
271 """Return the sum of each value times its bin.""" 272 return innerproduct(self._data.keys(), self._data.values())
273 274
275 - def _weighted_average(self):
276 """ 277 Return the weighted_sum divided by the sum of the values 278 """ 279 return self._safe_divide(self.weighted_sum(),sum(self._data.values()))
280 281
282 - def selectivity(self):
283 """ 284 Return a measure of the peakedness of the distribution. The 285 calculation differs depending on whether this is a cyclic 286 variable. For a cyclic variable, returns the magnitude of the 287 vector_sum() divided by the sum_value() (see 288 _vector_selectivity for more details). For a non-cyclic 289 variable, returns the max_value_bin()) as a proportion of the 290 sum_value() (see _relative_selectivity for more details). 291 """ 292 if self.cyclic == True: 293 return self._vector_selectivity() 294 else: 295 return self._relative_selectivity()
296 297 298 # CEBHACKALERT: the definition of selectivity for non-cyclic 299 # quantities probably needs some more thought. 300 # Additionally, this fails the test in testfeaturemap 301 # (see the comment there).
302 - def _relative_selectivity(self):
303 """ 304 Return max_value_bin()) as a proportion of the sum_value(). 305 306 This quantity is a measure of how strongly the distribution is 307 biased towards the max_value_bin(). For a smooth, 308 single-lobed distribution with an inclusive, non-cyclic range, 309 this quantity is an analog to vector_selectivity. To be a 310 precise analog for arbitrary distributions, it would need to 311 compute some measure of the selectivity that works like the 312 weighted_average() instead of the max_value_bin(). The result 313 is scaled such that if all bins are identical, the selectivity 314 is 0.0, and if all bins but one are zero, the selectivity is 315 1.0. 316 """ 317 # A single bin is considered fully selective (but could also 318 # arguably be considered fully unselective) 319 if len(self._data) <= 1: 320 return 1.0 321 322 proportion = self._safe_divide( max(self._data.values()), 323 sum(self._data.values()) ) 324 offset = 1.0/len(self._data) 325 scaled = (proportion-offset)/(1.0-offset) 326 327 # negative scaled is possible 328 # e.g. 2 bins, with values that sum to less than 0.5 329 # this probably isn't what should be done in those cases 330 if scaled >= 0.0: 331 return scaled 332 else: 333 return 0.0
334 335
336 - def _vector_selectivity(self):
337 """ 338 Return the magnitude of the vector_sum() divided by the sum_value(). 339 340 This quantity is a vector-based measure of the peakedness of 341 the distribution. If only a single bin has a non-zero value(), 342 the selectivity will be 1.0, and if all bins have the same 343 value() then the selectivity will be 0.0. Other distributions 344 will result in intermediate values. 345 346 For a distribution with a sum_value() of zero (i.e. all bins 347 empty), the selectivity is undefined. Assuming that one will 348 usually be looking for high selectivity, we return zero in such 349 a case so that high selectivity will not mistakenly be claimed. 350 To find out whether such cases occurred, you can compare the 351 value of undefined_values() before and after a series of 352 calls to this function. 353 """ 354 return self._safe_divide(self.vector_sum()[0], sum(self._data.values()))
355 356
357 - def value_mag(self, bin):
358 """Return the value of a single bin as a proportion of total_value.""" 359 return self._safe_divide(self._data.get(bin), self.total_value)
360 361
362 - def count_mag(self, bin):
363 """Return the count of a single bin as a proportion of total_count.""" 364 return self._safe_divide(float(self._counts.get(bin)), float(self.total_count))
365 # use of float() 366 367
368 - def _bins_to_radians(self, bin):
369 """ 370 Convert a bin number to a direction in radians. 371 372 Works for NumPy arrays of bin numbers, returning 373 an array of directions. 374 """ 375 return (2*pi)*bin/self.axis_range
376 377
378 - def _radians_to_bins(self, direction):
379 """ 380 Convert a direction in radians into a bin number. 381 382 Works for NumPy arrays of direction, returning 383 an array of bin numbers. 384 """ 385 return direction*self.axis_range/(2*pi)
386 387
388 - def _safe_divide(self,numerator,denominator):
389 """ 390 Division routine that avoids division-by-zero errors 391 (returning zero in such cases) but keeps track of them 392 for undefined_values(). 393 """ 394 if denominator==0: 395 self.undefined_vals += 1 396 return 0 397 else: 398 return numerator/denominator
399 400 401 ## # not tested - will it be needed? 402 ## def relative_value(self, bin): 403 ## """ 404 ## Return the value of the given bin as a proportion of the 405 ## sum_value(). 406 ## 407 ## This quantity is on a scale from 0.0 (the 408 ## specified bin is zero and others are nonzero) to 1.0 (the 409 ## specified bin is the only nonzero bin). If the distribution is 410 ## empty the result is undefined; in such a case zero is returned 411 ## and the value returned by undefined_values() is incremented. 412 ## """ 413 ## return self._safe_divide(self._data.get(bin), sum(self.values())) 414