1 """
2 Distribution class
3
4 $Id: distribution.py 9609 2008-11-20 17:38:04Z jbednar $
5 """
6 __version__='$Revision: 9609 $'
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 from math import pi
27
28 from numpy.oldnumeric import innerproduct, array, exp, argmax
29
30 from topo.base.arrayutil import arg, wrap
31
33 """
34 Holds a distribution of the values f(x) associated with a variable x.
35
36 A Distribution is a histogram-like object that is a dictionary of
37 samples. Each sample is an x:f(x) pair, where x is called the bin
38 and f(x) is called the value(). Each bin's value is typically
39 maintained as the sum of all the values that have been placed into
40 it.
41
42 The bin axis is continuous, and can represent a continuous
43 quantity without discretization. Alternatively, this class can be
44 used as a traditional histogram by either discretizing the bin
45 number before adding each sample, or by binning the values in the
46 final Distribution.
47
48 Distributions are bounded by the specified axis_bounds, and can
49 either be cyclic (like directions or hues) or non-cyclic. For
50 cyclic distributions, samples provided outside the axis_bounds
51 will be wrapped back into the bound range, as is appropriate for
52 quantities like directions. For non-cyclic distributions,
53 providing samples outside the axis_bounds will result in a
54 ValueError.
55
56 In addition to the values, can also return the counts, i.e., the
57 number of times that a sample has been added with the given bin.
58
59 Not all instances of this class will be a true distribution in the
60 mathematical sense; e.g. the values will have to be normalized
61 before they can be considered a probability distribution.
62
63 If keep_peak=True, the value stored in each bin will be the
64 maximum of all values ever added, instead of the sum. The
65 distribution will thus be a record of the maximum value
66 seen at each bin, also known as an envelope.
67 """
68
69
70
71
72
73
74 undefined_vals = 0
75
76 - def __init__(self, axis_bounds=(0.0, 2*pi), cyclic=False, keep_peak=False):
77 self._data = {}
78 self._counts = {}
79
80
81
82
83
84 self.total_count = 0
85 self.total_value = 0.0
86
87 self.axis_bounds = axis_bounds
88 self.axis_range = axis_bounds[1] - axis_bounds[0]
89 self.cyclic = cyclic
90 self.keep_peak = keep_peak
91
92
93
94
95
96
97
99 """
100 Allows add() method to be used via the '+' operator; i.e.,
101 Distribution + dictionary does Distribution.add(dictionary).
102 """
103 self.add(a)
104 return None
105
106
108 """
109 Return the value of the specified bin.
110
111 (Return None if there is no such bin.)
112 """
113 return self._data.get(bin)
114
115
117 """
118 Return the count from the specified bin.
119
120 (Return None if there is no such bin.)
121 """
122 return self._counts.get(bin)
123
124
126 """
127 Return a list of values.
128
129 Various statistics can then be calculated if desired:
130
131 sum(vals) (total of all values)
132 max(vals) (highest value in any bin)
133
134 Note that the bin-order of values returned does not necessarily
135 match that returned by counts().
136 """
137 return self._data.values()
138
139
141 """
142 Return a list of values.
143
144 Various statistics can then be calculated if desired:
145
146 sum(counts) (total of all counts)
147 max(counts) (highest count in any bin)
148
149 Note that the bin-order of values returned does not necessarily
150 match that returned by values().
151 """
152 return self._counts.values()
153
154
156 """
157 Return a list of bins that have been populated.
158 """
159 return self._data.keys()
160
161
162 - def add(self, new_data):
163 """
164 Add a set of new data in the form of a dictionary of (bin,
165 value) pairs. If the bin already exists, the value is added
166 to the current value. If the bin doesn't exist, one is created
167 with that value.
168
169 Bin numbers outside axis_bounds are allowed for cyclic=True,
170 but otherwise a ValueError is raised.
171
172 If keep_peak=True, the value of the bin is the maximum of the
173 current value and the supplied value. That is, the bin stores
174 the peak value seen so far. Note that each call will increase
175 the total_value and total_count (and thus decrease the
176 value_mag() and count_mag()) even if the value doesn't happen
177 to be the maximum seen so far, since each data point still
178 helps improve the sampling and thus the confidence.
179 """
180 for bin in new_data.keys():
181
182 if self.cyclic==False:
183 if not (self.axis_bounds[0] <= bin <= self.axis_bounds[1]):
184 raise ValueError("Bin outside bounds.")
185
186
187 new_bin = bin
188
189 if new_bin not in self._data:
190 self._data[new_bin] = 0.0
191 self._counts[new_bin] = 0
192
193 new_value = new_data[bin]
194 self.total_value += new_value
195 self._counts[new_bin] += 1
196 self.total_count += 1
197
198 if self.keep_peak == True:
199 if new_value > self._data[new_bin]: self._data[new_bin] = new_value
200 else:
201 self._data[new_bin] += new_value
202
203
205 """Return the bin with the largest value."""
206 return self._data.keys()[argmax(self._data.values())]
207
208
210 """
211 Return a continuous, interpolated equivalent of the max_value_bin().
212
213 For a cyclic distribution, this is the direction of the vector
214 sum (see vector_sum()).
215
216 For a non-cyclic distribution, this is the arithmetic average
217 of the data on the bin_axis, where each bin is weighted by its
218 value.
219 """
220 if self.cyclic == True:
221 return self.vector_sum()[1]
222 else:
223 return self._weighted_average()
224
225
227 """
228 Return the vector sum of the data as a tuple (magnitude, avgbinnum).
229
230 Each bin contributes a vector of length equal to its value, at
231 a direction corresponding to the bin number. Specifically,
232 the total bin number range is mapped into a direction range
233 [0,2pi].
234
235 For a cyclic distribution, the avgbinnum will be a continuous
236 measure analogous to the max_value_bin() of the distribution.
237 But this quantity has more precision than max_value_bin()
238 because it is computed from the entire distribution instead of
239 just the peak bin. However, it is likely to be useful only
240 for uniform or very dense sampling; with sparse, non-uniform
241 sampling the estimates will be biased significantly by the
242 particular samples chosen.
243
244 The avgbinnum is not meaningful when the magnitude is 0,
245 because a zero-length vector has no direction. To find out
246 whether such cases occurred, you can compare the value of
247 undefined_vals before and after a series of calls to this
248 function.
249
250 """
251
252 r = self._data.values()
253 theta = self._bins_to_radians(array(self._data.keys()))
254 v_sum = innerproduct(r, exp(theta*1j))
255
256 magnitude = abs(v_sum)
257 direction = arg(v_sum)
258
259 if v_sum == 0:
260 self.undefined_vals += 1
261
262 direction_radians = self._radians_to_bins(direction)
263
264
265 wrapped_direction = wrap(self.axis_bounds[0], self.axis_bounds[1], direction_radians)
266
267 return (magnitude, wrapped_direction)
268
269
271 """Return the sum of each value times its bin."""
272 return innerproduct(self._data.keys(), self._data.values())
273
274
276 """
277 Return the weighted_sum divided by the sum of the values
278 """
279 return self._safe_divide(self.weighted_sum(),sum(self._data.values()))
280
281
283 """
284 Return a measure of the peakedness of the distribution. The
285 calculation differs depending on whether this is a cyclic
286 variable. For a cyclic variable, returns the magnitude of the
287 vector_sum() divided by the sum_value() (see
288 _vector_selectivity for more details). For a non-cyclic
289 variable, returns the max_value_bin()) as a proportion of the
290 sum_value() (see _relative_selectivity for more details).
291 """
292 if self.cyclic == True:
293 return self._vector_selectivity()
294 else:
295 return self._relative_selectivity()
296
297
298
299
300
301
303 """
304 Return max_value_bin()) as a proportion of the sum_value().
305
306 This quantity is a measure of how strongly the distribution is
307 biased towards the max_value_bin(). For a smooth,
308 single-lobed distribution with an inclusive, non-cyclic range,
309 this quantity is an analog to vector_selectivity. To be a
310 precise analog for arbitrary distributions, it would need to
311 compute some measure of the selectivity that works like the
312 weighted_average() instead of the max_value_bin(). The result
313 is scaled such that if all bins are identical, the selectivity
314 is 0.0, and if all bins but one are zero, the selectivity is
315 1.0.
316 """
317
318
319 if len(self._data) <= 1:
320 return 1.0
321
322 proportion = self._safe_divide( max(self._data.values()),
323 sum(self._data.values()) )
324 offset = 1.0/len(self._data)
325 scaled = (proportion-offset)/(1.0-offset)
326
327
328
329
330 if scaled >= 0.0:
331 return scaled
332 else:
333 return 0.0
334
335
337 """
338 Return the magnitude of the vector_sum() divided by the sum_value().
339
340 This quantity is a vector-based measure of the peakedness of
341 the distribution. If only a single bin has a non-zero value(),
342 the selectivity will be 1.0, and if all bins have the same
343 value() then the selectivity will be 0.0. Other distributions
344 will result in intermediate values.
345
346 For a distribution with a sum_value() of zero (i.e. all bins
347 empty), the selectivity is undefined. Assuming that one will
348 usually be looking for high selectivity, we return zero in such
349 a case so that high selectivity will not mistakenly be claimed.
350 To find out whether such cases occurred, you can compare the
351 value of undefined_values() before and after a series of
352 calls to this function.
353 """
354 return self._safe_divide(self.vector_sum()[0], sum(self._data.values()))
355
356
358 """Return the value of a single bin as a proportion of total_value."""
359 return self._safe_divide(self._data.get(bin), self.total_value)
360
361
363 """Return the count of a single bin as a proportion of total_count."""
364 return self._safe_divide(float(self._counts.get(bin)), float(self.total_count))
365
366
367
369 """
370 Convert a bin number to a direction in radians.
371
372 Works for NumPy arrays of bin numbers, returning
373 an array of directions.
374 """
375 return (2*pi)*bin/self.axis_range
376
377
379 """
380 Convert a direction in radians into a bin number.
381
382 Works for NumPy arrays of direction, returning
383 an array of bin numbers.
384 """
385 return direction*self.axis_range/(2*pi)
386
387
389 """
390 Division routine that avoids division-by-zero errors
391 (returning zero in such cases) but keeps track of them
392 for undefined_values().
393 """
394 if denominator==0:
395 self.undefined_vals += 1
396 return 0
397 else:
398 return numerator/denominator
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414