1
"""module for performing statistical calculations.

(c) 2007-2009 Matt Hilton

U{http://astlib.sourceforge.net}

This module (as you may notice) provides very few statistical routines. It does, however, provide
biweight (robust) estimators of location and scale, as described in Beers et al. 1990 (AJ, 100,
32), in addition to a robust least squares fitting routine that uses the biweight transform.

Some routines may fail if they are passed lists with few items and encounter a "divide by zero"
error. Where this occurs, the function will return None. An error message will be printed to the
console when this happens if astStats.REPORT_ERRORS=True (the default). Testing if an
astStats function returns None can be used to handle errors in scripts.

For extensive statistics modules, the Python bindings for GNU R (U{http://rpy.sourceforge.net}), or
SciPy (U{http://www.scipy.org}) are suggested.

"""
21
22 import math
23 import numpy
24 import sys
25
26 REPORT_ERRORS=True
27
28
def mean(dataList):
    """Calculates the mean average of a list of numbers.

    Every item is passed through float() before being added, so the result is always a float.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: mean average, or 0.0 if dataList is empty

    """
    count = len(dataList)
    if count == 0:
        # An empty list averages to zero rather than raising.
        return 0.0

    total = 0.0
    for item in dataList:
        total += float(item)

    return total / count
46
47
def weightedMean(dataList):
    """Calculates the weighted mean average of a two dimensional list (value, weight) of
    numbers.

    The result is sum(value*weight) divided by sum(weight).

    @type dataList: list
    @param dataList: input data, must be a two dimensional list in format [value, weight]
    @rtype: float
    @return: weighted mean average, or 0.0 if dataList is empty

    @note: Raises ZeroDivisionError if dataList is not empty but the weights sum to zero.

    """
    if len(dataList) == 0:
        return 0.0

    weightedTotal = 0.0
    weightTotal = 0
    for item in dataList:
        weightedTotal += float(item[0] * item[1])
        weightTotal += item[1]

    # weightedTotal is a float, so this is true division on Python 2 as well.
    return weightedTotal / weightTotal
68
69
def stdev(dataList):
    """Calculates the (sample) standard deviation of a list of numbers.

    The sum of squared deviations from the mean is divided by (n - 1).

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: standard deviation, or 0.0 if dataList holds fewer than two items

    """
    count = len(dataList)
    if count < 2:
        # With a single item the (n - 1) divisor is zero; treat it like the empty list.
        return 0.0

    listMean = mean(dataList)
    sumSquares = 0.0
    for item in dataList:
        deviation = float(item - listMean)
        sumSquares += deviation * deviation

    return math.sqrt(sumSquares / (count - 1))
88
89
def rms(dataList):
    """Calculates the root mean square of a list of numbers.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: root mean square

    """
    squares = [item * item for item in dataList]

    # mean() does the averaging, so an empty list follows mean()'s empty-list behaviour.
    return math.sqrt(mean(squares))
106
107
def weightedStdev(dataList):
    """Calculates the weighted (sample) standard deviation of a list of numbers.

    Only items with a weight greater than zero contribute. For those items the routine sums
    ((value - weighted mean)/weight)**2 and (1/weight)**2, and returns
    sqrt((n/(n - 1))*(first sum/second sum)), where n is the length of dataList.

    @type dataList: list
    @param dataList: input data, must be a two dimensional list in format [value, weight]
    @rtype: float
    @return: weighted standard deviation

    @note: Returns None if an error occurs (fewer than two items, or no item has a weight
    greater than zero).

    """
    count = len(dataList)
    if count < 2:
        if REPORT_ERRORS:
            print("""ERROR: astStats.weightedStdev() : dataList contains < 2 items.""")
        return None

    listMean = weightedMean(dataList)
    sumSquares = 0.0
    wSum = 0.0
    for item in dataList:
        if item[1] > 0.0:
            scaled = float((item[0] - listMean) / item[1])
            inverseWeight = float(1.0 / item[1])
            sumSquares += scaled * scaled
            wSum += inverseWeight * inverseWeight

    if wSum == 0:
        # No item had a positive weight, so the ratio below would divide by zero.
        if REPORT_ERRORS:
            print("""ERROR: astStats.weightedStdev() : divide by zero error.""")
        return None

    nFactor = float(count) / float(count - 1)
    return math.sqrt(nFactor * (sumSquares / wSum))
136
137
166
167
def modeEstimate(dataList):
    """Returns an estimate of the mode of a set of values by mode=(3*median)-(2*mean).

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: estimate of mode average

    """
    # median() is evaluated before mean(), as in the original expression.
    listMedian = median(dataList)
    listMean = mean(dataList)

    return (3 * listMedian) - (2 * listMean)
180
181
def MAD(dataList):
    """Calculates the Median Absolute Deviation of a list of numbers.

    The absolute deviations of every item from median(dataList) are sorted and their median
    is returned: the middle value for an odd number of items, the average of the two middle
    values for an even number.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: median absolute deviation

    @note: Raises IndexError if there are no deviations to take the median of.

    """
    listMedian = median(dataList)

    diffModuli = sorted(math.fabs(item - listMedian) for item in dataList)

    count = len(diffModuli)
    middle = count // 2

    if count % 2 == 1:
        # Odd number of items: the middle element is the median.
        return diffModuli[middle]

    # Even number of items: average the two central elements.
    return (diffModuli[middle - 1] + diffModuli[middle]) / 2.0
217
218
def biweightLocation(dataList, tuningConstant):
    """Calculates the biweight location estimator (like a robust average) of a list of
    numbers.

    Each item gets u = (item - median)/(tuningConstant*MAD). Items with |u| <= 1 are weighted
    by (1 - u**2)**2, and the result is the median plus the weighted mean offset from it.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @type tuningConstant: float
    @param tuningConstant: 6.0 is recommended.
    @rtype: float
    @return: biweight location

    @note: Returns None if an error occurs (MAD() returned 0, or every weight is zero).

    """
    listMedian = median(dataList)
    listMAD = MAD(dataList)

    if listMAD == 0:
        if REPORT_ERRORS:
            print("""ERROR: astStats: biweightLocation() : MAD() returned 0.""")
        return None

    top = 0
    bottom = 0
    for item in dataList:
        u = (item - listMedian) / (tuningConstant * listMAD)
        if math.fabs(u) <= 1.0:
            weight = (1.0 - (u * u)) * (1.0 - (u * u))
            top = top + ((item - listMedian) * weight)
            bottom = bottom + weight

    if bottom == 0:
        # Every item fell on or outside |u| = 1, so there is nothing to average.
        if REPORT_ERRORS:
            print("""ERROR: astStats: biweightLocation() : divide by zero error.""")
        return None

    return listMedian + (top / bottom)
260
261
def biweightScale(dataList, tuningConstant):
    """Calculates the biweight scale estimator (like a robust standard deviation) of a list
    of numbers.

    Each item gets u = (item - median)/(tuningConstant*MAD). Only items with |u| <= 1
    contribute to the sums, and the count of those items sets the sqrt(n) factor.

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @type tuningConstant: float
    @param tuningConstant: 9.0 is recommended.
    @rtype: float
    @return: biweight scale

    @note: Returns None if an error occurs (tuningConstant*MAD is zero, or the denominator
    sum is zero).

    """
    listMedian = median(dataList)
    listMAD = MAD(dataList)

    uScale = tuningConstant * listMAD
    if uScale == 0:
        # Checked once up front rather than catching ZeroDivisionError per item.
        if REPORT_ERRORS:
            print("""ERROR: astStats.biweightScale() : divide by zero error.""")
        return None

    top = 0
    bottom = 0
    valCount = 0
    for item in dataList:
        u = (item - listMedian) / uScale
        if math.fabs(u) <= 1.0:
            diffModulus = math.fabs(item - listMedian)
            u2Term = 1.0 - (u * u)
            u4Term = math.pow(u2Term, 4)
            top = top + ((diffModulus * diffModulus) * u4Term)
            bottom = bottom + (u2Term * (1.0 - (5.0 * (u * u))))
            valCount = valCount + 1

    top = math.sqrt(top)
    bottom = math.fabs(bottom)

    if bottom == 0:
        if REPORT_ERRORS:
            print("""ERROR: astStats.biweightScale() : divide by zero error.""")
        return None

    return math.pow(float(valCount), 0.5) * (top / bottom)
311
312
def biweightClipped(dataList, tuningConstant, sigmaCut):
    """Iteratively calculates biweight location and scale, using sigma clipping, for a list
    of values. The calculation is performed on the first column of a multi-dimensional
    list; other columns are ignored.

    Each pass computes the location and scale of the currently retained values, then
    re-selects from the full dataList every row whose value lies strictly within
    sigmaCut*scale of the location. At most 11 passes are made, and iteration stops once five
    or fewer values remain.

    @type dataList: list
    @param dataList: input data
    @type tuningConstant: float
    @param tuningConstant: 6.0 is recommended for location estimates, 9.0 is recommended for
    scale estimates
    @type sigmaCut: float
    @param sigmaCut: sigma clipping to apply
    @rtype: dictionary
    @return: estimate of biweight location, scale, and list of non-clipped data, in the format
    {'biweightLocation', 'biweightScale', 'dataList'}

    @note: Returns None if an error occurs (the location or scale could not be calculated, or
    dataList holds fewer than 6 values so no pass could be made).

    """

    def _value(row):
        # Rows may be plain numbers or sequences whose first column holds the value.
        if isinstance(row, (list, tuple)):
            return row[0]
        return row

    clippedValues = [_value(row) for row in dataList]
    clippedData = None
    cbi = None
    sbi = None

    iterations = 0
    while iterations < 11 and len(clippedValues) > 5:

        cbi = biweightLocation(clippedValues, tuningConstant)
        sbi = biweightScale(clippedValues, tuningConstant)

        if cbi is None or sbi is None:
            if REPORT_ERRORS:
                print("ERROR: astStats : biweightClipped() :\n"
                      "divide by zero error.")
            return None

        lower = cbi - (sigmaCut * sbi)
        upper = cbi + (sigmaCut * sbi)

        # Clipping always restarts from the full input, not from the previous selection.
        clippedData = [row for row in dataList
                       if _value(row) > lower and _value(row) < upper]
        clippedValues = [_value(row) for row in clippedData]

        iterations = iterations + 1

    if clippedData is None:
        # The loop never ran, so there is no estimate to report.
        if REPORT_ERRORS:
            print("ERROR: astStats : biweightClipped() : dataList contains < 6 items.")
        return None

    return {'biweightLocation': cbi,
            'biweightScale': sbi,
            'dataList': clippedData}
378
379
408
409
def OLSFit(dataList):
    """Performs an ordinary least squares fit on a two dimensional list of numbers.
    Minimum number of data points is 5.

    @type dataList: list
    @param dataList: input data, must be a two dimensional list in format [x, y]
    @rtype: dictionary
    @return: slope and intercept on y-axis, with associated errors, in the format
    {'slope', 'intercept', 'slopeError', 'interceptError'}

    @note: Returns None if an error occurs (fewer than 5 points, or the x values do not vary
    so the fit is undefined).

    """
    n = float(len(dataList))
    if n <= 4:
        if REPORT_ERRORS:
            print("""ERROR: astStats.OLSFit() : dataList contains < 5 items.""")
        return None

    sumX = 0
    sumY = 0
    sumXY = 0
    sumXX = 0
    for item in dataList:
        sumX = sumX + item[0]
        sumY = sumY + item[1]
        sumXY = sumXY + (item[0] * item[1])
        sumXX = sumXX + (item[0] * item[0])

    # Shared denominator of slope, intercept and both errors. It is zero when every x is the
    # same; rounding could in principle push it slightly negative, so reject both cases.
    denominator = (n * sumXX) - (sumX * sumX)
    if denominator <= 0:
        if REPORT_ERRORS:
            print("""ERROR: astStats.OLSFit() : divide by zero error.""")
        return None

    m = ((n * sumXY) - (sumX * sumY)) / denominator
    c = ((sumXX * sumY) - (sumX * sumXY)) / denominator

    sumRes = 0
    for item in dataList:
        residual = item[1] - (m * item[0]) - c
        sumRes = sumRes + (residual * residual)

    sigma = math.sqrt((1.0 / (n - 2)) * sumRes)

    mSigma = (sigma * math.sqrt(n)) / math.sqrt(denominator)
    cSigma = (sigma * math.sqrt(sumXX)) / math.sqrt(denominator)

    return {'slope': m,
            'intercept': c,
            'slopeError': mSigma,
            'interceptError': cSigma}
457
458
def clippedMeanStdev(dataList, sigmaCut=3.0, maxIterations=10):
    """Calculates the clipped mean and stdev of a list of numbers.

    Points further than sigmaCut standard deviations from the current mean are discarded and
    the mean and standard deviation are recalculated, until no point is removed, maxIterations
    passes have been made, or four or fewer points remain. The returned mean and standard
    deviation always describe the points counted in 'numPoints'.

    @type dataList: list
    @param dataList: input data, one dimensional list of numbers
    @type sigmaCut: float
    @param sigmaCut: clipping in Gaussian sigma to apply
    @type maxIterations: int
    @param maxIterations: maximum number of iterations
    @rtype: dictionary
    @return: format {'clippedMean', 'clippedStdev', 'numPoints'}

    """
    values = numpy.array(list(dataList))

    # Computed before the loop so short inputs (which skip the loop) still give a result.
    m = values.mean()
    s = values.std()

    iterations = 0
    while iterations < maxIterations and len(values) > 4:

        # Symmetric clip about the mean; "<=" keeps the data when the scatter is zero.
        kept = values[numpy.abs(values - m) <= sigmaCut * s]
        iterations = iterations + 1

        if len(kept) == 0 or len(kept) == len(values):
            # Either the cut would discard everything, or it has converged.
            break

        values = kept
        m = values.mean()
        s = values.std()

    return {'clippedMean': m, 'clippedStdev': s, 'numPoints': values.shape[0]}
489
490
def clippedWeightedLSFit(dataList, sigmaCut):
    """Performs a weighted least squares fit on a list of numbers with sigma clipping. Minimum
    number of data points is 5.

    Each pass fits the currently retained rows with weightedLSFit(..., "errors"), then
    re-selects from the full dataList every row whose |residual|/(third column) is below
    sigmaCut. At most 11 passes are made, and iteration stops once four or fewer rows remain.

    @type dataList: list
    @param dataList: input data, must be a three dimensional list in format [x, y, y weight]
    @type sigmaCut: float
    @param sigmaCut: sigma clipping to apply
    @rtype: dictionary
    @return: slope and intercept on y-axis, with associated errors, in the format
    {'slope', 'intercept', 'slopeError', 'interceptError'}, plus 'numDataPoints' (the number
    of rows retained after the final clip)

    @note: Returns None if an error occurs (the fit failed, or dataList holds fewer than 5
    rows so no fit could be made).

    """
    clippedValues = list(dataList)
    fitResults = None

    iterations = 0
    while iterations < 11 and len(clippedValues) > 4:

        fitResults = weightedLSFit(clippedValues, "errors")

        # A failed fit comes back as None; the 'slope' check keeps the original guard too.
        if fitResults is None or fitResults['slope'] is None:
            if REPORT_ERRORS:
                print("ERROR: astStats : clippedWeightedLSFit() :\n"
                      "divide by zero error.")
            return None

        slope = fitResults['slope']
        intercept = fitResults['intercept']

        # Clipping always restarts from the full input, not from the previous selection.
        clippedValues = []
        for row in dataList:
            fit = slope * row[0] + intercept
            res = row[1] - fit
            if abs(res) / row[2] < sigmaCut:
                clippedValues.append(row)

        iterations = iterations + 1

    if fitResults is None:
        # The loop never ran, so there is no fit to report.
        if REPORT_ERRORS:
            print("ERROR: astStats : clippedWeightedLSFit() : dataList contains < 5 items.")
        return None

    fitResults['numDataPoints'] = len(clippedValues)

    return fitResults
539
540
def weightedLSFit(dataList, weightType):
    """Performs a weighted least squares fit on a three dimensional list of numbers [x, y, y error].

    @type dataList: list
    @param dataList: input data, must be a three dimensional list in format [x, y, y error]
    @type weightType: string
    @param weightType: if "errors", weights are calculated assuming the input data is in the
    format [x, y, error on y]; if "weights", the weights are assumed to be already calculated and
    stored in a fourth column [x, y, error on y, weight] (as used by e.g. L{astStats.biweightLSFit})
    @rtype: dictionary
    @return: slope and intercept on y-axis, with associated errors, in the format
    {'slope', 'intercept', 'slopeError', 'interceptError'}

    @note: Returns None if an error occurs (a zero denominator or zero y error in either mode,
    or fewer than 5 rows in "weights" mode). Raises ValueError if weightType is neither
    "weights" nor "errors".

    """
    if weightType == "weights":
        n = float(len(dataList))
        if n <= 4:
            if REPORT_ERRORS:
                print("ERROR: astStats.weightedLSFit() :\n"
                      "dataList contains < 5 items.")
            return None

        sumW = 0
        sumWX = 0
        sumWY = 0
        sumWXY = 0
        sumWXX = 0
        for item in dataList:
            W = item[3]
            sumWX = sumWX + (W * item[0])
            sumWY = sumWY + (W * item[1])
            sumWXY = sumWXY + (W * item[0] * item[1])
            sumWXX = sumWXX + (W * item[0] * item[0])
            sumW = sumW + W

        # Slope and intercept share this denominator, so one check covers both.
        denominator = (sumW * sumWXX) - (sumWX * sumWX)
        if denominator == 0:
            if REPORT_ERRORS:
                print("ERROR: astStats.weightedLSFit() : divide by zero error.")
            return None

        # float() keeps this a true division on Python 2 when every input is an integer.
        m = ((sumW * sumWXY) - (sumWX * sumWY)) / float(denominator)
        c = ((sumWXX * sumWY) - (sumWX * sumWXY)) / float(denominator)

        sumRes = 0
        for item in dataList:
            residual = item[1] - (m * item[0]) - c
            sumRes = sumRes + (residual * residual)

        sigma = math.sqrt((1.0 / (n - 2)) * sumRes)

        # NOTE(review): the error terms use the number of points n where the slope uses the
        # sum of weights; kept as in the original -- confirm this is the intended formula.
        errorDenominator = (n * sumWXX) - (sumWX * sumWX)
        if errorDenominator <= 0.0:
            if REPORT_ERRORS:
                print("ERROR: astStats.weightedLSFit()\n"
                      ": divide by zero error.")
            return None

        mSigma = (sigma * math.sqrt(n)) / math.sqrt(errorDenominator)
        cSigma = (sigma * math.sqrt(sumWXX)) / math.sqrt(errorDenominator)

    elif weightType == "errors":
        sumX = 0
        sumY = 0
        sumXY = 0
        sumXX = 0
        sumSigma = 0
        for item in dataList:
            variance = float(item[2] * item[2])
            if variance == 0:
                # A zero y error would give this point infinite weight.
                if REPORT_ERRORS:
                    print("ERROR: astStats.weightedLSFit() : divide by zero error.")
                return None
            sumX = sumX + (item[0] / variance)
            sumY = sumY + (item[1] / variance)
            sumXY = sumXY + ((item[0] * item[1]) / variance)
            sumXX = sumXX + ((item[0] * item[0]) / variance)
            sumSigma = sumSigma + (1.0 / variance)

        delta = (sumSigma * sumXX) - (sumX * sumX)
        if delta <= 0:
            # Zero when the x values do not vary (or dataList is empty).
            if REPORT_ERRORS:
                print("ERROR: astStats.weightedLSFit() : divide by zero error.")
            return None

        m = ((sumSigma * sumXY) - (sumX * sumY)) / delta
        c = ((sumXX * sumY) - (sumX * sumXY)) / delta
        mSigma = math.sqrt(sumSigma / delta)
        cSigma = math.sqrt(sumXX / delta)

    else:
        raise ValueError("weightType must be \"weights\" or \"errors\", got %r" % (weightType,))

    return {'slope': m,
            'intercept': c,
            'slopeError': mSigma,
            'interceptError': cSigma}
644
645
def biweightLSFit(dataList, tuningConstant, sigmaCut=None):
    """Performs a weighted least squares fit, where the weights used are the biweight
    transforms of the residuals to the previous best fit .i.e. the procedure is iterative,
    and converges very quickly (iterations is set to 10 by default). Minimum number of data
    points is 10.

    This seems to give slightly different results to the equivalent R routine, so use at your
    own risk!

    The first fit is an unweighted OLSFit(). Each later pass computes the residuals to the
    current fit, optionally discards points whose |residual| exceeds sigmaCut times the
    standard deviation of the |residuals|, and refits with weightedLSFit(..., "weights").

    @type dataList: list
    @param dataList: input data, must be a three dimensional list in format [x, y, y weight]
    @type tuningConstant: float
    @param tuningConstant: 6.0 is recommended for location estimates, 9.0 is recommended for
    scale estimates
    @type sigmaCut: float
    @param sigmaCut: sigma clipping to apply (set to None if not required)
    @rtype: dictionary
    @return: slope and intercept on y-axis, with associated errors, in the format
    {'slope', 'intercept', 'slopeError', 'interceptError'}

    @note: Returns None if an error occurs (the initial or any weighted fit failed).

    """
    dataCopy = list(dataList)

    # Start from an unweighted fit; its residuals seed the biweights.
    results = OLSFit(dataCopy)
    if results is None:
        return None

    for k in range(10):
        m = results['slope']
        c = results['intercept']
        res = [(m * item[0] + c) - item[1] for item in dataCopy]

        if len(res) <= 5:
            # Too few points to reweight; later passes would change nothing either.
            break

        if sigmaCut is not None:
            sigma = stdev([abs(item) for item in res])
            keptData = []
            keptRes = []
            remaining = len(dataCopy)
            for row, residual in zip(dataCopy, res):
                # Never clip below two points, matching the original length guard.
                if abs(residual) > (sigmaCut * sigma) and remaining > 2:
                    remaining = remaining - 1
                else:
                    keptData.append(row)
                    keptRes.append(residual)
            dataCopy = keptData
            res = keptRes

        weights = biweightTransform(res, tuningConstant)

        wData = []
        for row, weight in zip(dataCopy, weights):
            # NOTE(review): biweightTransform() is not visible here; if it returns one
            # sequence per residual the weight is assumed to be its last element -- confirm.
            if isinstance(weight, (list, tuple)):
                weight = weight[-1]
            if len(row) > 2:
                yColumn = row[2]
            else:
                yColumn = None
            # weightedLSFit(..., "weights") reads the weight from the fourth column.
            wData.append([row[0], row[1], yColumn, weight])

        results = weightedLSFit(wData, "weights")
        if results is None:
            return None

    return {'slope': results['slope'],
            'intercept': results['intercept'],
            'slopeError': results['slopeError'],
            'interceptError': results['interceptError']}
722
723
def cumulativeBinner(data, binMin, binMax, binTotal):
    """Bins the input data cumulatively.

    For each bin, the recorded value is the fraction of all items in data that are greater
    than that bin's lower edge.

    @param data: input data, must be a one dimensional list
    @type binMin: float
    @param binMin: minimum value from which to bin data
    @type binMax: float
    @param binMax: maximum value from which to bin data
    @type binTotal: int
    @param binTotal: number of bins
    @rtype: list
    @return: binned data, in format [bin centre, frequency]

    """
    binStep = float(binMax - binMin) / binTotal
    totalItems = len(data)

    coords = []
    for i in range(binTotal):
        lowerEdge = binMin + (i * binStep)
        count = sum(1 for item in data if item > lowerEdge)
        if totalItems > 0:
            # One division per bin avoids the rounding drift of adding 1/total repeatedly.
            fraction = count / float(totalItems)
        else:
            fraction = 0
        coords.append([binMin + (float(i + 0.5) * binStep), fraction])

    return coords
754
755
def binner(data, binMin, binMax, binTotal):
    """Bins the input data.

    Bin i covers the interval (binMin + i*binStep, binMin + (i+1)*binStep], where
    binStep = (binMax - binMin)/binTotal. The lower edge is exclusive, so an item equal to
    binMin is not counted in any bin.

    @param data: input data, must be a one dimensional list
    @type binMin: float
    @param binMin: minimum value from which to bin data
    @type binMax: float
    @param binMax: maximum value from which to bin data
    @type binTotal: int
    @param binTotal: number of bins
    @rtype: list
    @return: binned data, in format [bin centre, frequency]

    """
    binStep = float(binMax - binMin) / binTotal

    coords = []
    for i in range(binTotal):
        lowerEdge = binMin + (i * binStep)
        upperEdge = binMin + ((i + 1) * binStep)
        count = 0
        for item in data:
            if item > lowerEdge and item <= upperEdge:
                count = count + 1
        coords.append([binMin + (float(i + 0.5) * binStep), count])

    return coords
786
787
def weightedBinner(data, weights, binMin, binMax, binTotal):
    """Bins the input data, recorded frequency is sum of weights in bin.

    Bin i covers the interval (binMin + i*binStep, binMin + (i+1)*binStep], where
    binStep = (binMax - binMin)/binTotal. The lower edge is exclusive, so an item equal to
    binMin is not counted in any bin.

    @param data: input data, must be a one dimensional list
    @param weights: weight of each item in data, paired with data item by item
    @type binMin: float
    @param binMin: minimum value from which to bin data
    @type binMax: float
    @param binMax: maximum value from which to bin data
    @type binTotal: int
    @param binTotal: number of bins
    @rtype: list
    @return: binned data, in format [bin centre, frequency]

    """
    binStep = float(binMax - binMin) / binTotal

    coords = []
    for i in range(binTotal):
        lowerEdge = binMin + (i * binStep)
        upperEdge = binMin + ((i + 1) * binStep)
        weightSum = 0.0
        for item, weight in zip(data, weights):
            if item > lowerEdge and item <= upperEdge:
                weightSum = weightSum + weight
        coords.append([binMin + (float(i + 0.5) * binStep), weightSum])

    return coords
818
819
820