tvaLib
lib.tools_math Namespace Reference

Classes

class  geneticAlg
 Optimisation. More...
 

Functions

def mat_divide (A, scalar)
 Basic math. More...
 
def local_min_max (lists)
 
def gaus_mvgavg (lists, degree=5)
 Filters on data sets. More...
 
def cat_mvgavg (cat_list, window=2, passes=1)
 
def cat_curvy_mvgavg (cat_list, curvy, window=2.0, passes=1)
 
def reduce_sequence (sequence)
 
def running_sum (a)
 
def remap_data (datapoints, old_map, new_map)
 Constructors and generators. More...
 
def cantorID (obj)
 
def pdfToCDF (pdf_y)
 Statistics. More...
 
def ksTest (df1, df2)
 
def getPercentileKeyFromList (items, percentile, sorting=False, sortingColumnIx=0)
 
def getPercintileBinFromList (data, percentile, sampleSize=0)
 
def getPercintileBinsFromList (data, bins=10, includeLeftEdge=False, sampleSize=0)
 
def sample (data, sampleSize=10000, method='random')
 
def combineMean (means, samples)
 
def combineVariance (variances, means, samples)
 
def combineStdDev (stddevs, means, samples)
 
def pdfStatsOnBinnedFreqData (data, centiles=[25, 75])
 
def setTheoryUnionNIndEvents (events, recursionDepthEstimation=10, smallValueLimit=0.0)
 Probability theory. More...
 

Function Documentation

◆ cantorID()

def lib.tools_math.cantorID (   obj)
Cantor unique ID for 2 objects. 

Definition at line 140 of file tools_math.py.

def cantorID(obj):
    ''' Cantor unique ID for 2 objects. '''
    return int((obj[0]+obj[1])*(obj[0]+obj[1]+1)*.5+obj[1])
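A hand-worked sketch of the pairing (assuming the module imports as lib.tools_math):

>>> from lib.tools_math import cantorID
>>> cantorID((3, 4))  # (3+4)*(3+4+1)/2 + 4
32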

◆ cat_curvy_mvgavg()

def lib.tools_math.cat_curvy_mvgavg (   cat_list,
  curvy,
  window = 2.0,
  passes = 1 
)
Similar to cat_mvgavg(), however, the window is treated continuously
    using curvy position.
    
    Note:
    =====
    curvy should have the same dimension as cat_list

Definition at line 84 of file tools_math.py.

def cat_curvy_mvgavg(cat_list, curvy, window=2.0, passes=1):
    ''' Similar to cat_mvgavg(), however, the window is treated continuously
        using curvy position.

        Note:
        =====
        curvy should have the same dimension as cat_list
    '''
    for pa in range(passes):
        smoothed = deepcopy(cat_list)
        for point in range(len(cat_list)):
            lower_bound_check = min(range(len(curvy)), key=lambda x: abs(curvy[x]-(curvy[point]-window)))
            upper_bound_check = min(range(len(curvy)), key=lambda x: abs(curvy[x]-(curvy[point]+window)))+1
            window_values = cat_list[lower_bound_check:upper_bound_check]
            smoothed[point] = max(set(window_values), key=window_values.count)
        cat_list = smoothed
    return cat_list
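An illustrative sketch: with positions packed well inside the window radius, every point sees the whole list, so the minority category is voted out:

>>> from lib.tools_math import cat_curvy_mvgavg
>>> cat_curvy_mvgavg(['a', 'b', 'a'], curvy=[0.0, 0.1, 0.2], window=1.0)
['a', 'a', 'a']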

◆ cat_mvgavg()

def lib.tools_math.cat_mvgavg (   cat_list,
  window = 2,
  passes = 1 
)
Return a list of categories/values smoothed according to a window. 
    window is the search radius on either side

Definition at line 70 of file tools_math.py.

def cat_mvgavg(cat_list, window=2, passes=1):
    ''' Return a list of categories/values smoothed according to a window.
        window is the search radius on either side'''
    for pa in range(passes):
        smoothed = deepcopy(cat_list)
        for point in range(len(cat_list)):
            lower_bound_check = max(0, point-window)
            upper_bound_check = min(len(cat_list)-1, point+window+1)
            window_values = cat_list[lower_bound_check:upper_bound_check]
            smoothed[point] = max(set(window_values), key=window_values.count)
        cat_list = smoothed
    return cat_list
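A hand-worked sketch: a single-point category is replaced by the local majority:

>>> from lib.tools_math import cat_mvgavg
>>> cat_mvgavg(['a', 'a', 'a', 'b', 'a', 'a', 'a'], window=1)
['a', 'a', 'a', 'a', 'a', 'a', 'a']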

◆ combineMean()

def lib.tools_math.combineMean (   means,
  samples 
)
Combine several means given their sample sizes (i.e. a weighted average).
    The order of each list must be consistent.

Definition at line 231 of file tools_math.py.

def combineMean(means, samples):
    ''' Combine several means given their sample sizes (i.e. a weighted average).
        The order of each list must be consistent.
    '''
    try: return sum([mean*sample for mean, sample in zip(means, samples)])/float(sum(samples))
    except ZeroDivisionError: return 0
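A hand-worked sketch: (10*1 + 20*3)/4 = 17.5:

>>> from lib.tools_math import combineMean
>>> combineMean([10.0, 20.0], [1, 3])
17.5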

◆ combineStdDev()

def lib.tools_math.combineStdDev (   stddevs,
  means,
  samples 
)
http://www.emathzone.com/tutorials/basic-statistics/combined-variance.html
    The order of each list must be consistent.

Definition at line 246 of file tools_math.py.

def combineStdDev(stddevs, means, samples):
    ''' http://www.emathzone.com/tutorials/basic-statistics/combined-variance.html
        The order of each list must be consistent.
    '''
    return m.sqrt(combineVariance([m.pow(x, 2) for x in stddevs], means, samples))
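A sketch: the group standard deviations are squared into variances and passed through combineVariance() (31.5 for this input, see below), giving sqrt(31.5):

>>> from lib.tools_math import combineStdDev
>>> round(combineStdDev([2.0, 3.0], [10.0, 20.0], [2, 2]), 3)
5.612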

◆ combineVariance()

def lib.tools_math.combineVariance (   variances,
  means,
  samples 
)
http://www.emathzone.com/tutorials/basic-statistics/combined-variance.html
    The order of each list must be consistent.

Definition at line 238 of file tools_math.py.

def combineVariance(variances, means, samples):
    ''' http://www.emathzone.com/tutorials/basic-statistics/combined-variance.html
        The order of each list must be consistent.
    '''
    Xc = combineMean(means, samples)
    try: return (sum([variance*sample for variance, sample in zip(variances, samples)])+sum([m.pow(mean-Xc, 2)*sample for mean, sample in zip(means, samples)]))/float(sum(samples))
    except ZeroDivisionError: return 0
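A hand-worked sketch: the combined mean is Xc = 15, so (4*2 + 9*2 + 25*2 + 25*2)/4 = 31.5:

>>> from lib.tools_math import combineVariance
>>> combineVariance([4.0, 9.0], [10.0, 20.0], [2, 2])
31.5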

◆ gaus_mvgavg()

def lib.tools_math.gaus_mvgavg (   lists,
  degree = 5 
)

Filters on data sets.

Gaussian moving average smoothing. 

Definition at line 52 of file tools_math.py.

def gaus_mvgavg(lists, degree=5):
    ''' Gaussian moving average smoothing. '''
    window = degree*2-1
    weight = np.array([1.0]*window)
    weightGauss = []
    for i in range(window):
        i = i-degree+1
        frac = i/float(window)
        gauss = 1/(np.exp((4*(frac))**2))
        weightGauss.append(gauss)
    weight = np.array(weightGauss)*weight
    smoothed = [0.0]*(len(lists)-window)
    for i in range(len(smoothed)):
        smoothed[i] = sum(np.array(lists[i:i+window])*weight)/sum(weight)
    return smoothed
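A small sketch; note the output is shorter than the input by window = 2*degree-1 samples:

>>> from lib.tools_math import gaus_mvgavg
>>> [round(x, 2) for x in gaus_mvgavg([0, 0, 3, 0, 0], degree=2)]
[0.38, 2.24]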

◆ getPercentileKeyFromList()

def lib.tools_math.getPercentileKeyFromList (   items,
  percentile,
  sorting = False,
  sortingColumnIx = 0 
)
Find percentile of a list (optionally, of lists). 

Definition at line 163 of file tools_math.py.

def getPercentileKeyFromList(items, percentile, sorting=False, sortingColumnIx=0):
    ''' Find percentile of a list (optionally, of lists). '''
    if(sorting):
        if(type(items[0]) is list): items = sorted(items, key=lambda x: x[sortingColumnIx])
        else: items = sorted(items)
    if(len(items) <= 0): return False
    return items[min(int(round(percentile*(len(items)-1))), len(items)-1)]
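A sketch; note that percentile here is a fraction in [0,1], unlike getPercintileBinFromList() below, which takes [0,100]:

>>> from lib.tools_math import getPercentileKeyFromList
>>> getPercentileKeyFromList([9, 1, 5, 3, 7], 0.5, sorting=True)
5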

◆ getPercintileBinFromList()

def lib.tools_math.getPercintileBinFromList (   data,
  percentile,
  sampleSize = 0 
)
This function finds the percentile bin value for a very large list of floats.
    Input data can be either a list or an n-dimensional array. 
    
    Input:
    ======
    percentile: float in the range [0,100]
    sampleSize: if non-zero, subsample this many observations from data

Definition at line 172 of file tools_math.py.

def getPercintileBinFromList(data, percentile, sampleSize=0):
    ''' This function finds the percentile bin value for a very large list of floats.
        Input data can be either a list or an n-dimensional array.

        Input:
        ======
        percentile: float in the range [0,100]
        sampleSize: if non-zero, subsample this many observations from data
    '''
    if(sampleSize): data = sample(data, sampleSize=sampleSize, method='random')
    return np.percentile(data, percentile)
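A minimal sketch (a thin wrapper over np.percentile, with optional subsampling):

>>> from lib.tools_math import getPercintileBinFromList
>>> getPercintileBinFromList([1, 2, 3, 4, 5], 50)
3.0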

◆ getPercintileBinsFromList()

def lib.tools_math.getPercintileBinsFromList (   data,
  bins = 10,
  includeLeftEdge = False,
  sampleSize = 0 
)
This function finds the percentile bin values for a very large list of 
    floats. Input data can be either a list or an n-dimensional array. 
    includeLeftEdge returns an additional edge bin at the beginning.
    
    Example:
    ========
    >>> getPercintileBinsFromList([1,2,3,4,5,6,7,8,9])
    [1.8, 2.6000000000000001, 3.3999999999999999, 4.2000000000000002, 5.0, 5.7999999999999998, 6.5999999999999996, 7.4000000000000004, 8.1999999999999993, 9]

Definition at line 185 of file tools_math.py.

def getPercintileBinsFromList(data, bins=10, includeLeftEdge=False, sampleSize=0):
    ''' This function finds the percentile bin values for a very large list of
        floats. Input data can be either a list or an n-dimensional array.
        includeLeftEdge returns an additional edge bin at the beginning.

        Example:
        ========
        >>> getPercintileBinsFromList([1,2,3,4,5,6,7,8,9])
        [1.8, 2.6000000000000001, 3.3999999999999999, 4.2000000000000002, 5.0, 5.7999999999999998, 6.5999999999999996, 7.4000000000000004, 8.1999999999999993, 9]
    '''
    binsize = 100.0/bins
    binsize_incr = 0
    if(includeLeftEdge): return_list = [np.percentile(data, 0)]
    else: return_list = []
    for i in range(bins):
        binsize_incr += binsize
        return_list.append(getPercintileBinFromList(data, min(100,binsize_incr), sampleSize=sampleSize))
    return return_list

◆ ksTest()

def lib.tools_math.ksTest (   df1,
  df2 
)
Kolmogorov-Smirnov test between two distribution functions. 
    Automatically checks for pdf or cdf. 

Definition at line 156 of file tools_math.py.

def ksTest(df1, df2):
    ''' Kolmogorov-Smirnov test between two distribution functions.
        Automatically checks for pdf or cdf. '''
    if(round(sum(df1), 2) == 1): df1 = pdfToCDF(df1)
    if(round(sum(df2), 2) == 1): df2 = pdfToCDF(df2)
    return max([abs(x-y) for x, y in zip(df1, df2)])
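A hand-worked sketch with dyadic probabilities (both inputs sum to 1, so they are converted to CDFs internally):

>>> from lib.tools_math import ksTest
>>> ksTest([0.25, 0.25, 0.5], [0.5, 0.25, 0.25])
0.25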

◆ local_min_max()

def lib.tools_math.local_min_max (   lists)
Return local min/max locations.
    It is suggested to run gaus_mvgavg(lists) on the data first.
    Returns: maxs, mins, nmax, nmin. 

Definition at line 28 of file tools_math.py.

def local_min_max(lists):
    ''' Return local min/max locations.
        It is suggested to run gaus_mvgavg(lists) on the data first.
        Returns: maxs, mins, nmax, nmin. '''
    gradients = np.diff(lists)
    maxima_num = 0
    minima_num = 0
    max_locations = []
    min_locations = []
    count = 0
    for i in gradients[:-1]:
        count += 1
        if((cmp(i,0) > 0) & (cmp(gradients[count],0) < 0) & (i != gradients[count])):
            maxima_num += 1
            max_locations.append(count)
        if((cmp(i,0) < 0) & (cmp(gradients[count],0) > 0) & (i != gradients[count])):
            minima_num += 1
            min_locations.append(count)
    return max_locations, min_locations, maxima_num, minima_num
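A hand-worked sketch; note the source targets Python 2 (it relies on the built-in cmp(), which Python 3 removed):

>>> from lib.tools_math import local_min_max
>>> local_min_max([0, 1, 2, 1, 0, 1, 2])  # peak at index 2, valley at index 4
([2], [4], 1, 1)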

◆ mat_divide()

def lib.tools_math.mat_divide (   A,
  scalar 
)

Basic math.

Element-wise division of a matrix (list of rows). Note that, as written, the second argument must itself be iterable (each row is zipped against it); despite the name, passing a bare number raises TypeError.

Definition at line 22 of file tools_math.py.

def mat_divide(A, scalar):
    ''' Divide list by scalar. '''
    S = [scalar] * len(A)
    return [[n/d for n, d in zip(rA, rS)] for rA, rS in zip(A, S)]
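A sketch of the behaviour as written; the divisor is zipped against each row, so it must be iterable:

>>> from lib.tools_math import mat_divide
>>> mat_divide([[2.0, 4.0], [6.0, 8.0]], [2.0, 2.0])
[[1.0, 2.0], [3.0, 4.0]]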

◆ pdfStatsOnBinnedFreqData()

def lib.tools_math.pdfStatsOnBinnedFreqData (   data,
  centiles = [25, 75] 
)
For a list of bins with lists of data points, return mean, median, 
    std. dev. and percentile measures.
    
    http://en.wikipedia.org/wiki/Percentile#Weighted_percentile

Definition at line 254 of file tools_math.py.

def pdfStatsOnBinnedFreqData(data, centiles=[25,75]):
    ''' For a list of bins with lists of data points, return mean, median,
        std. dev. and percentile measures.

        http://en.wikipedia.org/wiki/Percentile#Weighted_percentile
    '''
    mean_ = [np.mean(x) for x in data]
    mean = [x/sum(mean_) for x in mean_]
    median = [np.median(x) for x in data]
    median = [x/sum(median) for x in median]
    # Weighted percentile
    percent_low = [np.percentile(x,centiles[0]) for x in data]
    percent_low = [x/sum(percent_low) for x in percent_low]
    percent_high = [np.percentile(x,centiles[1]) for x in data]
    percent_high = [x/sum(percent_high) for x in percent_high]
    stddev = [np.std(x) for x in data]
    stddev = [x/sum(mean_) for x in stddev]
    return mean, median, percent_low, percent_high, stddev
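A small sketch; each returned list is normalized so its entries sum to 1 across bins:

>>> from lib.tools_math import pdfStatsOnBinnedFreqData
>>> mean, median, p_low, p_high, stddev = pdfStatsOnBinnedFreqData([[1, 2, 3], [4, 5, 6]])
>>> [round(x, 3) for x in mean]
[0.286, 0.714]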

◆ pdfToCDF()

def lib.tools_math.pdfToCDF (   pdf_y)

Statistics.

Convert the y values of a PDF distribution (i.e. sum(pdf_y)==1) to a CDF. 

Definition at line 149 of file tools_math.py.

def pdfToCDF(pdf_y):
    ''' Convert the y values of a PDF distribution (i.e. sum(pdf_y)==1) to a CDF. '''
    for pdf_yi in pdf_y:
        try: cdf.append(cdf[-1]+pdf_yi)
        except NameError: cdf = [pdf_yi]
    return cdf
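A hand-worked sketch:

>>> from lib.tools_math import pdfToCDF
>>> pdfToCDF([0.25, 0.25, 0.5])
[0.25, 0.5, 1.0]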

◆ reduce_sequence()

def lib.tools_math.reduce_sequence (   sequence)
In a sequence of repeating integers, collapse each run of a repeated
    value by that value: every n consecutive occurrences of n are
    reduced to a single occurrence.
    
    Example:
    ========
    >>> reduce_sequence([1,3,3,3,2,2,2,2])
    [1,3,2,2]

Definition at line 103 of file tools_math.py.

def reduce_sequence(sequence):
    ''' In a sequence of repeating integers, collapse each run of a repeated
        value by that value: every n consecutive occurrences of n are
        reduced to a single occurrence.

        Example:
        ========
        >>> reduce_sequence([1,3,3,3,2,2,2,2])
        [1,3,2,2]
    '''
    for i in range(len(sequence)):
        if(sequence[i] != None):
            for j in range(sequence[i]):
                if(j != 0 and i+j+1 <= len(sequence) and sequence[i+j] == sequence[i]):
                    sequence[i+j] = None
    return filter(None, sequence)

◆ remap_data()

def lib.tools_math.remap_data (   datapoints,
  old_map,
  new_map 
)

Constructors and generators.

Definition at line 133 of file tools_math.py.

def remap_data(datapoints, old_map, new_map):
    scale_multiplier = (new_map[-1]-new_map[0])/(old_map[-1]-old_map[0])
    for i in range(len(datapoints)):
        datapoints[i] = new_map[0] + (datapoints[i]-old_map[0])*scale_multiplier
    return datapoints
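A sketch: rescale points from the range [0,10] to [0,100] (note the input list is modified in place):

>>> from lib.tools_math import remap_data
>>> remap_data([0.0, 5.0, 10.0], old_map=[0.0, 10.0], new_map=[0.0, 100.0])
[0.0, 50.0, 100.0]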

◆ running_sum()

def lib.tools_math.running_sum (   a)
Cumulative sum of a list. Returns a generator. Use list(running_sum(a))
    to obtain a list.

Definition at line 120 of file tools_math.py.

def running_sum(a):
    ''' Cumulative sum of a list. Returns a generator. Use list(running_sum(a))
        to obtain a list.
    '''
    tot = 0
    for item in a:
        tot += item
        yield tot
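A minimal sketch:

>>> from lib.tools_math import running_sum
>>> list(running_sum([1, 2, 3, 4]))
[1, 3, 6, 10]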

◆ sample()

def lib.tools_math.sample (   data,
  sampleSize = 10000,
  method = 'random' 
)
Use this function to sub-sample or over-sample a very large list 
    (data).

    Supported sampling methods are:
    ===============================
    random:        randomly chosen using a uniform distribution
    interpolation: interpolated linearly (always returns the same values 
                   for the same list and same sample size). Only supports
                   sub-sampling, by design.

Definition at line 206 of file tools_math.py.

def sample(data, sampleSize=10000, method='random'):
    ''' Use this function to sub-sample or over-sample a very large list
        (data).

        Supported sampling methods are:
        ===============================
        random:        randomly chosen using a uniform distribution
        interpolation: interpolated linearly (always returns the same values
                       for the same list and same sample size). Only supports
                       sub-sampling, by design.
    '''
    if(method=='random'):
        return_list = []
        max_index = len(data)-1
        for i in range(sampleSize):
            return_list.append(data[int(round(random() * max_index))])
        return return_list
    elif(method=='interpolation'):
        if(sampleSize > len(data)): return data
        equiv_indeces = np.arange(0,len(data)-1,len(data)/float(sampleSize)).round().astype(int).tolist()
        return [data[i] for i in equiv_indeces]
    else: return None
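A sketch of the deterministic interpolation method (the random method instead draws with replacement via random()):

>>> from lib.tools_math import sample
>>> sample([10, 20, 30, 40, 50], sampleSize=3, method='interpolation')
[10, 30, 40]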

◆ setTheoryUnionNIndEvents()

def lib.tools_math.setTheoryUnionNIndEvents (   events,
  recursionDepthEstimation = 10,
  smallValueLimit = 0.0 
)

Probability theory.

Estimate the probability of the union of N independent events using set
    theory, the inclusion-exclusion principle, and Bonferroni inequalities,
    where events is a list of probabilities for each event
    http://statistics.about.com/od/Formulas/a/Probability-Of-The-Union-Of-Three-Or-More-Sets.htm
    https://en.wikipedia.org/wiki/Inclusion%E2%80%93exclusion_principle
    https://en.wikipedia.org/wiki/Boole%27s_inequality#Bonferroni_inequalities
    
    If recursionDepthEstimation=1, Boole's inequality/2 will be returned.
    
    This calculation is exponentially expensive as the size of the list of
    events increases. For sets larger than 5-10, it is suggested to
    assume small values are mutually exclusive using smallValueLimit
    to reduce the complexity of event intersections.
    
    A hard limit on recursion (recursionDepthEstimation) also exists to avoid
    complete lockup, but this will likely lead to an unpredictable answer.

Definition at line 280 of file tools_math.py.

def setTheoryUnionNIndEvents(events, recursionDepthEstimation=10, smallValueLimit=0.0):
    ''' Estimate the probability of the union of N independent events using set
        theory, the inclusion-exclusion principle, and Bonferroni inequalities,
        where events is a list of probabilities for each event
        http://statistics.about.com/od/Formulas/a/Probability-Of-The-Union-Of-Three-Or-More-Sets.htm
        https://en.wikipedia.org/wiki/Inclusion%E2%80%93exclusion_principle
        https://en.wikipedia.org/wiki/Boole%27s_inequality#Bonferroni_inequalities

        If recursionDepthEstimation=1, Boole's inequality/2 will be returned.

        This calculation is exponentially expensive as the size of the list of
        events increases. For sets larger than 5-10, it is suggested to
        assume small values are mutually exclusive using smallValueLimit
        to reduce the complexity of event intersections.

        A hard limit on recursion (recursionDepthEstimation) also exists to avoid
        complete lockup, but this will likely lead to an unpredictable answer.
    '''
    if(type(events) is not list): return None
    mutExclusiveEvents = []
    if(smallValueLimit):
        mutExclusiveEvents = [event for event in events if event <= smallValueLimit]
        events = [event for event in events if event > smallValueLimit]
    if(sum(mutExclusiveEvents) > 1): return 1
    if(len(events) <= 1): return sum(events)+sum(mutExclusiveEvents)
    probability = 0
    bonferroni_Inequality_upper = 1-sum(mutExclusiveEvents)
    bonferroni_Inequality_lower = 0

    if(len(events) > 20): recursionDepthEstimation = min(recursionDepthEstimation,7)
    if(len(events) > 60): recursionDepthEstimation = min(recursionDepthEstimation,3)
    if(len(events) > 250): recursionDepthEstimation = min(recursionDepthEstimation,2)
    # The number of steps is dependent on the number of events
    for nIx in range(min(len(events),recursionDepthEstimation)):
        ''' Calculate the total probability of the intersection of nIx+1 events,
            across the entire range of events.

            Example:
            ========
            For probabilities A, B, C, D and a set size of 2, get:
                A*B + A*C + A*D + B*C + B*D + C*D
            For probabilities A, B, C, D and a set size of 3, get:
                A*B*C + A*B*D + A*C*D + B*C*D
            For probabilities A, B, C, D and a set size of 4, get:
                A*B*C*D
        '''
        subProb = sum([reduce(lambda x, y: x*y, comb) for comb in itertools_combinations(events, nIx+1)])
        # For even steps, add the sum of the sub steps to the final value
        if(nIx % 2 == 0):
            probability += subProb
            bonferroni_Inequality_upper = min(bonferroni_Inequality_upper, probability)
        # For odd steps, subtract the sum of the sub steps from the final value
        else:
            probability -= subProb
            bonferroni_Inequality_lower = max(bonferroni_Inequality_lower, probability)
    return (bonferroni_Inequality_upper+bonferroni_Inequality_lower)/2.0+sum(mutExclusiveEvents)
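A hand-worked sketch: for [0.1, 0.2, 0.3] the exact independent union is 1 - 0.9*0.8*0.7 = 0.496. The function returns the average of its Bonferroni upper and lower bounds, so the result is an estimate even when the inclusion-exclusion expansion runs to full depth:

>>> from lib.tools_math import setTheoryUnionNIndEvents
>>> round(setTheoryUnionNIndEvents([0.1, 0.2, 0.3]), 3)
0.493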