tvaLib
tools_math.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 # tvaLib Copyright (c) 2012-2016 Paul G. St-Aubin
3 # Ecole Polytechnique de Montreal, McGill University
4 # Python 2.7; (dt) Spyder Windows 10 64-bit; ipython Ubuntu 15.04 64-bit
5 
9 from copy import deepcopy
10 import math as m
11 from random import random
12 from random import uniform as random_uniform
13 from random import randint as random_randint
14 from itertools import combinations as itertools_combinations
15 
16 try: import numpy as np
17 except Exception: raise Exception, [0101, 'Numpy is not installed.']
18 
19 
22 def mat_divide(A, scalar):
23  ''' Divide list by scalar. '''
24 
25  S = [scalar] * len(A)
26  return [[n/d for n, d in zip(rA, rS)] for rA, rS in zip(A, S)]
27 
28 def local_min_max(lists):
29  ''' Return local min/max
30  Suggested using gaus_mvgavg(lists) first
31  Returns: maxs,mins,nmax,nmin. '''
32 
33  gradients=np.diff(lists)
34  maxima_num=0
35  minima_num=0
36  max_locations=[]
37  min_locations=[]
38  count=0
39  for i in gradients[:-1]:
40  count+=1
41  if ((cmp(i,0)>0) & (cmp(gradients[count],0)<0) & (i != gradients[count])):
42  maxima_num+=1
43  max_locations.append(count)
44  if ((cmp(i,0)<0) & (cmp(gradients[count],0)>0) & (i != gradients[count])):
45  minima_num+=1
46  min_locations.append(count)
47  return max_locations, min_locations, maxima_num, minima_num
48 
49 
52 def gaus_mvgavg(lists, degree=5):
53  ''' Gaussian moving average smoothing. '''
54 
55  window=degree*2-1
56  weight=np.array([1.0]*window)
57  weightGauss=[]
58  for i in range(window):
59  i=i-degree+1
60  frac=i/float(window)
61  gauss=1/(np.exp((4*(frac))**2))
62  weightGauss.append(gauss)
63  weight=np.array(weightGauss)*weight
64  smoothed=[0.0]*(len(lists)-window)
65  for i in range(len(smoothed)):
66  smoothed[i]=sum(np.array(lists[i:i+window])*weight)/sum(weight)
67  return smoothed
68 
69 
70 def cat_mvgavg(cat_list, window=2, passes=1):
71  ''' Return a list of categories/values smoothed according to a window.
72  window is the search radius on either side'''
73  for pa in range(passes):
74  smoothed = deepcopy(cat_list)
75  for point in range(len(cat_list)):
76  lower_bound_check = max(0,point-window)
77  upper_bound_check = min(len(cat_list)-1,point+window+1)
78  window_values = cat_list[lower_bound_check:upper_bound_check]
79  smoothed[point] = max(set(window_values), key=window_values.count)
80  cat_list = smoothed
81  return cat_list
82 
83 
84 def cat_curvy_mvgavg(cat_list, curvy, window=2.0, passes=1):
85  ''' Similar to cat_mvgavg(), however, the window is treated continously
86  using curvy position.
87 
88  Note:
89  =====
90  curvy should have the same dimension as cat_list
91  '''
92  for pa in range(passes):
93  smoothed = deepcopy(cat_list)
94  for point in range(len(cat_list)):
95  lower_bound_check = min(range(len(curvy)), key=lambda x:abs(curvy[x]-(curvy[point]-window)))
96  upper_bound_check = min(range(len(curvy)), key=lambda x:abs(curvy[x]-(curvy[point]+window)))+1
97  window_values = cat_list[lower_bound_check:upper_bound_check]
98  smoothed[point] = max(set(window_values), key=window_values.count)
99  cat_list = smoothed
100  return cat_list
101 
102 
103 def reduce_sequence(sequence):
104  ''' In a sequence of repeating integers, reduce the repetition by the
105  value of that series of repeating integers within the sequence.
106 
107  Example:
108  ========
109  >>>reduce_sequence([1,3,3,3,2,2,2,2])
110  [1,3,2,2]
111  '''
112  for i in range(len(sequence)):
113  if(sequence[i] != None):
114  for j in range(sequence[i]):
115  if(j!=0 and i+j+1 <= len(sequence) and sequence[i+j] == sequence[i]):
116  sequence[i+j] = None
117  return filter(None, sequence)
118 
119 
120 def running_sum(a):
121  ''' Cumulative sum of a list. Returns a generator. Use list(running_sum(a))
122  to obtain a list.
123  '''
124  tot = 0
125  for item in a:
126  tot += item
127  yield tot
128 
129 
130 
133 def remap_data(datapoints, old_map, new_map):
134  scale_multiplier = (new_map[-1]-new_map[0])/(old_map[-1]-old_map[0])
135  for i in range(len(datapoints)):
136  datapoints[i] = new_map[0] + (datapoints[i]-old_map[0])*scale_multiplier
137  return datapoints
138 
139 
140 def cantorID(obj):
141  ''' Cantor unique ID for 2 objects. '''
142 
143  return int((obj[0]+obj[1])*(obj[0]+obj[1]+1)*.5+obj[1])
144 
145 
146 
149 def pdfToCDF(pdf_y):
150  ''' Convert y values of a PDF distribution (i.e. sum(pdf_y)==1). '''
151  for pdf_yi in pdf_y:
152  try: cdf.append(cdf[-1]+pdf_yi)
153  except NameError: cdf = [pdf_yi]
154  return cdf
155 
156 def ksTest(df1, df2):
157  ''' Kolmogorov-Smirinov test between two distribution functions.
158  Automatically checks for pdf or cdf. '''
159  if(round(sum(df1),2) == 1): df1 = pdfToCDF(df1)
160  if(round(sum(df2),2) == 1): df2 = pdfToCDF(df2)
161  return max([abs(x-y) for x,y in zip(df1,df2)])
162 
163 def getPercentileKeyFromList(items, percentile, sorting=False, sortingColumnIx=0):
164  ''' Find percentile of a list (optionally, of lists). '''
165 
166  if(sorting):
167  if(type(items[0]) is list): items = sorted(items, key=lambda x: x[sortingColumnIx])
168  else: items = sorted(items)
169  if(len(items) <= 0): return False
170  return items[min(int(round(percentile*(len(items)-1))),len(items)-1)]
171 
172 def getPercintileBinFromList(data, percentile, sampleSize=0):
173  ''' This function finds the percentile bin values for a very large list of floats.
174  Input data can be either a list or a n-dimension array.
175 
176  Input:
177  ======
178  percentile: float in range of [0,100]
179  sampleSize: if non zero, subsample this many observations from data
180  '''
181 
182  if(sampleSize): data = sample(data, sampleSize=sampleSize, method='random')
183  return np.percentile(data, percentile)
184 
185 def getPercintileBinsFromList(data, bins=10, includeLeftEdge=False, sampleSize=0):
186  ''' This function finds the percentile bin values for a very large list of
187  floats. Input data can be either a list or a n-dimension array.
188  includeLeftEdge returns an additional edge bin at the begining.
189 
190  Example:
191  ========
192  >>> getPercintileBinsFromList([1,2,3,4,5,6,7,8,9])
193  [1.8, 2.6000000000000001, 3.3999999999999999, 4.2000000000000002, 5.0, 5.7999999999999998, 6.5999999999999996, 7.4000000000000004, 8.1999999999999993, 9]
194 
195  '''
196 
197  binsize = 100.0/bins
198  binsize_incr = 0
199  if(includeLeftEdge): return_list = [np.percentile(data, 0)]
200  else: return_list = []
201  for i in range(bins):
202  binsize_incr += binsize
203  return_list.append(getPercintileBinFromList(data, min(100,binsize_incr), sampleSize=sampleSize))
204  return return_list
205 
206 def sample(data, sampleSize=10000, method='random'):
207  ''' Use this function to sub-sample or over-sample a very large list
208  (data).
209 
210  Supported sampling methods are:
211  ===============================
212  random: randomly chosen using a uniform distribution
213  interpolation: interpolated linearly (always returns the same values
214  for the same list and same sample size). Only supports
215  sub-sampling, by design.
216  '''
217 
218  if(method=='random'):
219  return_list = []
220  max_index = len(data)-1
221  for i in range(sampleSize):
222  return_list.append(data[int(round(random() * max_index))])
223  return return_list
224  elif(method=='interpolation'):
225  if(sampleSize > len(data)): return data
226  equiv_indeces = np.arange(0,len(data)-1,len(data)/float(sampleSize)).round().astype(int).tolist()
227  return [data[i] for i in equiv_indeces]
228  else: return None
229 
230 
231 def combineMean(means, samples):
232  ''' Combine various means knowing their sample size. AKA Weighted average.
233  The order of each list must be consistent.
234  '''
235  try: return sum([mean*sample for mean,sample in zip(means,samples)])/float(sum(samples))
236  except ZeroDivisionError: return 0
237 
238 def combineVariance(variances, means, samples):
239  ''' http://www.emathzone.com/tutorials/basic-statistics/combined-variance.html
240  The order of each list must be consistent.
241  '''
242  Xc = combineMean(means, samples)
243  try: return (sum([variance*sample for variance,sample in zip(variances,samples)])+sum([m.pow(mean-Xc,2)*sample for mean,sample in zip(means,samples)]))/float(sum(samples))
244  except ZeroDivisionError: return 0
245 
246 def combineStdDev(stddevs, means, samples):
247  ''' http://www.emathzone.com/tutorials/basic-statistics/combined-variance.html
248  The order of each list must be consistent.
249  '''
250  return m.sqrt(combineVariance([m.pow(x,2) for x in stddevs], means, samples))
251 
252 
253 
254 def pdfStatsOnBinnedFreqData(data, centiles=[25,75]):
255  ''' For a list of bins with lists of data points, return mean, median,
256  std. dev. and percentile measures.
257 
258  http://en.wikipedia.org/wiki/Percentile#Weighted_percentile
259  '''
260 
261  mean_ = [np.mean(x) for x in data]
262  mean = [x/sum(mean_) for x in mean_]
263  median = [np.median(x) for x in data]
264  median = [x/sum(median) for x in median]
265  #Weighted percentile
266  percent_low = [np.percentile(x,centiles[0]) for x in data]
267  percent_low = [x/sum(percent_low) for x in percent_low]
268  percent_high = [np.percentile(x,centiles[1]) for x in data]
269  percent_high = [x/sum(percent_high) for x in percent_high]
270  stddev = [np.std(x) for x in data]
271  stddev = [x/sum(mean_) for x in stddev]
272 
273  #import pdb; pdb.set_trace()
274 
275  return mean, median, percent_low, percent_high, stddev
276 
277 
280 def setTheoryUnionNIndEvents(events, recursionDepthEstimation=10, smallValueLimit=0.0):
281  ''' Estimate the probability of union of N independant events using set
282  theory, the exclusion-inclusion principle, and bonferroni inequalities
283  where events is a list of probabilities for each event
284  http://statistics.about.com/od/Formulas/a/Probability-Of-The-Union-Of-Three-Or-More-Sets.htm
285  https://en.wikipedia.org/wiki/Inclusion%E2%80%93exclusion_principle
286  https://en.wikipedia.org/wiki/Boole%27s_inequality#Bonferroni_inequalities
287 
288  If recursionDepthEstimation=1, Boole's inequality/2 will be returned.
289 
290  This calculation is exponentially expensive as the size of the list of
291  events increases. For sets larger than 5-10, it is suggestested to
292  assume small values are mutually exclusive using simplifySmallValueCalculation
293  to reduce the complexity of event intersections.
294 
295  A hard limit on recursion (recursionDepthEstimation) also exists to avoid
296  complete lockup, but this will likely lead to an unpredictable answer.
297  '''
298  if(type(events) is not list): return None
299  mutExclusiveEvents = []
300  if(smallValueLimit):
301  mutExclusiveEvents = [event for event in events if event <= smallValueLimit]
302  events = [event for event in events if event > smallValueLimit]
303  if(sum(mutExclusiveEvents) > 1): return 1
304  if(len(events) <= 1): return sum(events)+sum(mutExclusiveEvents)
305  probability = 0
306  bonferroni_Inequality_upper = 1-sum(mutExclusiveEvents)
307  bonferroni_Inequality_lower = 0
308 
309 
310  if(len(events) > 20): recursionDepthEstimation = min(recursionDepthEstimation,7)
311  if(len(events) > 60): recursionDepthEstimation = min(recursionDepthEstimation,3)
312  if(len(events) > 250): recursionDepthEstimation = min(recursionDepthEstimation,2)
313  # The number of steps is dependant on the number of events
314  for nIx in range(min(len(events),recursionDepthEstimation)):
315  ''' Calculate the total probability of the intersection of nIx+1 events,
316  across the entire range of events.
317 
318  Example:
319  ========
320  For probabilities A, B, C, D and a set size of 2, get:
321  A*B + A*C + A*D + B*C + B*D + C*D
322  For probabilities A, B, C, D and a set size of 3, get:
323  A*B*C + A*B*D + A*C*D +B*C*D
324  For probabilities A, B, C, D and a set size of 4, get:
325  A*B*C*D
326  '''
327  subProb = sum([reduce(lambda x, y: x*y, comb) for comb in itertools_combinations(events, nIx+1)])
328  #For even steps, add the sum of the sub steps to the final value
329  if(nIx % 2 == 0):
330  probability += subProb
331  bonferroni_Inequality_upper = min(bonferroni_Inequality_upper, probability)
332  #For odd steps, subtract the sum of the sub steps from the final value
333  else:
334  probability -= subProb
335  bonferroni_Inequality_lower = max(bonferroni_Inequality_lower, probability)
336  #if(nIx % 2 == 0): print(str(nIx)+' -> '+str(probability)+' ('+str(subProb)+') upper_bound='+str(bonferroni_Inequality_upper))
337  #else: print(str(nIx)+' -> '+str(probability)+' (-'+str(subProb)+') lower_bound='+str(bonferroni_Inequality_lower))
338  return (bonferroni_Inequality_upper+bonferroni_Inequality_lower)/2.0+sum(mutExclusiveEvents)
339 
340 
344  def __init__(self, parameters, populationSize):
345  ''' Initialise a genetic algorythm pool
346 
347  Input:
348  ======
349  parameters = {'name': [min,max,type], ...}
350  '''
351  self.populationSize = populationSize
352  self.parameters = parameters
353  self.chromosomes = []
355  self.linearIter = 0
356 
357  for i in range(self.populationSize):
358  spawn = self.spawn()
359  self.chromosomes.append({'params': spawn, 'score': None, 'code': self.encode(spawn)})
360  return
361 
362  def __repr__(self): return 'GA population '+str(self.chromosomes)
363  def __str__(self): return 'GA population '+str(self.chromosomes)
364  def __len__(self): return len(self.chromosomes)
365  def __getitem__(self, i): return self.chromosomes[i]
366  def __iter__(self): return iter(self.chromosomes)
367  def append(self,value): return self.chromosomes.append(value)
368  def reverse(self): self.chromosomes.reverse();return None
369  def count(self, value): return self.chromosomes.count(value)
370 
371  def setScore(self, Ix, value): self.chromosomes[Ix]['score'] = value;return None
372  def orderChromosomes(self, r=True): self.chromosomes.sort(key=lambda x: x['score'], reverse=r)
373 
374  def spawn(self):
375  ''' If only one parameter is set, this will spawn chromosomes linearly,
376  otherwise, selection will be random.
377 
378  Linear search may be usefull for sensitivity analysis.
379  '''
380  if(len(self.parameters)==1):
381  parameterName = self.parameters.keys()[0]
382  self.linearIter += 1
383  if(self.parameters[parameterName][2] == float): return {parameterName: self.parameters[parameterName][0]+(self.parameters[parameterName][1]-self.parameters[parameterName][0])*(float((self.linearIter-1)%self.populationSize)/self.populationSize)}
384  elif(self.parameters[parameterName][2] == int): return {parameterName: range(self.parameters[parameterName][0],self.parameters[parameterName][1]+1)[(self.linearIter)%(self.parameters[parameterName][1]-self.parameters[parameterName][0])]}
385 
386  spawn = {}
387  for parameterName in self.parameters:
388  if(self.parameters[parameterName][2] == float): spawn[parameterName] = random_uniform(self.parameters[parameterName][0],self.parameters[parameterName][1])
389  elif(self.parameters[parameterName][2] == int): spawn[parameterName] = random_randint(self.parameters[parameterName][0],self.parameters[parameterName][1])
390  return spawn
391 
392  def encode(self, spawn):
393  ''' Encode parameters into GA code. '''
394  code = ''
395  for parameterName in self.parameters:
396  if(self.parameters[parameterName][2] == float): code += format(int(spawn[parameterName]*10000), '020b')
397  elif(self.parameters[parameterName][2] == int): code += format(spawn[parameterName], '020b')
398 
399  return code
400 
401  def decode(self, code):
402  ''' Decode parameters from GA code. TODO: '''
403  spawn = {}
404  return spawn
405 
406  def naturalSelection(self, recipe=0, scoreThreshold=0.2):
407 
408  if(recipe==0):
409  #Natural Death
410  nd = self.naturalDeath(scoreThreshold=scoreThreshold)
411  #Roulette Wheel Selection
412  rs = self.rouletteWheelSelection(survivorRate=0.6)
413  #Crossover
414  co = self.crossover(repopShare=0.6)
415  #Mutation
416  mu = self.mutation(mutationRate=0.05)
417  #Repopulate remainder with spawns
418  i=-1
419  for i in range(self.populationSize-len(self.chromosomes)):
420  spawn = self.spawn()
421  self.chromosomes.append({'params': spawn, 'score': None, 'code': self.encode(spawn)})
422 
423  self.nsStats = {'nd':nd, 'rs':rs, 'co':co, 'mu':mu, 'spawn':i+1}
424  return True
425 
426  def naturalSelectionStats(self, justification=3):
427  try: return 'Nat.deaths: '+str(self.nsStats['nd']).ljust(justification)+' Unfit: '+str(self.nsStats['rs']).ljust(justification)+' Crossover: '+str(self.nsStats['co']).ljust(justification)+' Mutation: '+str(self.nsStats['mu']).ljust(justification)+' Spawn: '+str(self.nsStats['spawn']).ljust(justification)
428  except NameError: return 'Natural selection hasn\'t been performed yet.'
429 
430 
431  def naturalDeath(self, scoreThreshold=0):
432  ''' Kill off any specimens with a score of 0. These specimens do not
433  participate in any selection.
434  '''
435  j = 0
436  for i in range(len(self.chromosomes)):
437  if(self.fitnessMaximiseScore and self.chromosomes[i]['score'] <= scoreThreshold):
438  self.chromosomes[i] = None
439  j += 1
440  elif(not self.fitnessMaximiseScore and self.chromosomes[i]['score'] >= scoreThreshold):
441  self.chromosomes[i] = None
442  j += 1
443  self.chromosomes = filter(None, self.chromosomes)
444  return j
445 
446  def crossover(self, repopShare=0.6):
447  ''' Repopulate dead population using most fit specimens. Repopulation
448  caps at a share percentage of dead population (rounded down).
449  '''
450  self.orderChromosomes(r=True)
451  self.crossovers = []
452  remainder = self.populationSize - len(self.chromosomes)
453  i=0
454  while len(self.crossovers) < int(remainder*repopShare):
455  try:
456  spawn = {}
457  for parameterName in self.chromosomes[i]['params']:
458  if(self.chromosomes[i]['params'][parameterName] > self.chromosomes[i+1]['params'][parameterName]):
459  minParam = self.chromosomes[i+1]['params'][parameterName]
460  maxParam = self.chromosomes[i]['params'][parameterName]
461  else:
462  minParam = self.chromosomes[i]['params'][parameterName]
463  maxParam = self.chromosomes[i+1]['params'][parameterName]
464  if(self.parameters[parameterName][2] == float): spawn[parameterName] = random_uniform(minParam,maxParam)
465  elif(self.parameters[parameterName][2] == int): spawn[parameterName] = random_randint(minParam,maxParam)
466  self.crossovers.append({'params': spawn, 'score': None, 'code': self.encode(spawn)})
467  except: pass
468  i += 1
469  if(i > 10000): break
470  self.chromosomes += self.crossovers
471  return len(self.crossovers)
472 
473  def rouletteWheelSelection(self, survivorRate=0.6):
474  ''' Kill off a percentage of the population (1-survivorRate) by reverse
475  order of fitness.
476  '''
477  self.orderChromosomes(r=True)
478  totalScore = sum([x['score'] for x in self.chromosomes])
479  if(totalScore==0):
480  self.chromosomes = []
481  return True
482  self.survivors = []
483  i=0
484  while len(self.survivors) < int(len(self.chromosomes) * survivorRate):
485  m=i%len(self.chromosomes)
486  if(self.chromosomes[m] and self.chromosomes[m]['score']/totalScore > random()):
487  self.survivors.append(self.chromosomes[m])
488  self.chromosomes[m] = None
489  i += 1
490  if(i > 10000): break
491  self.chromosomes = self.survivors
492  return len(self.survivors)
493 
494  def mutation(self, mutationRate=0.05):
495  j = 0
496  for i in range(len(self.chromosomes)):
497  for parameterName in self.chromosomes[i]['params']:
498  if(mutationRate > random()):
499  parameterName
500  if(self.parameters[parameterName][2] == float): self.chromosomes[i]['params'][parameterName] = random_uniform(self.parameters[parameterName][0],self.parameters[parameterName][1])
501  elif(self.parameters[parameterName][2] == int): self.chromosomes[i]['params'][parameterName] = random_randint(self.parameters[parameterName][0],self.parameters[parameterName][1])
502  j += 1
503  return j
504 
505 
506 
507 
def combineStdDev(stddevs, means, samples)
Definition: tools_math.py:246
def cantorID(obj)
Definition: tools_math.py:140
def running_sum(a)
Definition: tools_math.py:120
def reduce_sequence(sequence)
Definition: tools_math.py:103
def crossover(self, repopShare=0.6)
Definition: tools_math.py:446
def naturalDeath(self, scoreThreshold=0)
Definition: tools_math.py:431
def local_min_max(lists)
Definition: tools_math.py:28
def getPercintileBinFromList(data, percentile, sampleSize=0)
Definition: tools_math.py:172
def encode(self, spawn)
Definition: tools_math.py:392
def getPercintileBinsFromList(data, bins=10, includeLeftEdge=False, sampleSize=0)
Definition: tools_math.py:185
def decode(self, code)
Definition: tools_math.py:401
def pdfStatsOnBinnedFreqData(data, centiles=[25)
Definition: tools_math.py:254
def getPercentileKeyFromList(items, percentile, sorting=False, sortingColumnIx=0)
Definition: tools_math.py:163
def naturalSelectionStats(self, justification=3)
Definition: tools_math.py:426
def gaus_mvgavg(lists, degree=5)
Filters on data sets.
Definition: tools_math.py:52
def remap_data(datapoints, old_map, new_map)
Constructors and generators.
Definition: tools_math.py:133
def mat_divide(A, scalar)
Basic math.
Definition: tools_math.py:22
def cat_curvy_mvgavg(cat_list, curvy, window=2.0, passes=1)
Definition: tools_math.py:84
def combineVariance(variances, means, samples)
Definition: tools_math.py:238
def __init__(self, parameters, populationSize)
Definition: tools_math.py:344
def combineMean(means, samples)
Definition: tools_math.py:231
def ksTest(df1, df2)
Definition: tools_math.py:156
def count(self, value)
Definition: tools_math.py:369
def append(self, value)
Definition: tools_math.py:367
def rouletteWheelSelection(self, survivorRate=0.6)
Definition: tools_math.py:473
def __getitem__(self, i)
Definition: tools_math.py:365
def cat_mvgavg(cat_list, window=2, passes=1)
Definition: tools_math.py:70
def sample(data, sampleSize=10000, method='random')
Definition: tools_math.py:206
def setTheoryUnionNIndEvents(events, recursionDepthEstimation=10, smallValueLimit=0.0)
Probability theory.
Definition: tools_math.py:280
def pdfToCDF(pdf_y)
Statistics.
Definition: tools_math.py:149
def mutation(self, mutationRate=0.05)
Definition: tools_math.py:494
def orderChromosomes(self, r=True)
Definition: tools_math.py:372
def naturalSelection(self, recipe=0, scoreThreshold=0.2)
Definition: tools_math.py:406
def setScore(self, Ix, value)
Definition: tools_math.py:371