'''
Implementation of substring expected time to occurrence and precedence probabilities,
according to `(Li 1980) <http://projecteuclid.org/DPubS?service=UI&version=1.0&verb=Display&handle=euclid.aop/1176994578>`_.

Copyright 2013, `Konstantin Tretyakov <http://fouryears.eu/2013/02/24/the-substring-paradox/>`_

License: MIT License.
'''

def li_product(a, b, alphabet_size=2):
    "(Li 1980), Definition 2.2."
    
    delta = lambda i,j: alphabet_size if a[i] == b[j] else 0
    m = len(a)
    mul_with_step = lambda k: prod([delta(i+k, i)  for i in range(0, m-k)])
    return sum([mul_with_step(k) for k in range(0, m)])

def expected_time_to_occurrence(a, alphabet_size=2):
    "(Li 1980), Lemma 2.4."
    
    return li_product(a, a, alphabet_size)

def probability_to_precede(b, a, n=2):
    "(Li 1980), Corollary 3.2."
    
    o = float(li_product(a,a,n) - li_product(a,b,n))/(li_product(b,b,n) - li_product(b,a,n))
    return o / (1+o)

def conway_leading_number(a,b):
    "(Li 1980), Equation (4.1)."
    
    e = lambda i: int(a[(-i):] == b[0:i])
    return sum([e(i)*(2**(i-1)) for i in range(1,len(b)+1)])

def conway_probability_to_precede(b,a):
    "(Li 1980), Theorem 4.1."
    
    o = float(conway_leading_number(a,a) - conway_leading_number(a,b))/(conway_leading_number(b,b) - conway_leading_number(b,a))
    return o / (1+o)

def empirical_time_to_occurrence(A, B, l=500, n=10000, alphabet=['0','1']):
    "Empirical measurement of expected time to occurrence and probability to precede."
    
    def test():
        seq = ''.join([alphabet[int(x>0.5)] for x in rand(l)]) # Random string 
        return (seq.index(A), seq.index(B))  # First position of string A, first position of string B
    tests = [test() for i in range(n)]
    return (mean([a for (a,b) in tests]) + len(A), mean([b for (a,b) in tests]) + len(B), mean([int(a < b) for (a,b) in tests]))

def stats(a, b, n=1000):
    (t_a, t_b, p) = empirical_time_to_occurrence(a, b, l=500, n=n, alphabet=sorted(set(a + b)))
    print "A = %s\tB = %s" % (a, b)
    print "T(A),     analytical|empirical    = %0.6f | %0.6f" % (expected_time_to_occurrence(a), t_a)
    print "T(B),     analytical|empirical    = %0.6f | %0.6f" % (expected_time_to_occurrence(b), t_b)
    print "P(A < B), Li1980|Conway|empirical = %0.6f | %0.6f | %0.6f" % (probability_to_precede(a, b),conway_probability_to_precede(a, b), p)

# First example
print "------ First example --------"
stats('HHT', 'HHH')

print "------ Second example --------"
# Find pairs of 4 coin flips which are "illogical"
from itertools import product
all_sequences = [''.join(p) for p in product(['H','T'], repeat=4)]
all_pairs = [(a,b) for (a,b) in product(all_sequences, all_sequences) \
            if (expected_time_to_occurrence(a) > expected_time_to_occurrence(b)) and (probability_to_precede(a,b) > 0.5)]
for (a, b) in all_pairs:
    print a, b, expected_time_to_occurrence(a), expected_time_to_occurrence(b), probability_to_precede(a, b)

print "------ Third example --------" 
s = ['THH', 'HHT', 'HTT', 'TTH']
for i in range(len(s)):
    stats(s[i-1], s[i], n=100)
