####################### sampler #######################
from collections import Counter

import numpy as np
def sample():
    """Draw one sample [x1, x2, x3, x4, x5] from the synthetic joint distribution.

    x5 is a noisy function of the other bits (three mixture branches), so even
    the Bayes-optimal predictor of x5 has non-trivial error.
    """
    # (x2, x3): encode one draw over 4 outcomes as two bits
    joint = np.random.choice(4, p=[0.3, 0.2, 0.4, 0.1])
    x2, x3 = divmod(joint, 2)
    # x1 copies (not x3) with prob 0.9, otherwise copies x2
    if np.random.choice(2, p=[0.9, 0.1]) == 0:
        x1 = int(not x3)
    else:
        x1 = x2
    x4 = np.random.choice(2, p=[0.6, 0.4])
    # x5 branch: 7% forced zero, 50% xor of negations, 43% parity of x1..x3
    branch = np.random.choice(3, p=[0.07, 0.5, 0.43])
    if branch == 0:
        x5 = 0
    elif branch == 1:
        x5 = int((not x4) ^ (not x2))
    else:
        x5 = int(bool(x1) ^ bool(x2) ^ bool(x3))
    return [x1, x2, x3, x4, x5]
def fresh(n):
    """Return n freshly drawn samples as a tuple of tuples."""
    return tuple(tuple(sample()) for _ in range(n))
def polish(X):
    """Strip the trailing label from each length-5 row; pass other rows through unchanged."""
    return tuple(row[:-1] if len(row) == 5 else row for row in X)
def add_label(x, label):
    """Return x as a tuple with label appended as the final element."""
    return tuple(x) + (label,)
####################### FILE IO #######################
def write():
    """Write 10,000 comma-separated samples to data.txt, one per line.

    No trailing newline is written (matching what read() expects).
    Uses a context manager so the file is closed even if sample() raises.
    """
    n_samples = 10000
    with open("data.txt", "w") as f:
        # join once instead of branching on "last line?" inside the loop
        rows = (",".join(str(v) for v in sample()) for _ in range(n_samples))
        f.write("\n".join(rows))
def read(filename):
    """Read a comma-separated integer data file into a tuple of int tuples.

    Blank lines (e.g. a trailing newline left by an editor) are skipped,
    so files with or without a final newline parse identically; the original
    raised ValueError on int("") in that case.
    """
    with open(filename, "r") as f:
        lines = f.read().split("\n")
    return tuple(
        tuple(int(field) for field in line.split(","))
        for line in lines
        if line  # skip empty lines
    )
def write_test():
    """Write 10 feature-only rows to test.txt and the full labeled rows to _sol.txt.

    Rows are newline-separated with no trailing newline. Context managers
    guarantee both files close even if sample() raises.
    """
    n_rows = 10
    with open("test.txt", "w") as f_test, open("_sol.txt", "w") as f_sol:
        for i in range(n_rows):
            row = ",".join(str(v) for v in sample())
            # Drop the final ",<label>" field; the original sliced a fixed
            # two characters (row[:-2]), which only works for 1-char labels.
            features = row.rsplit(",", 1)[0]
            sep = "\n" if i < n_rows - 1 else ""
            f_test.write(features + sep)
            f_sol.write(row + sep)
####################### MLE #######################
def mle(X):
    """Maximum-likelihood estimate: map each distinct row of X to its empirical frequency.

    Counts occurrences first and divides once, instead of repeatedly adding
    1.0/len(X) — the repeated float addition accumulates rounding error
    (e.g. three adds of 1/3 need not sum to exactly 3/3).
    Returns {} for empty input.
    """
    n = len(X)
    return {x: count / n for x, count in Counter(X).items()}
def prob(d, x, label):
    """Estimated probability of feature tuple x with the given label under d (0.0 if unseen)."""
    key = tuple(x) + (label,)
    return d.get(key, 0.0)
def mle_predict(d, x):
    """Predict the label (0 or 1) with the higher estimated probability; ties go to 0."""
    p0 = d.get(tuple(x) + (0,), 0.0)
    p1 = d.get(tuple(x) + (1,), 0.0)
    return int(p1 > p0)
def bayes_predict(d, x):
    """Bayes prediction of x5 from features x = (x1, x2, x3, x4).

    Mirrors the generating mixture in sample(): x5 = 0 with prob 0.07,
    (not x4) ^ (not x2) with prob 0.5, x1 ^ x2 ^ x3 with prob 0.43.
    The estimate d is unused; the parameter is kept so the signature
    matches mle_predict for use as a `predictor` callback.
    """
    p = [0.07, 0.0]  # [P(x5=0), P(x5=1)]; the 7% branch always yields 0
    outcome = int((not x[3]) ^ (not x[1]))
    p[outcome] += 0.5
    outcome = int(bool(x[0]) ^ bool(x[1]) ^ bool(x[2]))
    p[outcome] += 0.43
    # Sanity check with a tolerance: the original exact comparison
    # (p[0] + p[1] == 1.0) can fail purely from float rounding even though
    # the branch weights sum to 1.
    assert abs(p[0] + p[1] - 1.0) < 1e-9
    return 0 if p[0] > p[1] else 1
def test(d, X, predictor):
    """Run predictor(d, row) on each polished row of X; return a list of (row, label) pairs."""
    features = polish(X)
    return [(row, predictor(d, row)) for row in features]
def empirical_error(X, X_hat):
    """Fraction of rows whose last entry (the label) differs between X and X_hat."""
    truth = np.array([row[-1] for row in X])
    predicted = np.array([row[-1] for row in X_hat])
    return np.sum(truth != predicted) / len(X_hat)
# ---- experiment driver: compare the MLE predictor against the Bayes rule ----
from pprint import pprint

X = read("data.txt")
X_train = X[:4000]
# NOTE(review): rows 4000-7999 are never used; confirm this split is intentional
X_validate = X[8000:]
X_1 = fresh(100)
X_2 = fresh(1000)

# Fit the MLE on only 100 fresh samples.
d = mle(X_1)

# MLE predictor error on its own fit set, the stored train split, and fresh data.
print(empirical_error(X_1, test(d, X_1, mle_predict)))
print(empirical_error(X_train, test(d, X_train, mle_predict)))
print(empirical_error(X_2, test(d, X_2, mle_predict)))
# The Bayes rule ignores d, serving as the optimal baseline on the same sets.
print(empirical_error(X_1, test(d, X_1, bayes_predict)))
print(empirical_error(X_train, test(d, X_train, bayes_predict)))
print(empirical_error(X_2, test(d, X_2, bayes_predict)))

# Extreme-overfit demo: MLE fit on a single sample, evaluated on 1000 fresh rows.
X = fresh(1)
X_test = fresh(1000)
d = mle(X)
pprint(empirical_error(X_test, test(d, X_test, mle_predict)))