####################### sampler #######################
from collections import Counter

import numpy as np
def sample():
    """Draw one sample [x1, x2, x3, x4, x5] from the synthetic joint distribution.

    x5 is a noisy function of the other bits (three mixture branches), so even
    the Bayes-optimal predictor of x5 has non-trivial error.
    """
    # (x2, x3): encode one draw over 4 outcomes as two bits
    joint = np.random.choice(4, p=[0.3, 0.2, 0.4, 0.1])
    x2, x3 = divmod(joint, 2)
    # x1 copies (not x3) with prob 0.9, otherwise copies x2
    if np.random.choice(2, p=[0.9, 0.1]) == 0:
        x1 = int(not x3)
    else:
        x1 = x2
    x4 = np.random.choice(2, p=[0.6, 0.4])
    # x5 branch: 7% forced zero, 50% xor of negations, 43% parity of x1..x3
    branch = np.random.choice(3, p=[0.07, 0.5, 0.43])
    if branch == 0:
        x5 = 0
    elif branch == 1:
        x5 = int((not x4) ^ (not x2))
    else:
        x5 = int(bool(x1) ^ bool(x2) ^ bool(x3))
    return [x1, x2, x3, x4, x5]
def fresh(n):
    """Return n freshly drawn samples as a tuple of tuples."""
    return tuple(tuple(sample()) for _ in range(n))
def polish(X):
    """Strip the trailing label from each length-5 row; pass other rows through unchanged."""
    return tuple(row[:-1] if len(row) == 5 else row for row in X)
def add_label(x, label):
    """Return x as a tuple with label appended as the final element."""
    return tuple(x) + (label,)
####################### FILE IO #######################
def write():
    """Write 10,000 comma-separated samples to data.txt, one per line.

    No trailing newline is written (matching what read() expects).
    Uses a context manager so the file is closed even if sample() raises.
    """
    n_samples = 10000
    with open("data.txt", "w") as f:
        # join once instead of branching on "last line?" inside the loop
        rows = (",".join(str(v) for v in sample()) for _ in range(n_samples))
        f.write("\n".join(rows))
def read(filename):
    """Read a comma-separated integer data file into a tuple of int tuples.

    Blank lines (e.g. a trailing newline left by an editor) are skipped,
    so files with or without a final newline parse identically; the original
    raised ValueError on int("") in that case.
    """
    with open(filename, "r") as f:
        lines = f.read().split("\n")
    return tuple(
        tuple(int(field) for field in line.split(","))
        for line in lines
        if line  # skip empty lines
    )
def write_test():
    """Write 10 feature-only rows to test.txt and the full labeled rows to _sol.txt.

    Rows are newline-separated with no trailing newline. Context managers
    guarantee both files close even if sample() raises.
    """
    n_rows = 10
    with open("test.txt", "w") as f_test, open("_sol.txt", "w") as f_sol:
        for i in range(n_rows):
            row = ",".join(str(v) for v in sample())
            # Drop the final ",<label>" field; the original sliced a fixed
            # two characters (row[:-2]), which only works for 1-char labels.
            features = row.rsplit(",", 1)[0]
            sep = "\n" if i < n_rows - 1 else ""
            f_test.write(features + sep)
            f_sol.write(row + sep)
####################### MLE #######################
def mle(X):
    """Maximum-likelihood estimate: map each distinct row of X to its empirical frequency.

    Counts occurrences first and divides once, instead of repeatedly adding
    1.0/len(X) — the repeated float addition accumulates rounding error
    (e.g. three adds of 1/3 need not sum to exactly 3/3).
    Returns {} for empty input.
    """
    n = len(X)
    return {x: count / n for x, count in Counter(X).items()}
def prob(d, x, label):
    """Estimated probability of feature tuple x with the given label under d (0.0 if unseen)."""
    key = tuple(x) + (label,)
    return d.get(key, 0.0)
def mle_predict(d, x):
    """Predict the label (0 or 1) with the higher estimated probability; ties go to 0."""
    p0 = d.get(tuple(x) + (0,), 0.0)
    p1 = d.get(tuple(x) + (1,), 0.0)
    return int(p1 > p0)
def bayes_predict(d, x):
    """Bayes prediction of x5 from features x = (x1, x2, x3, x4).

    Mirrors the generating mixture in sample(): x5 = 0 with prob 0.07,
    (not x4) ^ (not x2) with prob 0.5, x1 ^ x2 ^ x3 with prob 0.43.
    The estimate d is unused; the parameter is kept so the signature
    matches mle_predict for use as a `predictor` callback.
    """
    p = [0.07, 0.0]  # [P(x5=0), P(x5=1)]; the 7% branch always yields 0
    outcome = int((not x[3]) ^ (not x[1]))
    p[outcome] += 0.5
    outcome = int(bool(x[0]) ^ bool(x[1]) ^ bool(x[2]))
    p[outcome] += 0.43
    # Sanity check with a tolerance: the original exact comparison
    # (p[0] + p[1] == 1.0) can fail purely from float rounding even though
    # the branch weights sum to 1.
    assert abs(p[0] + p[1] - 1.0) < 1e-9
    return 0 if p[0] > p[1] else 1
def test(d, X, predictor):
    """Run predictor(d, row) on each polished row of X; return a list of (row, label) pairs."""
    features = polish(X)
    return [(row, predictor(d, row)) for row in features]
def empirical_error(X, X_hat):
    """Fraction of rows whose last entry (the label) differs between X and X_hat."""
    truth = np.array([row[-1] for row in X])
    predicted = np.array([row[-1] for row in X_hat])
    return np.sum(truth != predicted) / len(X_hat)
# ---- experiment driver: compare the MLE predictor against the Bayes rule ----
from pprint import pprint

X = read("data.txt")
X_train = X[:4000]
# NOTE(review): rows 4000-7999 are never used; confirm this split is intentional
X_validate = X[8000:]
X_1 = fresh(100)
X_2 = fresh(1000)

# Fit the MLE on only 100 fresh samples.
d = mle(X_1)

# MLE predictor error on its own fit set, the stored train split, and fresh data.
print(empirical_error(X_1, test(d, X_1, mle_predict)))
print(empirical_error(X_train, test(d, X_train, mle_predict)))
print(empirical_error(X_2, test(d, X_2, mle_predict)))
# The Bayes rule ignores d, serving as the optimal baseline on the same sets.
print(empirical_error(X_1, test(d, X_1, bayes_predict)))
print(empirical_error(X_train, test(d, X_train, bayes_predict)))
print(empirical_error(X_2, test(d, X_2, bayes_predict)))

# Extreme-overfit demo: MLE fit on a single sample, evaluated on 1000 fresh rows.
X = fresh(1)
X_test = fresh(1000)
d = mle(X)
pprint(empirical_error(X_test, test(d, X_test, mle_predict)))