import math
def load_dataset():
    """Load the tab-separated e-mail corpus from the file ``dataset``.

    Each line is expected to look like ``label<TAB>email text``. The e-mail
    text is lowercased, punctuation is turned into spaces, and the result is
    split on single spaces (empty tokens are kept, matching the original
    tokenization).

    Returns:
        list[list]: entries of the form ``[words, label]`` where ``words``
        is a list of token strings and ``label`` is the class string.
    """
    # One translation table instead of seven chained .replace() calls:
    # newline is deleted, each punctuation mark becomes a space.
    table = str.maketrans({"\n": None, ".": " ", ",": " ", "?": " ",
                           "!": " ", "`": " ", "'": " "})
    dataset = []
    with open("dataset", "r", encoding="utf-8") as f:
        for line in f:
            try:
                # Exactly one tab separates label from body; anything else
                # (blank line, extra tabs) is a malformed record.
                label, email = line.split("\t")
            except ValueError:
                # Skip malformed lines only — the original bare `except: pass`
                # silently hid every possible error.
                continue
            words = email.translate(table).lower().split(" ")
            dataset.append([words, label])
    return dataset
# Load the whole corpus once at import time; the first record is printed
# as a quick sanity check of the tokenization.
dataset = load_dataset()
print(dataset[0])
def compute_class_probabilities(dataset):
    """Estimate the class prior p(y) by maximum likelihood.

    Args:
        dataset: list of ``[words, label]`` pairs.

    Returns:
        tuple: ``(class_counts, class_probabilities)`` where
        ``class_counts[y]`` is the number of examples with label ``y`` and
        ``class_probabilities[y]`` is the MLE ``class_counts[y] / len(dataset)``.
    """
    class_probabilities = {}
    class_counts = {}
    for _, label in dataset:
        class_counts[label] = class_counts.get(label, 0) + 1
    total = len(dataset)
    for label, count in class_counts.items():
        # MLE for p(y): relative frequency of the label.
        class_probabilities[label] = count / total
    return class_counts, class_probabilities
# Fit the class priors on the first 4000 examples (the training split).
class_counts, class_probabilities = compute_class_probabilities(dataset[:4000])
print(class_probabilities)
def build_vocabulary(dataset):
    """Collect every distinct word appearing in the dataset.

    Args:
        dataset: list of ``[words, label]`` pairs.

    Returns:
        list[str]: each word once, in first-seen order.
    """
    vocabulary = []
    seen = set()  # O(1) membership test; a list lookup would be O(n) per word
    for words, _ in dataset:
        for word in words:
            if word not in seen:
                seen.add(word)
                vocabulary.append(word)
    return vocabulary
vocabulary = build_vocabulary(dataset[:4000])
def compute_word_probabilities(dataset, vocabulary, class_counts):
    """Estimate the per-class word distributions p(word|y) by MLE.

    Args:
        dataset: list of ``[words, label]`` pairs.
        vocabulary: list of words to count (others are ignored).
        class_counts: dict of label -> example count; its keys define the classes.

    Returns:
        tuple: ``(word_counts, word_probabilities)`` where
        ``word_counts[y][word]`` is the total number of occurrences of ``word``
        in class-``y`` e-mails and ``word_probabilities[y][word]`` is the MLE
        ``count / total word occurrences in class y`` (multinomial model,
        no smoothing — unseen words get probability 0.0).
    """
    # Initialise every (class, word) cell to 0 so all vocabulary words
    # appear in the output even when never observed.
    word_counts = {y: {word: 0 for word in vocabulary} for y in class_counts}
    word_probabilities = {}
    vocab = set(vocabulary)  # hoisted: O(1) membership inside the loop
    for words, label in dataset:
        counts = word_counts[label]
        for word in words:
            if word in vocab:
                counts[word] += 1
    for y, counts in word_counts.items():
        total = sum(counts.values())
        # Guard total == 0 (a class with no counted words) to avoid ZeroDivisionError.
        word_probabilities[y] = {
            word: (count / total if total else 0.0)
            for word, count in counts.items()
        }
    return word_counts, word_probabilities
# Fit the per-class word distributions on the same 4000-example training split.
word_counts, word_probabilities = compute_word_probabilities(dataset[:4000], vocabulary, class_counts)
print(word_probabilities)
def classify(email, vocabulary, class_probabilities, word_probabilities):
    """Score an e-mail under each class and predict the most likely one.

    Args:
        email: list of token strings.
        vocabulary: list of known words; out-of-vocabulary tokens are skipped.
        class_probabilities: dict of label -> prior p(y).
        word_probabilities: dict of label -> {word: p(word|y)}.

    Returns:
        tuple: ``(posterior_logprobs, prediction)`` where
        ``posterior_logprobs[y] = log p(y) + sum_w log p(w|y)`` (the joint
        log-probability) and ``prediction = argmax_y`` of that score.
    """
    posterior_logprobs = {}
    vocab = set(vocabulary)  # hoisted: O(1) membership per token
    for label, prior in class_probabilities.items():
        score = math.log(prior)
        for word in email:
            if word in vocab:
                p = word_probabilities[label].get(word, 0.0)
                # MLE probabilities can be exactly 0; math.log(0) raises,
                # so map it to -inf (joint probability 0) instead.
                score += math.log(p) if p > 0.0 else float("-inf")
        posterior_logprobs[label] = score
    prediction = max(posterior_logprobs, key=posterior_logprobs.get)
    # BUG FIX: the original returned the undefined name `posterior_probs`.
    return posterior_logprobs, prediction
# Smoke-test the classifier on the first training e-mail.
posterior_probs, prediction = classify(dataset[0][0], vocabulary, class_probabilities, word_probabilities)
print(dataset[0][0], posterior_probs, prediction)
def evaluate_model(dataset, vocabulary, class_probabilities, word_probabilities):
    """Compute classification accuracy of `classify` over a dataset.

    Args:
        dataset: list of ``[words, label]`` pairs.
        vocabulary: list of known words, forwarded to ``classify``.
        class_probabilities: dict of label -> prior p(y).
        word_probabilities: dict of label -> {word: p(word|y)}.

    Returns:
        float: fraction of e-mails whose prediction matches the label
        (0.0 for an empty dataset).
    """
    num_correct = 0
    for words, label in dataset:
        _, prediction = classify(words, vocabulary, class_probabilities,
                                 word_probabilities)
        if prediction == label:
            num_correct += 1
    # Guard the empty split: the original would raise ZeroDivisionError.
    return num_correct / len(dataset) if dataset else 0.0
# Accuracy on the training split (first 4000 examples) ...
accuracy = evaluate_model(dataset[:4000], vocabulary, class_probabilities, word_probabilities)
print(accuracy)
# ... and on the held-out split (everything after example 4000).
accuracy = evaluate_model(dataset[4000:], vocabulary, class_probabilities, word_probabilities)
print(accuracy)
# The original printed `most_spam`, `most_ham` and `most_spam_ratio` without
# ever defining them (NameError). Build all three here, sorted ascending so
# the last 50 entries are the most indicative words.

# Words sorted by p(word|spam); the tail is the most spam-typical vocabulary.
most_spam = sorted(vocabulary, key=lambda w: word_probabilities["spam"].get(w, 0.0))
print(most_spam[-50:])

# Words sorted by p(word|ham), i.e. p(word|not spam).
most_ham = sorted(vocabulary, key=lambda w: word_probabilities["ham"].get(w, 0.0))
print(most_ham[-50:])

def _spam_ham_ratio(word):
    """Sort key: p(word|spam) / p(word|ham), with 0-denominator handling."""
    p_spam = word_probabilities["spam"].get(word, 0.0)
    p_ham = word_probabilities["ham"].get(word, 0.0)
    if p_ham == 0.0:
        # Seen only in spam -> infinitely spammy; seen in neither -> neutral 0.
        return float("inf") if p_spam > 0.0 else 0.0
    return p_spam / p_ham

# Words sorted by how much more likely they are under spam than under ham.
most_spam_ratio = sorted(vocabulary, key=_spam_ham_ratio)
print(most_spam_ratio[-50:])