Text K-Nearest Neighbor (KNN) using Euclidean distance
Cheers and Happy coding.
import cPickle
import re
from math import sqrt
class Words_Works():
def __init__(self):
self.all_texts = {}
self.categories = {}
self.knn_results = {}
self.leaf_words = ['s', 'es', 'ed', 'er', 'ly', 'ing']
self.stop_words = ['a', 'i', 'it', 'am', 'at', 'on', 'in', 'of', 'to', 'is', 'so', 'too', 'my', 'the', 'and', 'but', 'are', 'very', 'here', 'even', 'from', 'them', 'then', 'than', 'this', 'that', 'though']
def load_categories(self):
try:
cat_db = open('categories.pkl', 'rb')
self.categories = cPickle.load(cat_db)
cat_db.close()
except:
print 'Load of categories file failed'
def add_category(self, f, cat_name):
f_in = open(f)
self.text = f_in.read().lower()
f_in.close()
self.wordify()
self.unstopify()
self.unleafify()
self.categories[cat_name] = {}
for item in self.unleaf:
if self.categories[cat_name].has_key(item):
self.categories[cat_name][item] += 1
else:
self.categories[cat_name][item] = 1
def save_categories(self):
cat_db = open('categories.pkl', 'wb')
cPickle.dump(self.categories, cat_db, -1)
cat_db.close()
def add_text(self, f):
    """Build the word-count table for a text file to be compared.

    The file is read, lower-cased, and run through wordify(),
    unstopify(), unleafify() and indexify(); the words left in
    ``self.unleaf`` are then counted.

    Args:
        f: Path of the text file. The same value is used as the key in
            ``self.all_texts``; an existing entry for it is replaced.

    Side effects:
        Overwrites ``self.text`` and, through the helper methods,
        ``self.words``, ``self.unstop`` and ``self.unleaf``.
    """
    # ``with`` closes the file even if read() raises.
    with open(f) as f_in:
        self.text = f_in.read().lower()
    self.wordify()
    self.unstopify()
    self.unleafify()
    # NOTE(review): indexify() is not defined in the part of the file
    # reviewed here; it is kept at the same point in the sequence,
    # before the counts are stored. Confirm what it expects.
    self.indexify()
    # Count into a fresh dict so re-adding a file starts from zero.
    counts = {}
    for item in self.unleaf:
        counts[item] = counts.get(item, 0) + 1
    self.all_texts[f] = counts
def wordify(self):
words_pat = re.compile('\\w+')
self.words = words_pat.findall(self.text)
def unstopify(self):
self.unstop = [item for item in self.words if item not in self.stop_words]
def unleafify(self):
self.unleaf = self.unstop[:]
for leaf in self.leaf_words:
leaf_len = len(leaf)
leaf_pat = re.compile('%s$' % leaf)
for i in range(len(self.unleaf)):
if leaf_pat.findall(self.unleaf[i]):
self.unleaf[i] = self.unleaf[i][:-leaf_len]
def knn_calc(self):
for text in self.all_texts.keys():
self.knn_results[text] = {}
for category in self.categories.keys():
self.knn_results[text][category] = {}
iterations = 0
distance = 0
for word in self.all_texts[text].keys():
if word in self.categories[category].keys():
distance += (self.all_texts[text][word] - self.categories[category][word]) ** 2
iterations += 1 …