Approach to the implementation of K-Nearest Neighbor (KNN) using the Euclidean distance.

Sample Usage:

``````mywork = Words_Works()

lit = 'literature.txt'
comp = 'computers.txt'
phy = 'physics.txt'
# saving categories dictionary to file

print mywork.categories                        # prints categories dictionary
print

txts = ('sample1.txt', 'sample2.txt')          # creating list of files to add

for text in txts:

print mywork.all_texts                         # prints files word occurrence count
print

mywork.knn_calc()                              # perform calculation

print mywork.knn_results                       # print overall results
print

mywork.knn()                                   # print files results``````

Cheers and Happy coding!

391 Views
``````import cPickle
import re
from math import sqrt

class Words_Works():
# Naive KNN text classifier: learns per-category word counts and compares
# texts to them by Euclidean distance.
# NOTE(review): the extraction stripped all indentation from this listing;
# code lines are kept verbatim and only comments are added.

def __init__(self):
# {filename: {word: count}} for every analysed text
self.all_texts = {}
# {category: {word: count}} learned from sample files
self.categories = {}
# {filename: {category: {'KNN distance': ..., 'KNN iterations': ...}}}
self.knn_results = {}
# suffixes chopped by unleafify() -- a very naive stemmer
self.leaf_words = ['s', 'es', 'ed', 'er', 'ly', 'ing']
self.stop_words = ['a', 'i', 'it', 'am', 'at', 'on', 'in', 'of', 'to', 'is', 'so', 'too', 'my', 'the', 'and', 'but', 'are', 'very', 'here', 'even', 'from', 'them', 'then', 'than', 'this', 'that', 'though']

# NOTE(review): a method header was lost in extraction here -- the lines
# below read a file `f` and learn its word counts under `cat_name`,
# presumably something like `def learn(self, f, cat_name):`. A line of the
# form `self.text = f_in.read()` also appears to be missing between open()
# and close(). TODO: confirm against the original post.
f_in = open(f)
f_in.close()
self.wordify()
self.unstopify()
self.unleafify()
self.categories[cat_name] = {}
for item in self.unleaf:
# dict.has_key() is Python-2-only; `item in dict` works in 2 and 3
if self.categories[cat_name].has_key(item):
self.categories[cat_name][item] += 1
else:
self.categories[cat_name][item] = 1

# NOTE(review): the try body lost its load line in extraction -- presumably
# `self.categories = cPickle.load(cat_db)` between open() and close(), with
# the bare `except:` (body also lost) falling back to an empty dict. A bare
# except hides real errors; `except IOError:` is what is presumably meant.
try:
cat_db = open('categories.pkl', 'rb')
cat_db.close()
except:

def save_categories(self):
# Persist the learned categories to disk with the highest pickle
# protocol available (-1).
cat_db = open('categories.pkl', 'wb')
cPickle.dump(self.categories, cat_db, -1)
cat_db.close()

# NOTE(review): another method header lost in extraction -- these lines add
# one file's word counts to self.all_texts, presumably
# `def add_text(self, f):`; the `self.text = f_in.read()` line is missing
# here too. TODO: confirm against the original post.
f_in = open(f)
f_in.close()
self.wordify()
self.unstopify()
self.unleafify()
# NOTE(review): indexify() is not defined anywhere in this listing -- verify.
self.indexify()
self.all_texts[f] = {}
for item in self.unleaf:
# Python-2-only dict.has_key(); `item in dict` is the portable spelling
if self.all_texts[f].has_key(item):
self.all_texts[f][item] += 1
else:
self.all_texts[f][item] = 1

def wordify(self):
    """Tokenise self.text into self.words as maximal runs of word characters."""
    self.words = re.findall('\\w+', self.text)

def unstopify(self):
    """Copy self.words into self.unstop, skipping entries in self.stop_words."""
    kept = []
    for word in self.words:
        if word not in self.stop_words:
            kept.append(word)
    self.unstop = kept

def unleafify(self):
    """Naively stem self.unstop into self.unleaf by chopping known suffixes.

    Bug fix: the original `re.compile('%s\$' % leaf)` contains `\$`, which
    matches a literal '$' character -- impossible inside a `\w+` token -- so
    findall() never matched and no suffix was ever stripped.  The intended
    end-of-string anchor is expressed with str.endswith() instead.
    """
    self.unleaf = self.unstop[:]
    for leaf in self.leaf_words:
        leaf_len = len(leaf)
        for i in range(len(self.unleaf)):
            # chop the suffix only when the word actually ends with it
            if self.unleaf[i].endswith(leaf):
                self.unleaf[i] = self.unleaf[i][:-leaf_len]

def knn_calc(self):
    """Fill self.knn_results with the Euclidean distance between every text
    and every category, computed over the words the two share.

    Also records 'KNN iterations' -- how many shared words were compared.

    Improvement: membership was tested with `word in dict.keys()`, which in
    Python 2 builds a list and scans it (O(n) per word); testing the dict
    directly is O(1) and quadratic behaviour disappears.
    """
    for text in self.all_texts:
        self.knn_results[text] = {}
        text_counts = self.all_texts[text]
        for category in self.categories:
            self.knn_results[text][category] = {}
            iterations = 0    # shared words actually compared
            distance = 0
            cat_counts = self.categories[category]
            for word in text_counts:
                if word in cat_counts:
                    distance += (text_counts[word] - cat_counts[word]) ** 2
                    iterations += 1
            distance = sqrt(distance)
            self.knn_results[text][category]['KNN distance'] = distance
            self.knn_results[text][category]['KNN iterations'] = iterations

def knn(self):
for text in self.all_texts.keys():
result = None
for category in self.categories.keys():
if not result or self.knn_results[text][category]['KNN distance'] < result:
knn = category
distance = self.knn_results[text][category]['KNN distance']
iterations = self.knn_results[text][category]['KNN iterations']
print 'File:', text
print 'KNN:', category
print 'Distance:', distance
print 'Iterations:', iterations
print``````

-Fixed bug on the knn display.
-Improved calculation accuracy.
-Auto save and auto load of categories

``````import re
from math import sqrt
try:
import cPickle as pickle
except:
import pickle

class Words_Works():
# Revised KNN text classifier: auto-loads saved categories, learns word
# counts per category and compares texts by full Euclidean distance.
# NOTE(review): the extraction stripped all indentation from this listing;
# code lines are kept verbatim and only comments are added.

def __init__(self):
# {filename: {word: count}} for every analysed text
self.all_texts = {}
# {category: {word: count}} learned from sample files
self.categories = {}
# {filename: {category: {'KNN distance': ...}}}
self.knn_results = {}
# suffixes chopped by __unleafify() -- a very naive stemmer
self.leaf_words = ['s', 'es', 'ed', 'er', 'ly', 'ing']
self.stop_words = ['a', 'i', 'it', 'am', 'at', 'on', 'in', 'of', 'to', 'is', 'so', 'too', 'my', 'the', 'and', 'but', 'are', 'very', 'here', 'even', 'from', 'them', 'then', 'than', 'this', 'that', 'though']

# NOTE(review): the auto-load line -- presumably
# `self.categories = pickle.load(cat_db)` -- was lost between open() and
# close(), so as shown the keys of a still-empty dict are printed.  The
# body of the bare `except:` is missing as well.  TODO: confirm against
# the original post.
try:
cat_db = open('categories.pkl', 'rb')
cat_db.close()
categories = list(self.categories.keys())
print '%s categories loaded:' % len(self.categories.keys())
print ', '.join(self.categories.keys())
print
except:

# NOTE(review): a method header was lost in extraction here -- these lines
# learn one category from file `f`, presumably
# `def add_category(self, f, cat_name):`; a `self.text = f_in.read()` line
# appears to be missing too.
f_in = open(f)
f_in.close()
self.__wordify()
self.__unstopify()
self.__unleafify()
self.categories[cat_name] = {}
for item in self.unleaf:
# Python-2-only dict.has_key(); `item in dict` works in 2 and 3
if self.categories[cat_name].has_key(item):
self.categories[cat_name][item] += 1
else:
self.categories[cat_name][item] = 1
self.__save_categories()

def del_category(self, cat_name):
    """Remove a learned category and persist the change.

    cat_name -- key into self.categories; unknown names are silently
    ignored (no category deleted, nothing saved).

    Improvement: membership was tested against `self.categories.keys()`,
    which in Python 2 builds and scans a list; testing the dict directly
    is O(1) and works identically.
    """
    if cat_name in self.categories:
        del self.categories[cat_name]
        self.__save_categories()

def __save_categories(self):
    """Persist self.categories to 'categories.pkl' (binary, protocol -1).

    Fix: the file handle leaked if pickle.dump() raised; the `with`
    statement guarantees the handle is closed on every path.
    """
    with open('categories.pkl', 'wb') as cat_db:
        pickle.dump(self.categories, cat_db, -1)

# NOTE(review): another method header lost in extraction -- these lines add
# one file's word counts to self.all_texts, presumably
# `def add_text(self, f):`; the `self.text = f_in.read()` line is missing
# here too.  TODO: confirm against the original post.
f_in = open(f)
f_in.close()
self.__wordify()
self.__unstopify()
self.__unleafify()
self.all_texts[f] = {}
for item in self.unleaf:
# Python-2-only dict.has_key(); `item in dict` is the portable spelling
if self.all_texts[f].has_key(item):
self.all_texts[f][item] += 1
else:
self.all_texts[f][item] = 1

def __wordify(self):
    """Tokenise self.text into self.words as maximal runs of word characters."""
    self.words = re.findall('\\w+', self.text)

def __unstopify(self):
    """Copy self.words into self.unstop, skipping entries in self.stop_words."""
    kept = []
    for word in self.words:
        if word not in self.stop_words:
            kept.append(word)
    self.unstop = kept

def __unleafify(self):
    """Naively stem self.unstop into self.unleaf by chopping known suffixes.

    Bug fix: the original `re.compile('%s\$' % leaf)` contains `\$`, which
    matches a literal '$' character -- impossible inside a `\w+` token -- so
    findall() never matched and no suffix was ever stripped.  The intended
    end-of-string anchor is expressed with str.endswith() instead.
    """
    self.unleaf = self.unstop[:]
    for leaf in self.leaf_words:
        leaf_len = len(leaf)
        for i in range(len(self.unleaf)):
            # chop the suffix only when the word actually ends with it
            if self.unleaf[i].endswith(leaf):
                self.unleaf[i] = self.unleaf[i][:-leaf_len]

def knn_calc(self):
    """Compute the full Euclidean distance between every text and category.

    Words the category lacks contribute the text's full count (the category
    count is implicitly zero).  Results go in
    self.knn_results[text][category]['KNN distance']; finally the summary
    printer self.knn() is invoked.

    Improvements: `word in dict.keys()` built and scanned a list per lookup
    (O(n) in Python 2) -- test the dict directly; the no-match branch's
    `(count - 0) ** 2` is simplified to `count ** 2`.
    """
    for text in self.all_texts:
        self.knn_results[text] = {}
        text_counts = self.all_texts[text]
        for category in self.categories:
            self.knn_results[text][category] = {}
            distance = 0
            cat_counts = self.categories[category]
            for word in text_counts:
                if word in cat_counts:
                    distance += (text_counts[word] - cat_counts[word]) ** 2
                else:
                    # word absent from the category: its count there is zero
                    distance += text_counts[word] ** 2
            self.knn_results[text][category]['KNN distance'] = sqrt(distance)
    # NOTE(review): indentation was lost in extraction; calling the display
    # once, after all distances are ready, is the most plausible reading --
    # confirm against the original post.
    self.knn()

def knn(self):
for text in self.all_texts.keys():
result = None
for category in self.categories.keys():
if not result or self.knn_results[text][category]['KNN distance'] < result:
knn = category
distance = self.knn_results[text][category]['KNN distance']
result = distance
print 'File:', text
print 'KNN:', knn
print 'Distance: %.3f' % distance
print``````

Sample usage:

``````mywork = Words_Works()                         # Initialize

lit = 'literature.txt'

comp = 'computers.txt'

phy = 'physics.txt'

for category in mywork.categories.keys():              # Print categories
print category
print mywork.categories[category]
print
print

txts = ('sample1.txt', 'sample2.txt')                  # Creating list of files to analyse

for text in txts:                                      # Adding them

for text in mywork.all_texts.keys():                   # Print texts
print text
print mywork.all_texts[text]
print
print

mywork.knn_calc()                                         # calculate knn

for files in mywork.knn_results.keys():                   # print detailed results
print files
for category in mywork.knn_results[files].keys():
print category
print mywork.knn_results[files][category]
print
print

mywork.knn()                                              # Display results``````

Maybe this pattern should rather be compiled in __init__, or at module load time? At least not inside the loop, I think.

``leaf_pat = re.compile('%s\$' % leaf)``

You could prepare a ready-made list self.leaf_pats; there is probably no need for the words themselves. This is the version I made for finding words, unleafify and unstopify in connection with one discussion thread (the input word is split from the text file, with extra 'rubbish'):

``````stopwords=set(w.rstrip() for w in (open('stopwords.txt')))
# one stop word per line; note the file handle above is never closed

# suffixes to chop, checked in this order -- first match wins
leaf_words = ("ing","es","ed","er","ly",'s')

# characters stripped from word edges
notthese = string.punctuation + string.digits + string.whitespace

# Normalise one raw token: lowercase, strip punctuation/digits/whitespace,
# drop stop words, chop one known suffix.  Returns a 1-tuple with the
# cleaned word, or an empty tuple for words to discard.
# NOTE(review): indentation was stripped in extraction -- the first
# `return` most plausibly sits inside the `if`, with the final `return ()`
# as the fall-through for empty/stop words.  TODO: confirm.
def wordprepare(word):
word = word.lower().strip(notthese)
if word and word not in stopwords:
# the generator yields only suffixes that match; break after the first
for leaf in (lword for lword in leaf_words
if word.endswith(lword)):
word=word[:-len(leaf)]
break
return (word.rstrip(notthese),)
return ()``````

Probably your regexp method is more effective in getting the words from file.

Also with order of leaf words you have, you probably never use the 'es' ending.

The pattern must be done inside the loop since they are 6 different patterns.

And about the order, for all the information I could get I believe that it is right.

6 different patterns:

``````## in init
# NOTE(review): '%s\$' makes the regex match a literal '$' character, not
# end-of-string, so these patterns can never match a \w+ token; '%s$' (or
# plain str.endswith) is presumably what was meant -- the same flaw exists
# in the original unleafify().
self.leaf_pats = [(re.compile('%s\$' % leaf),len(leaf))
for leaf in ['s', 'es', 'ed', 'er', 'ly', 'ing']
]
# Same stemming loop as the original, but with the patterns precompiled
# once instead of recompiled on every call.
def __unleafify(self):
self.unleaf = self.unstop[:]
for leaf_pat,leaf_len in self.leaf_pats:
for i in range(len(self.unleaf)):
if leaf_pat.findall(self.unleaf[i]):
self.unleaf[i] = self.unleaf[i][:-leaf_len]``````

The 'es' words:

Prove test file with only 'apples', the dict becomes :

``{'apples.txt': {'apple': 1}}``

It should have 'appl' by removing 'es', shouldn't it? Or take out 'es', as it will never find anything, as 's' will be dropped by 's' pattern.

No, You must have 'apple', not 'appl'.

Process - proces - proc

About the patterns, I think it makes sense if you let the leaf_words private, but then it takes out the possibility of altering them and running it with different orders for testing purposes.

No, You must have 'apple', not 'appl'.

Process - proces - proc

For me it is strange, but maybe that is only because I am not linguist. I would like to for example make
handles->handl
handling -> handl
handler -> handl
handled -> handl

So actually the verbs is more the point than nouns.

BTW my Word 2003 complains about process should it be maybe
processes -> processe

But for the algorithm in question, from all the information I could gather, the more precise approach is the one I implemented; we must also keep in mind that the accuracy of such a simple algorithm is not great.

Anyway, I also don't like the reliability of that way of stemming.

I've got a new version to be posted, where I use an implementation of a real stemmer, but the truth is that if you compare the results of the two, you will see that the method currently used is very good in general.

Cheers, and thanks by the comments mate.

Happy coding