Aproach to the implementation of K-Nearest Neighbor (KNN) using the Euclidean algorithm.

Sample Usage:

mywork = Words_Works()

lit = 'literature.txt'
mywork.add_category(lit, 'Literature')         # adding files as category
comp = 'computers.txt'
mywork.add_category(comp, 'Computers')
phy = 'physics.txt'
mywork.add_category(phy, 'Physics')
                                               # saving categories dictionary to file
mywork.save_categories()                       # can be loaded calling load_categories()

print mywork.categories                        # prints categories dictionary
print

txts = ('sample1.txt', 'sample2.txt')          # creating list of files to add

for text in txts:
    mywork.add_text(text)                      # adding files

print mywork.all_texts                         # prints files word ocurrence count
print

mywork.knn_calc()                              # perform calculation 

print mywork.knn_results                       # print overall results
print

mywork.knn()                                   # print files results

Cheers and Happy coding!

import cPickle
import re
from math import sqrt

class Words_Works():

    def __init__(self):
        self.all_texts = {}
        self.categories = {}
        self.knn_results = {}
        self.leaf_words = ['s', 'es', 'ed', 'er', 'ly', 'ing']
        self.stop_words = ['a', 'i', 'it', 'am', 'at', 'on', 'in', 'of', 'to', 'is', 'so', 'too', 'my', 'the', 'and', 'but', 'are', 'very', 'here', 'even', 'from', 'them', 'then', 'than', 'this', 'that', 'though']

    def add_category(self, f, cat_name):
        f_in = open(f)
        self.text = f_in.read().lower()
        f_in.close()
        self.wordify()
        self.unstopify()
        self.unleafify()
        self.categories[cat_name] = {}        
        for item in self.unleaf:
            if self.categories[cat_name].has_key(item):
                self.categories[cat_name][item] += 1
            else:
                self.categories[cat_name][item] = 1

    def load_categories(self):
        try:
            cat_db = open('categories.pkl', 'rb')
            self.categories = cPickle.load(cat_db)
            cat_db.close()
            print 'Loading of categories file successfull'
        except:
            print 'Loading of categories file failed'

    def save_categories(self):
        cat_db = open('categories.pkl', 'wb')
        cPickle.dump(self.categories, cat_db, -1)
        cat_db.close()

    def add_text(self, f):
        f_in = open(f)
        self.text = f_in.read().lower()
        f_in.close()
        self.wordify()
        self.unstopify()
        self.unleafify()
        self.indexify()
        self.all_texts[f] = {}        
        for item in self.unleaf:
            if self.all_texts[f].has_key(item):
                self.all_texts[f][item] += 1
            else:
                self.all_texts[f][item] = 1

    def wordify(self):
        words_pat = re.compile('\\w+')
        self.words = words_pat.findall(self.text)

    def unstopify(self):
        self.unstop = [item for item in self.words if item not in self.stop_words]

    def unleafify(self):
        self.unleaf = self.unstop[:]
        for leaf in self.leaf_words:
            leaf_len = len(leaf)
            leaf_pat = re.compile('%s$' % leaf)
            for i in range(len(self.unleaf)):
                if leaf_pat.findall(self.unleaf[i]):
                    self.unleaf[i] = self.unleaf[i][:-leaf_len]

    def knn_calc(self):
        for text in self.all_texts.keys():
            self.knn_results[text] = {}
            for category in self.categories.keys():
                self.knn_results[text][category] = {}
                iterations = 0
                distance = 0
                for word in self.all_texts[text].keys():
                    if word in self.categories[category].keys():
                        distance += (self.all_texts[text][word] - self.categories[category][word]) ** 2
                        iterations += 1
                distance = sqrt(distance)
                self.knn_results[text][category]['KNN distance'] = distance
                self.knn_results[text][category]['KNN iterations'] = iterations

    def knn(self):
        for text in self.all_texts.keys():
            result = None
            for category in self.categories.keys():
                if not result or self.knn_results[text][category]['KNN distance'] < result:
                    knn = category
                    distance = self.knn_results[text][category]['KNN distance']
                    iterations = self.knn_results[text][category]['KNN iterations']
            print 'File:', text
            print 'KNN:', category
            print 'Distance:', distance
            print 'Iterations:', iterations
            print

-Fixed bug on the knn display.
-Improved calculation accuracy.
-Auto save and auto load of categories

import re
from math import sqrt
try:
    import cPickle as pickle
except:
    import pickle


class Words_Works():

    def __init__(self):
        self.all_texts = {}
        self.categories = {}
        self.knn_results = {}
        self.leaf_words = ['s', 'es', 'ed', 'er', 'ly', 'ing']
        self.stop_words = ['a', 'i', 'it', 'am', 'at', 'on', 'in', 'of', 'to', 'is', 'so', 'too', 'my', 'the', 'and', 'but', 'are', 'very', 'here', 'even', 'from', 'them', 'then', 'than', 'this', 'that', 'though']
        self.__load_categories()

    def __load_categories(self):
        try:
            cat_db = open('categories.pkl', 'rb')
            self.categories = pickle.load(cat_db)
            cat_db.close()
            categories = list(self.categories.keys())
            print '%s categories loaded:' % len(self.categories.keys())
            print ', '.join(self.categories.keys())
            print
        except:
            print 'No categories were loaded.\n'

    def add_category(self, f, cat_name):
        f_in = open(f)
        self.text = f_in.read().lower()
        f_in.close()
        self.__wordify()
        self.__unstopify()
        self.__unleafify()
        self.categories[cat_name] = {}        
        for item in self.unleaf:
            if self.categories[cat_name].has_key(item):
                self.categories[cat_name][item] += 1
            else:
                self.categories[cat_name][item] = 1
        self.__save_categories()

    def del_category(self, cat_name):
        if cat_name in self.categories.keys():
            del self.categories[cat_name]
        self.__save_categories()            

    def __save_categories(self):
        cat_db = open('categories.pkl', 'wb')
        pickle.dump(self.categories, cat_db, -1)
        cat_db.close()

    def add_text(self, f):
        f_in = open(f)
        self.text = f_in.read().lower()
        f_in.close()
        self.__wordify()
        self.__unstopify()
        self.__unleafify()
        self.all_texts[f] = {}        
        for item in self.unleaf:
            if self.all_texts[f].has_key(item):
                self.all_texts[f][item] += 1
            else:
                self.all_texts[f][item] = 1

    def __wordify(self):
        words_pat = re.compile('\\w+')
        self.words = words_pat.findall(self.text)

    def __unstopify(self):
        self.unstop = [item for item in self.words if item not in self.stop_words]

    def __unleafify(self):
        self.unleaf = self.unstop[:]
        for leaf in self.leaf_words:
            leaf_len = len(leaf)
            leaf_pat = re.compile('%s$' % leaf)
            for i in range(len(self.unleaf)):
                if leaf_pat.findall(self.unleaf[i]):
                    self.unleaf[i] = self.unleaf[i][:-leaf_len]

    def knn_calc(self):
        for text in self.all_texts.keys():
            self.knn_results[text] = {}
            for category in self.categories.keys():
                self.knn_results[text][category] = {}
                distance = 0
                for word in self.all_texts[text].keys():
                    if word in self.categories[category].keys():
                        distance += (self.all_texts[text][word] - self.categories[category][word]) ** 2
                    else:
                        distance += (self.all_texts[text][word] - 0) ** 2 
                distance = sqrt(distance)
                self.knn_results[text][category]['KNN distance'] = distance
        self.knn()

    def knn(self):
        for text in self.all_texts.keys():
            result = None
            for category in self.categories.keys():
                if not result or self.knn_results[text][category]['KNN distance'] < result:
                    knn = category
                    distance = self.knn_results[text][category]['KNN distance']
                    result = distance
            print 'File:', text
            print 'KNN:', knn
            print 'Distance: %.3f' % distance
            print

Sample usage:

mywork = Words_Works()                         # Initializa

lit = 'literature.txt'

mywork.add_category(lit, 'Literature')               # Adding as category

comp = 'computers.txt'

mywork.add_category(comp, 'Computer Science')

phy = 'physics.txt'

mywork.add_category(phy, 'Physics')

for category in mywork.categories.keys():              # Print categories
    print category
    print mywork.categories[category]
    print
print

txts = ('sample1.txt', 'sample2.txt')                  # Creating list of files to analyse

for text in txts:                                      # Adding them
    mywork.add_text(text)

for text in mywork.all_texts.keys():                   # Print texts
    print text
    print mywork.all_texts[text]
    print
print

mywork.knn_calc()                                         # calculate knn

for files in mywork.knn_results.keys():                   # print detailed results
    print files
    for category in mywork.knn_results[files].keys():
        print category
        print mywork.knn_results[files][category]
    print
print

mywork.knn()                                              # Display results

Maybe this pattern should be made in init better or in beginnig of loading the module? At least not inside loop, I think.

leaf_pat = re.compile('%s$' % leaf)

You could make ready list self.leaf_pats, no need probably for the words itself. This is the version i made for find words, unleafify and unstoppyfy in connection to one discussion thread (input word is splitted from the text file, with extra 'rubbish'):

stopwords=set(w.rstrip() for w in (open('stopwords.txt')))

leaf_words = ("ing","es","ed","er","ly",'s')

notthese = string.punctuation + string.digits + string.whitespace

def wordprepare(word):
    word = word.lower().strip(notthese)
    if word and word not in stopwords:
        for leaf in (lword for lword in leaf_words
                      if word.endswith(lword)):
                        word=word[:-len(leaf)]
                        break
        return (word.rstrip(notthese),)
    return ()

Probably your regexp method is more effective in getting the words from file.

Also with order of leaf words you have, you probably never use the 'es' ending.

Edited 6 Years Ago by pyTony: n/a

The pattern must be done inside the loop since they are 6 different patterns.

And about the order, for all the information I could get I believe that it is right.

6 different patterns:

## in init
       self.leaf_pats = [(re.compile('%s$' % leaf),len(leaf))
                          for leaf in ['s', 'es', 'ed', 'er', 'ly', 'ing']
                          ]
    def __unleafify(self):
        self.unleaf = self.unstop[:]
        for leaf_pat,leaf_len in self.leaf_pats:
            for i in range(len(self.unleaf)):
                if leaf_pat.findall(self.unleaf[i]):
                    self.unleaf[i] = self.unleaf[i][:-leaf_len]

The 'es' words:

Prove test file with only 'apples', the dict becomes :

{'apples.txt': {'apple': 1}}

It should have 'appl' by removing 'es', shouldn't it? Or take out 'es', as it will never find anything, as 's' will be dropped by 's' pattern.

About the patterns, I think it makes sense if you let the leaf_words private, but then it takes out the possibility of altering them and running it with different orders for testing purposes.

No, You must have 'apple', not 'appl'.

Process - proces - proc

For me it is strange, but maybe that is only because I am not linguist. I would like to for example make
handles->handl
handling -> handl
handler -> handl
handled -> handl

So actually the verbs is more the point than nouns.

BTW my Word 2003 complains about process should it be maybe
processes -> processe

But for the algorith in question all the infos I got, the more precise would be the way I implemented, but we must see also that the level of accuracy of a so simple algorithm is not great.

Anyway, i also don't like the reliability of that way of stemming.

I've got a new version to be posted, where I use a implementation of a real stemmer, but the truth is that if you compare the results of the two, you will see the the method current used is very good in general.

Cheers, and thanks by the comments mate.

Happy coding

The article starter has earned a lot of community kudos, and such articles offer a bounty for quality replies.