0

Hello..
I'm doing some simple text processing in Python, which includes indexing, splitting, and tokenizing the text files in a folder. I want to improve the stemming and tagging process by importing the MontyLingua library for tokenizing and tagging, but I don't know how to edit my code to call it. Does anybody know how to use and call theMontyTagger and theMontyTokenizer? Please provide a coding example.
This is what my existing code looks like in the IDLE (Python) GUI:

import re
import os
import sqlobject
import re, math
import sgmllib, string
from numpy import*
import adodb
from nltk_lite.corpora import stopwords
from nltk_lite import stem
import MySQLdb

# English stop words, materialised as a list; SearchEngine.textProcessing
# skips any word found in it when indexing.
stopwords_list = list(stopwords.raw('english'))

# Porter stemmer instance.  NOTE(review): nothing in the visible code calls
# it yet -- confirm whether stemming is still meant to be wired in.
stemmer = stem.Porter()

class SearchEngine:
    """Word-frequency indexer for a folder of text files.

    ``textProcessing`` reads every file in a folder, writes a report to
    ``output_kira.txt`` and a sentence dump to ``ayat.txt`` (both in the
    current working directory), and stores one row per indexed word in the
    MySQL table ``tabletest1`` of database ``test1`` on localhost.
    """

    # Folder scanned when textProcessing() is called without an argument.
    DEFAULT_FOLDER = "C:/Users/abc1234/Documents/Google Talk Received Files/installer/installer/ayat"

    # Removes double quotes from a token.  The ``r`` prefix has to sit
    # outside the quotes: the pattern 'r[""]' would instead match the
    # two-character sequence r" and eat the letter r from words like `her"`.
    PUNCTUATION = re.compile(r'"')

    # Tokens that are never indexed, in addition to stop words.
    SKIPPED_TOKENS = ('-', '<', 'Dr.', ':', '&', '', '&#8212')

    # Placeholders are filled in by the driver, so quotes or backslashes in
    # a word or file name cannot break (or inject into) the statement.
    INSERT_SQL = ("INSERT INTO tabletest1(term,id,idx,frq,filename) "
                  "VALUES(%s,%s,%s,%s,%s)")

    def __init__(self, filePath, files, filename, words, word, freq, wordst, wordss, no_id):
        """Store every argument, unchanged, as the attribute of the same name.

        ``textProcessing`` later overwrites ``filename``, ``words`` and
        ``no_id``; the other attributes are only stored.
        """
        self.filePath = filePath
        self.files = files
        self.filename = filename
        self.words = words
        self.word = word
        self.freq = freq
        self.wordst = wordst
        self.wordss = wordss
        self.no_id = no_id

    def textProcessing(self, folder=None):
        """Index every regular file found directly inside ``folder``.

        For each file this method:
          * bumps ``self.no_id`` (1 for the first file, 2 for the next, ...)
            and records the name in ``self.filename`` / ``self.filenames``;
          * splits the content on whitespace into ``self.words``;
          * writes the word and unique-word counts to ``output_kira.txt``;
          * copies the raw words to ``ayat.txt``, starting a new paragraph
            after each word that ends in '.' (except 'Dr.');
          * inserts one row per indexed word into ``tabletest1``.

        The second word of a file is reported as its URL and is itself
        never indexed.

        Args:
            folder: directory to scan; defaults to ``DEFAULT_FOLDER``.

        Raises:
            OSError / IOError: if the folder or a file cannot be read, or
                an output file cannot be written.
            MySQLdb.Error: if connecting or inserting fails.
        """
        if folder is None:
            folder = self.DEFAULT_FOLDER

        self.no_id = 0
        outfile = open('output_kira.txt', 'w')
        try:
            outfile3 = open('ayat.txt', 'w')
            try:
                for filename in os.listdir(folder):
                    fullpath = os.path.join(folder, filename)
                    # Sub-directories cannot be read as text; skip them
                    # rather than abort the whole run.
                    if not os.path.isfile(fullpath):
                        continue
                    self.filename = filename
                    self.filenames = filename
                    self.no_id = self.no_id + 1
                    print(filename)
                    outfile.write("\n%s\n" % (filename,))
                    outfile3.write("\n%s\n" % (filename,))
                    self.words = self._read_words(fullpath)
                    self._process_words(filename, outfile, outfile3)
            finally:
                outfile3.close()
        finally:
            outfile.close()

    def _read_words(self, path):
        """Return the whitespace-separated tokens of the file at ``path``."""
        infile = open(path, 'r')
        try:
            return infile.read().split()
        finally:
            # Close each input file as soon as it has been read.
            infile.close()

    def _count_words(self, words):
        """Return (counts, first_index) dicts keyed by quote-stripped token."""
        counts = {}
        first_index = {}
        for position, raw in enumerate(words):
            token = self.PUNCTUATION.sub("", raw)
            if token in counts:
                counts[token] += 1
            else:
                counts[token] = 1
                # Remember where the token first occurs so indexing needs
                # no list search later (a search for the stripped token in
                # the raw word list could fail).
                first_index[token] = position
        return counts, first_index

    def _process_words(self, filename, outfile, outfile3):
        """Report on ``self.words`` and store the indexable ones."""
        words = self.words
        print('Words in text: %d' % len(words))
        outfile.write("Words in text:%d\n" % len(words))

        counts, first_index = self._count_words(words)
        print("Unique words:%d\n" % len(counts))
        outfile.write("Unique words:%d\n" % len(counts))

        # NOTE(review): words are written back to back with no separator;
        # confirm whether a space between them was intended.
        for raw in words:
            outfile3.write(raw)
            if raw.endswith('.') and raw != 'Dr.':
                outfile3.write("\n\n")

        # The second token is reported as the file's URL; files with fewer
        # than two tokens simply have none.
        url = ''
        if len(words) > 1:
            url = words[1]

        rows = []
        for term, freq in sorted(counts.items()):
            if term in self.SKIPPED_TOKENS or term == url:
                continue
            if term in stopwords_list:
                continue
            print(term)
            position = first_index[term]
            outfile.write(
                "WORD=%s INDEX=%d FREKUENSI=%d URL=%s ID=%d FILE NAME=%s\n"
                % (term, position, freq, url, self.no_id, filename))
            # Order follows the column list (term, id, idx, frq, filename):
            # id is the file number and frq the frequency, matching the
            # ID= / FREKUENSI= labels in the report line above.
            rows.append((term, self.no_id, position, freq, filename))

        self._store_rows(rows)

    def _store_rows(self, rows):
        """Insert ``rows`` into ``tabletest1`` using a single connection."""
        if not rows:
            return
        connection = MySQLdb.connect(host="localhost",
                                     user="root",
                                     passwd="",
                                     db="test1")
        try:
            cursor = connection.cursor()
            try:
                cursor.executemany(self.INSERT_SQL, rows)
            finally:
                cursor.close()
            connection.commit()
        finally:
            connection.close()

# All nine constructor fields start out as empty strings; textProcessing()
# overwrites the ones it uses while it runs.
blank_fields = [''] * 9
i = SearchEngine(*blank_fields)
i.textProcessing()

Edited by Reverend Jim: Fixed formatting

2
Contributors
1
Reply
4
Views
7 Years
Discussion Span
Last Post by PassBy
0

Try this:

import MontyLingua

# One MontyLingua instance provides both the tokenizer and the tagger.
tagger = MontyLingua.MontyLingua()

text = "your-string-to-tag"

# Step 1: tokenize the raw text.
tokens = tagger.tokenize(text)

# Step 2: tag the tokenized text.
tagged = tagger.tag_tokenized(tokens)

# One-call alternative working directly on the raw text; it replaces the
# result of step 2.  NOTE(review): confirm against the MontyLingua docs what
# jist() returns -- it may be more than plain tags.
tagged = tagger.jist(text)

This topic has been dead for over six months. Start a new discussion instead.
Have something to contribute to this discussion? Please be thoughtful, detailed and courteous, and be sure to adhere to our posting rules.