No one has voted on any posts yet. Votes from other community members are used to determine a member's reputation amongst their peers.
I have a text document 'topics.txt': 1~cocoa 2~ 3~ 4~ 5~grain~wheat~corn~barley~oat~sorghum 6~veg-oil~linseed~lin-oil~soy-oil~sun-oil~soybean~oilseed~corn~sunseed~grain~sorghum~wheat 7~ 8~ 9~earn 10~acq and so on. Here the numbers correspond to the file names; I have about 20000 files. import os import re import sys sys.stdout=open('f1.txt','w') from collections import Counter from glob import glob def removegarbage(text): text=re.sub(r'\W+',' … |
|
import os
import re
import sys
from collections import Counter
from glob import glob


def removegarbage(text):
    """Normalise *text* for word counting.

    Every run of non-word characters (anything ``\\W`` matches, i.e. not a
    letter, digit or underscore) is replaced by a single space, and the
    result is lower-cased. Note that this also splits on ``~`` and ``-``,
    so ``veg-oil`` becomes the two words ``veg`` and ``oil``.
    """
    return re.sub(r'\W+', ' ', text).lower()


def word_probabilities(counter, num_files):
    """Yield ``(word, count, probability)`` tuples, most common word first.

    ``probability`` is ``count / num_files`` computed with true division.
    The original used floor division (``//``), which reports 0 for every
    word that occurs fewer times than there are files.

    Raises:
        ValueError: if ``num_files`` is not positive, because the ratio is
            undefined (the original failed with a bare ZeroDivisionError).
    """
    if num_files <= 0:
        raise ValueError(
            'num_files must be positive to compute probabilities, got '
            '{}'.format(num_files)
        )
    for word, count in counter.most_common():
        yield word, count, count / num_files


if __name__ == '__main__':
    # Kept at module scope (not inside a function) so the names below stay
    # global for any code that follows this part of the script.
    folderpath = 'd:/induvidual-articles'
    counter = Counter()

    # Only the number of article files is used here, as the denominator.
    filepaths = glob(os.path.join(folderpath, '*.txt'))
    num_files = len(filepaths)

    with open('topics.txt', 'r') as filehandle:
        lines = filehandle.read()
    words = removegarbage(lines).split()
    counter.update(words)

    # Write the report through an explicit handle instead of reassigning
    # sys.stdout: the file is flushed and closed deterministically and the
    # process-wide stdout is left alone.
    with open('f1.txt', 'w') as report:
        for word, count, probability in word_probabilities(counter, num_files):
            print('{} {} {}'.format(word, count, probability), file=report)