# get some statistics of a story text
# count lines, sentences, words, frequent words ...
# written for Python 2.5.4 and Python 3.1.1, so the file handling below
# uses try/finally rather than the with-statement (unavailable in 2.5
# without a __future__ import)
# vegaseat 06oct2009

# test text (9 lines total, 2 blank lines, 8 sentences) ...
text = """\
Just a simple text we can use to count the sentences.
Looks like fun! Why do sentences have to end so soon?

Every now and then let's put in a blank line, so we
track those too. Perhaps something with a multitude of
characters.

Ah, another blank line for the count. Time for lunch!
That should do it for this longwinded test!"""

# write the test file
fname = "MyText1.txt"
fout = open(fname, "w")
try:
    fout.write(text)
finally:
    # always release the handle, even if the write fails
    fout.close()

# set all the counters to zero
lines = 0
blanklines = 0
# start with empty word list and character frequency dictionary
word_list = []
cf_dict = {}

# read the test file back in
# or change the filename to a text you have
textf = open(fname, "r")
try:
    # reads one line at a time
    for line in textf:
        # count lines and blanklines
        lines += 1
        # a line holding nothing but whitespace is blank too,
        # not just a line that consists of a bare newline
        if not line.strip():
            blanklines += 1
        # create a list of words
        # split at any whitespace regardless of length
        word_list.extend(line.split())
        # create a character:frequency dictionary
        # all letters adjusted to lower case
        for char in line.lower():
            cf_dict[char] = cf_dict.get(char, 0) + 1
finally:
    textf.close()

# create a word frequency dictionary
# all words in lower case
word_dict = {}
# a list of punctuation marks (could use string.punctuation)
punctuations = [",", ".", "!", "?", ";", ":"]
# rstrip() treats its argument as a set of characters, so passing all
# marks at once removes mixed trailing runs such as "?!" or ".,"
trailing = "".join(punctuations)
for word in word_list:
    # remove any trailing punctuation marks, then lower-case the word
    word = word.rstrip(trailing).lower()
    # a token made only of punctuation (e.g. "...") is not a word
    if not word:
        continue
    word_dict[word] = word_dict.get(word, 0) + 1

# assume that each sentence ends with '.' or '!' or '?'
# (every such character counts once, so "..." counts as three)
sentences = 0
for mark in ".!?":
    sentences += cf_dict.get(mark, 0)

number_words = len(word_list)
#print word_list # test
#print cf_dict # test
#print word_dict # test

# formatted prints will work with Python2 and Python3
print( "Total lines: %d" % lines )
print( "Blank lines: %d" % blanklines )
print( "Sentences : %d" % sentences )
print( "Words : %d" % number_words )
print('-' * 30)

# optional things ...
# average word length: characters in all (punctuation-stripped) words
# divided by the number of whitespace-separated tokens
num = float(number_words)
total_chars = sum([len(k) * v for k, v in word_dict.items()])
if number_words:
    avg_wordsize = total_chars / num
else:
    # empty text: avoid dividing by zero
    avg_wordsize = 0.0
# most common words; (count, word) tuples sorted descending, so ties
# are broken by reverse alphabetical order of the word
mcw = sorted([(v, k) for k, v in word_dict.items()], reverse=True)
# most common characters, same ordering rule
mcc = sorted([(v, k) for k, v in cf_dict.items()], reverse=True)
print( "Average word length : %0.2f" % avg_wordsize )
print( "3 most common words : %s" % mcw[:3] )
print( "3 most common characters: %s" % mcc[:3] )

"""expected result for the test text above -->
Total lines: 9
Blank lines: 2
Sentences : 8
Words : 62
------------------------------
Average word length : 4.08
3 most common words : [(3, 'for'), (3, 'a'), (2, 'we')]
3 most common characters: [(55, ' '), (31, 'e'), (30, 't')]
"""