''' Count_find_duplicate_words101.py
find duplicate words in a text (preprocessed)
using Counter() from the Python module collections and set()
following a tip from raymondh
tested with Python27, IronPython27 and Python33  by vegaseat  24sep2013
'''

from string import punctuation
from collections import Counter

# demo input: two lines of text in which several words occur more than once
text = """\
If you see a turn signal blinking on a car with a southern license plate,
you may rest assured that it was on when the car was purchased."""

# normalize: lower-case the text first, then drop every punctuation character
text2 = ''.join(filter(lambda ch: ch not in punctuation, text.lower()))

# split() without arguments cuts at any run of whitespace (spaces, newlines)
word_list = text2.split()

# a word is a duplicate when it is counted more than once;
# sorted() puts the duplicates into alphabetical order
duplicate_word_list = sorted(
    word for word, count in Counter(word_list).items() if count > 1)

# report the original text followed by the duplicates that were found
print("Original text:")
print(text)
print('-' * 72)
print("A list of duplicate words in the text:")
print(duplicate_word_list)

''' result ...
Original text:
If you see a turn signal blinking on a car with a southern license plate,
you may rest assured that it was on when the car was purchased.
------------------------------------------------------------------------
A list of duplicate words in the text:
['a', 'car', 'on', 'was', 'you']
'''
2 Contributors | 1 Reply | 26 Views | 3 Years Discussion Span | Last Post by paddy3118
0

Neat.
Personally I would try:

duplicate_word_list = [word for word, count in Counter(word_list).most_common() if count > 1]

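I suggest this only because I am used to using most_common() with Counter. Note that it lists the duplicate words in order of descending count rather than alphabetically.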

Have something to contribute to this discussion? Please be thoughtful, detailed and courteous, and be sure to adhere to our posting rules.