# Read the whole story in binary mode; `text` holds the raw file content.
text = open('./alice.txt', 'rb').read()


# Split the text into individual words and add the unknown ones to the index
def split_text(text):
    """Add the words of `text` that are missing from ./words.dat to that file.

    The existing index is read, the file is truncated and the known words are
    written back, then every word of `text` that is not yet in the index is
    written after them. The number of newly added words is printed; nothing
    is returned.
    """
    # Load the known-words index as a list of whitespace-separated entries.
    index = open('./words.dat', 'rb').read().split()
    # 'wb+' truncates the file, so the words just read are written back first.
    index_file = open('./words.dat','wb+')
    for word in index:
        index_file.write(word)
        index_file.write('\n')

    # string.punctuation supplies the characters that are filtered out below.
    import string
    # Remove punctuation and lower-case what is left.
    # NOTE(review): the caller reads the story with mode 'rb'; on Python 3
    # iterating bytes yields ints, so `c not in string.punctuation` raises
    # TypeError here -- confirm the target interpreter or read in text mode.
    out = "".join(c for c in text if c not in string.punctuation).lower()
    # Split the cleaned text on whitespace.
    words = out.split()
    # Count and store each word that the index does not contain yet.
    word_count = 0
    for word in words:
        # Drop one trailing '.', ',', '"' or '”' character if present.
        if word.endswith('.') or  word.endswith(',') or word.endswith('"') or word.endswith('”'):
            word = word[:-1]
        if word not in index:
            # Remember the word so a later repeat is not counted again.
            index.append(word)
            index_file.write(word)
            index_file.write('\n')
            word_count = word_count + 1
    print ('The total amount of new words is:')
    print (word_count)
    index_file.close()

# Run the indexer on the story loaded above.
split_text(text)

I keep getting an error on the "out =" line: "TypeError: 'in <string>' requires string as left operand".

This doesn't work in Python 3, whose strings are Unicode, unless the bytes read from the files are decoded first. It works when the files are opened in text mode instead of binary mode:

#-*- coding: utf-8 -*-
import string

#Split the text into individual words
def split_text(text):
    """Add every word of ``text`` that is not yet indexed to ``./words.dat``.

    The text is stripped of ASCII punctuation, lower-cased and split on
    whitespace. Words missing from the index are added to it, and the number
    of newly added words is printed.

    Args:
        text: The story as a single ``str``.

    Returns:
        None. The updated index is written back to ``./words.dat``, one word
        per line.
    """
    index_path = './words.dat'

    # Load the known words. A missing index file simply means nothing has
    # been indexed yet, so start from an empty index instead of crashing.
    try:
        with open(index_path) as index_file:
            # dict.fromkeys() drops duplicates while keeping first-seen order.
            index = list(dict.fromkeys(index_file.read().split()))
    except FileNotFoundError:
        index = []
    known = set(index)  # O(1) membership tests instead of scanning the list

    # Remove punctuation
    out = "".join(c for c in text if c not in string.punctuation).lower()

    word_count = 0
    for word in out.split():
        # string.punctuation is ASCII-only, so a typographic closing quote
        # survives the filter above and still has to be trimmed here.
        if word.endswith(('.', ',', '"', '”')):
            word = word[:-1]
        # Trimming can leave an empty string (a lone quote); never index it.
        if word and word not in known:
            known.add(word)
            index.append(word)
            word_count += 1

    # Rewrite the index once, in 'w' mode. Re-writing the known words to a
    # file opened in append mode duplicated the whole index on every run.
    with open(index_path, 'w') as index_file:
        index_file.writelines(word + '\n' for word in index)

    print('The total amount of new words is:')
    print(word_count)

# Read the story in text mode so split_text() receives a str, and close the
# file as soon as its content has been read.
with open('./alice.txt') as story_file:
    text = story_file.read()

split_text(text)

Edited 2 Years Ago by pyTony

There is some room for improvement in this code.
Where `out` is built you are already removing punctuation, and then in the `endswith` check you are testing for punctuation that has already been removed.
There is no need to use a file and append words to it; just use set().
So then it could look like this.

import string

# Load the story and lower-case it so words compare case-insensitively.
with open('alice.txt') as story:
    text = story.read().lower()

# Delete every ASCII punctuation character in one pass, then split the
# remaining text on whitespace.
strip_punctuation = str.maketrans('', '', string.punctuation)
words = text.translate(strip_punctuation).split()

# A set keeps each word once, so its size is the number of distinct words.
unique_words = set(words)
print(len(unique_words))
import string

# Read the story, strip ASCII punctuation and lower-case what is left.
with open("alice.txt", "r") as story:
    text = story.read()
    kept_chars = [ch for ch in text if ch not in string.punctuation]
    words = "".join(kept_chars).lower()
    # Deduplicate through a set, then sort the words alphabetically.
    list_unique = sorted(set(words.split()))

# Store the unique words in words.dat, one per line.
with open('words.dat', 'w') as index_file:
    index_file.writelines(word + "\n" for word in list_unique)

How would I make it so that only words with 2 or more characters are put into that file?

And for the file path, if I leave it as is right now, would that work if someone else is running the program on their computer?

How would I make it so that only words with 2 or more characters are put into that file?

>>> s = 'hi this is a test'
>>> s = s.split()
>>> [i for i in s if len(i) >= 2]
['hi', 'this', 'is', 'test']

#With regular loop
>>> for item in s:
...     if len(item) >= 2:
...         print(item)
...         
hi
this
is
test

And for the file path, if I leave it as is right now, would that work if someone else is running the program on their computer?

If no path is given, files are always looked for in the same folder that you, or anyone else, run the code (.py) from.

Edited 2 Years Ago by snippsat

Use module os ...
os.getcwd() returns the current working directory (path)
os.chdir(path) changes the working directory to the one given in path

You could ask the user for the required path with input().

Edited 2 Years Ago by vegaseat

import string

# Ask for the input files as one line of whitespace-separated names.
list_of_files = input("Enter in text files to read from: ")
list_of_files = list_of_files.split()

# Only the first three names are opened; entering fewer raises IndexError.
with open(list_of_files[0], "r") as f1, open(list_of_files[1], "r") as f2, open(list_of_files[2], "r") as f3:
    # NOTE(review): the commas make `text` a tuple of three strings (one per
    # file), not a single string.
    text = f1.read(), f2.read(), f3.read()
    # Intended to remove all punctuation marks and make lower case.
    # NOTE(review): because `text` is a tuple, each `c` below is a whole
    # file's content rather than one character, so individual punctuation
    # characters are not filtered out, and the three contents are joined
    # with no separator between them.
    words = "".join(c for c in text if c not in string.punctuation).lower()
    # convert to a sorted list of unique words via set comprehension
    list_unique = sorted(list({w for w in words.split()}))
    # Keep only the words that are at least two characters long.
    list_final = [j for j in list_unique if len(j) >= 2]

# Write the remaining words to words.dat, one per line.
with open("words.dat", "w") as i:
    # Each `lines` item is a single word from list_final.
    for lines in list_final:
        i.write(lines)
        i.write("\n")

So it looks like it reads from all three files, but it includes the words with punctuation now?

So it looks like it reads from all three files, but it includes the words with punctuation now?

text is now a tuple.

>>> s = 'Test.'
>>> s1 = 'hello?'
>>> s2 =  '-world?,'
>>> text = s, s1, s2 #Python always make a tuple when you do this
>>> text
('Test.', 'hello?', '-world?,')
>>> "".join(c for c in text if c not in string.punctuation).lower()
'test.hello?-world?,'
>>> text = ' '.join(text) #Fix
>>> "".join(c for c in text if c not in string.punctuation).lower()
'test hello world'

Learn to do small tests like I do here.

Edited 2 Years Ago by snippsat

This article has been dead for over six months. Start a new discussion instead.