I've been stuck on this problem for a day. Since no one has responded to my question on Stack Overflow, I hope somebody here can help.

I'm trying to build a dictionary for Shakespeare's plays that records, for each character, the act and scene of their first speech (not the first time they are mentioned) and the number of speeches they have. Here is the link for the text. For example, in Romeo and Juliet the entry would be 'JULIET': [1, 3, 118], since Juliet first speaks in Act 1, Scene 3 and has 118 speeches in the play. I tried splitting the text into lines, but that doesn't give me the act and scene numbers.

import string
import re

def word_find(line, words):
    """Return the distinct tokens of `line` that also appear in `words`.

    `line` is split on whitespace; the result is a list without duplicates,
    in arbitrary (set) order.
    """
    tokens = set(line.split())
    wanted = set(words)
    return list(tokens & wanted)

def main(fn, words):
    """Print the number of every line in file `fn` that contains one of `words`.

    Args:
        fn: path of the text file to search.
        words: iterable of words to look for.

    For each line that shares at least one whitespace-separated token with
    `words`, the 1-based line number is printed followed by the matching
    tokens (joined without a separator, as in the original snippet).
    """
    # Iterate over the file line by line; enumerate supplies the line number
    # that the original tried to obtain by calling the word list.
    with open(fn, 'r') as handle:
        for lineno, line in enumerate(handle, start=1):
            common = word_find(line, words)
            if common:
                print(lineno, "".join(common))

# NOTE(review): `fn` and `words` are not defined at module level in the code
# shown here, so this call raises NameError unless they are defined elsewhere
# -- confirm.
if __name__ == '__main__':
  main(fn, words)

# NOTE(review): firstWords is only defined further down, so these calls fail
# with NameError when reached at this point; the same three calls are repeated
# after the definition.
print(firstWords('romeo_and_juliet_folger.txt'))
print(firstWords('shakespeare_sonnet_18.txt'))
print(firstWords('a_midsummer_nights_dream_folger.txt'))

def firstWords(fn):
    """Count how often each word occurs in the text file `fn`.

    Args:
        fn: path of the text file to read.

    Returns:
        A dict mapping each lower-cased, punctuation-stripped word to the
        number of times it occurs; an empty dict if the file has no words.
    """
    counts = {}
    # The context manager closes the file even if normalisation raises, and
    # keeps the `fn` parameter from being shadowed by the file object.
    with open(fn, 'r') as handle:
        for word in handle.read().split():
            # NOTE(review): removePunctuation is not defined in the visible
            # part of this file; it is expected to come from elsewhere.
            word = removePunctuation(word.lower())
            counts[word] = counts.get(word, 0) + 1
    # Return only after every word has been counted (the original returned
    # from inside the loop, after the first word).
    return counts

# Print the word-count dictionary for each sample text, in this order.
for play in ('romeo_and_juliet_folger.txt',
             'shakespeare_sonnet_18.txt',
             'a_midsummer_nights_dream_folger.txt'):
    print(firstWords(play))

You could start by scanning the text with the re module:

#!/usr/bin/env python
# -*-coding: utf8-*-
'''doc
'''
from __future__ import (absolute_import, division,
                        print_function, unicode_literals)

import io
import re
# Line patterns, all anchored at the start of the line:
#   act_re       -- "ACT <digits>" alone on the line (surrounding whitespace ok)
#   scene_re     -- "Scene <digits>" alone on the line (surrounding whitespace ok)
#   character_re -- one or more leading runs of two-or-more capital A-Z letters,
#                   each run optionally preceded by whitespace
act_re = re.compile(r'^\s*ACT\s+(\d+)\s*$')
scene_re = re.compile(r'^\s*Scene\s+(\d+)\s*$')
character_re = re.compile(r'^(\s*[A-Z]{2}[A-Z]*)+')

def scan(lines):
    '''Yield a token for every line that looks like an act, scene or speaker.

    `lines` is any iterable of text lines. Each line yields at most one tuple:
    ('ACT', n) or ('SCENE', n) with n the number as an int, or
    ('CHARACTER', text) with text being the matched capitalised prefix of the
    line. Lines matching none of the patterns are skipped.
    '''
    numbered = (('ACT', act_re), ('SCENE', scene_re))
    for line in lines:
        for label, pattern in numbered:
            found = pattern.match(line)
            if found:
                yield (label, int(found.group(1)))
                break
        else:
            # Neither an act nor a scene heading: try a speaker name.
            found = character_re.match(line)
            if found:
                yield ('CHARACTER', found.group(0))

def main(path='Rom.txt'):
    '''Print every token that scan() finds in the play text at `path`.

    Args:
        path: text file to scan, read as UTF-8. Defaults to 'Rom.txt', the
            file name the script was originally hard-wired to, so calling
            main() with no argument behaves as before.
    '''
    with io.open(path, 'r', encoding='utf8') as lines:
        # The open file is iterated line by line by scan().
        for token in scan(lines):
            print(token)

# Run the scan only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

""" my output:
...
('CHARACTER', 'NURSE')
('CHARACTER', 'JULIET')
('CHARACTER', 'NURSE')
('CHARACTER', 'JULIET')
('CHARACTER', 'NURSE')
('ACT', 2)
('SCENE', 1)
('CHARACTER', 'ROMEO')
('CHARACTER', 'BENVOLIO')
('CHARACTER', 'MERCUTIO')
('CHARACTER', 'BENVOLIO')
('CHARACTER', 'MERCUTIO')
('CHARACTER', 'BENVOLIO')
...
"""

Edited 4 Weeks Ago by Gribouillis