0

I want to create a method in Python that reads all of the text on a webpage and writes it to a text file. Eventually I want it to filter out hyperlinks by excluding any lines of the text that contain href=".....".

Any help would be greatly appreciated.

2
Contributors
1
Reply
2
Views
9 Years
Discussion Span
Last Post by vegaseat
0

One of the simpler ways to do this is to use the Python module HTMLParser ...

# extract text from HTML code of a web site

import urllib2
import HTMLParser
import cStringIO

class HTML2Text(HTMLParser.HTMLParser):
    """
    Extract the visible text from HTML code.

    Feed HTML to the parser with feed() and collect the accumulated
    text with get_text().  A newline is emitted for every <br> tag and
    at the end of every paragraph (</p>).  The contents of <script> and
    <style> elements are source code rather than page text, so they
    are left out of the result.
    """

    # elements whose character data must not appear in the output
    _IGNORED_TAGS = ('script', 'style')

    def __init__(self):
        # explicit base-class call, matching how the parser is imported
        HTMLParser.HTMLParser.__init__(self)
        # buffer collecting the extracted text
        self.output = cStringIO.StringIO()
        # number of currently open <script>/<style> elements
        self._ignored_depth = 0

    def get_text(self):
        """Return all the text collected so far as one string."""
        return self.output.getvalue()

    def handle_starttag(self, tag, attrs):
        """Start tag: begin skipping ignored elements, break on <br>."""
        if tag in self._IGNORED_TAGS:
            self._ignored_depth += 1
        elif tag == 'br':
            # Need to put a new line in
            self.output.write('\n')

    def handle_data(self, data):
        """Character data: keep it unless inside an ignored element."""
        if not self._ignored_depth:
            self.output.write(data)

    def handle_endtag(self, tag):
        """End tag: stop skipping ignored elements, break after </p>."""
        if tag in self._IGNORED_TAGS:
            # guard against a stray closing tag without an opening one
            if self._ignored_depth:
                self._ignored_depth -= 1
        elif tag == 'p':
            # end of paragraph. Add newline.
            self.output.write('\n')


# test it ...
if __name__ == '__main__':
    # Demo: download a page, extract its text and print it with blank
    # lines and surrounding white space removed.
    urlStr = 'http://www.python.org/'
    try:
        fileHandle = urllib2.urlopen(urlStr)
        try:
            html = fileHandle.read()
        finally:
            # close the connection even if reading fails part way
            fileHandle.close()
    except IOError:
        print('Cannot open URL %s for reading' % urlStr)
        # without the page there is nothing to parse; stop here instead
        # of failing later with a NameError on the unset 'html'
        raise SystemExit(1)

    #print html  # test only

    print('-' * 50)
    print('Text extracted from HTML code of URL = %s' % urlStr)
    print('-' * 50)

    p = HTML2Text()
    p.feed(html)
    text = p.get_text()
    # remove all the empty lines and leading/trailing white spaces from
    # the raw extracted text, adding the newline character back to each
    # line that is kept
    stripped_lines = (line.strip() for line in text.splitlines())
    clean_text = "".join(line + '\n' for line in stripped_lines if line)
    print(clean_text)
This question has already been answered. Start a new discussion instead.
Have something to contribute to this discussion? Please be thoughtful, detailed and courteous, and be sure to adhere to our posting rules.