I want to create a method in Python that will read all of the text on a webpage and put it into a text file. Eventually I will have it distinguish between hyperlinks by having it exclude lines in the text containing href="....."

Any help would be greatly appreciated.

One of the simpler ways to do this is to use the Python module HTMLParser ...

# extract text from HTML code of a web site

import urllib2
import HTMLParser
import cStringIO

class HTML2Text(HTMLParser.HTMLParser):
    """Extract the plain text from HTML code.

    Feed markup with feed(), then call get_text() to obtain the text
    collected so far. A newline is written for every <br> tag and at
    the end of every paragraph (</p>); all other tags are dropped.
    """

    def __init__(self):
        # The base class initializer sets up the parser state that
        # feed() relies on. Called explicitly (not through super())
        # to match the HTMLParser.HTMLParser base named above.
        HTMLParser.HTMLParser.__init__(self)
        self.output = cStringIO.StringIO()

    def get_text(self):
        """Return all the text written to the output buffer so far."""
        return self.output.getvalue()

    def handle_starttag(self, tag, attrs):
        """Handle <br> tags by writing a line break."""
        if tag == 'br':
            # Need to put a new line in
            self.output.write('\n')

    def handle_data(self, data):
        """Write normal text (the data between tags) to the output."""
        self.output.write(data)

    def handle_endtag(self, tag):
        """Handle </p> tags by writing a line break."""
        if tag == 'p':
            # end of paragraph. Add newline.
            self.output.write('\n')

# test it ...
if __name__ == '__main__':
    # Demo: download one page, extract its text and print the result.
    urlStr = 'http://www.python.org/'
    try:
        fileHandle = urllib2.urlopen(urlStr)
        try:
            html = fileHandle.read()
        finally:
            # Release the connection even if read() fails
            fileHandle.close()
    except IOError:
        # urllib2.URLError is a subclass of IOError, so network errors
        # are reported here as well.
        print('Cannot open URL %s for reading' % urlStr)
        # Continue with an empty document so 'html' is always defined
        html = ''

    print('-' * 50)
    print('Text ectracted from HTML code of URL = %s' % urlStr)
    print('-' * 50)

    p = HTML2Text()
    p.feed(html)
    # close() makes the parser process any input it is still buffering
    p.close()
    text = p.get_text()

    # remove all the empty lines and leading/trailing white spaces from
    # the raw extracted text, add back the newline character to each line
    stripped_lines = (line.strip() for line in text.splitlines())
    clean_text = "".join(line + '\n' for line in stripped_lines if line)
    print(clean_text)
Be a part of the DaniWeb community

We're a friendly, industry-focused community of developers, IT pros, digital marketers, and technology enthusiasts meeting, learning, and sharing knowledge.