Python also has an HTMLParser module that can help you a great deal:
# extract a specified text from web page HTML source code
import urllib2
import HTMLParser
import cStringIO # acts like file in memory
class HTML2Text(HTMLParser.HTMLParser):
    """
    extract text from HTML code basically using inherited
    class HTMLParser and some additional custom methods

    Text found between tags is collected in an in-memory buffer.
    A newline is written for every <br> start tag and for every </p>
    end tag; all other tags are dropped.  Feed markup through the
    inherited feed() method, then read the result with get_text().

    NOTE(review): no handler for character/entity references
    (e.g. &amp;) is defined here, so whether they reach the output
    depends on the HTMLParser version in use -- confirm.
    """

    def __init__(self):
        """set up the parser and an empty output buffer"""
        # explicit base-class call, exactly as the original code does it
        HTMLParser.HTMLParser.__init__(self)
        # buffer that accumulates the extracted text
        self.output = cStringIO.StringIO()

    def get_text(self):
        """get the text output"""
        return self.output.getvalue()

    def handle_starttag(self, tag, attrs):
        """handle <br> tags; every other start tag is ignored"""
        if tag == 'br':
            # need to put one new line in
            self.output.write('\n')

    def handle_data(self, data):
        """normal text, copied to the output unchanged"""
        self.output.write(data)

    def handle_endtag(self, tag):
        """handle </p> tags; every other end tag is ignored"""
        if tag == 'p':
            # end of paragraph add newline
            self.output.write('\n')
def extract(html, sub1, sub2):
    """
    extract string from text between first
    occurrences of substrings sub1 and sub2

    html -- the text to search
    sub1 -- start marker; the result begins right after its first
            occurrence
    sub2 -- end marker; the result stops right before its first
            occurrence following sub1

    Missing markers are not an error: if sub1 is absent the result
    starts at the beginning of html, and if sub2 is absent it runs
    to the end of html.
    """
    # [-1] selects the text after sub1, or all of html when sub1 is absent
    after_start = html.split(sub1, 1)[-1]
    # [0] selects the text before sub2, or everything when sub2 is absent
    return after_start.split(sub2, 1)[0]
# you may need to update this web page for your needs
url = 'http://www.bom.gov.au/products/IDN10060.shtml#HUN'
# get the raw HTML code
try:
file_handle = urllib2.urlopen(url)
html1 = file_handle.read()
file_handle.close()
print '-'*70
print 'Data from URL =', url
except IOError:
print 'Cannot open URL %s for reading' % url
html1 = 'error!'
#print '-'*70; print html1 # testing
# extract code between sub1 and sub2
# you may need to update sub1 and sub2 for your needs
sub1 = 'www.bom.gov.au/weather/nsw</a></P><P>'
sub2 = 'The next routine forecast'
html2 = extract(html1, sub1, sub2)
#print '-'*70; print html2 # testing
# remove HTML tags to give clean text
p = HTML2Text()
p.feed(html2)
text = p.get_text()
print '-'*70
print text
print '-'*70
You can …