I am a programming and Python beginner and thought this would be a fun exercise. I wrote this script to mine web pages: first it finds all of the hrefs on a page, then it follows those URLs and searches the linked pages for content. It is by no means perfect. For one, it only follows hrefs. For another, when I search a page for content I have to supply an offset to locate the 'text' content, which is not always ideal. I know the code is long and few will read it all, but I was wondering whether anyone has a better approach?
''' Data Mining Script - pageminer.py'''
import re
import urllib2
from optparse import OptionParser
from tempfile import TemporaryFile
from urllib2 import HTTPError
from urlparse import urljoin, urlsplit

from BeautifulSoup import BeautifulSoup
from xlwt import Workbook, easyxf
# Whitespace-only strings — presumably filtered out when extracting page
# text (not used in this chunk; verify against the rest of the file).
EXCLUDES = [' ', '\n', '\r', '\t']
# Default filename for the generated Excel workbook.
OUTPUT = 'output.xls'
# xlwt cell style (header rows): grey fill, bold 8pt font, medium
# top/bottom borders. Height is in twentieths of a point (160 = 8pt).
STYLE6 = easyxf('pattern: pattern solid, fore_color grey25;'
'font: bold yes, height 160;'
'border: top medium, bottom medium')
''' Data Miner '''
def init(self, options, url):
    """Record the target *url* and parsed *options* on the instance,
    and derive the list of search keys from the comma-separated
    ``options.keys1`` string.
    """
    self.options = options
    self.url = url
    raw_keys = options.keys1
    self.keys = raw_keys.split(',')
def get_soup(self):
    """Fetch self.url and parse it with BeautifulSoup, then hand the
    parsed document to get_links() with the instance's search keys.

    HTTP failures are treated as best-effort: a page that cannot be
    fetched simply contributes no links.
    """
    opener = urllib2.build_opener()
    # Some servers refuse the default Python user agent, so present a
    # generic browser identity instead.
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    try:
        soup = BeautifulSoup(opener.open(self.url))
    except HTTPError:
        # NOTE(review): deliberately best-effort, but consider logging
        # the failed URL instead of swallowing the error silently.
        return
    self.get_links(doc=soup, keys=self.keys)

def get_links(self, doc, keys):
    """Collect hrefs that match any regex pattern in *keys* and pass
    the list of matches to self.build_links().

    Fixes vs. the posted version: the empty-list literal lost in the
    paste (``links =``) is restored, and a link whose href matches
    several keys is now collected once rather than once per key.
    """
    links = [link['href']
             for link in doc.findAll('a', href=True)
             if any(re.search(key, link['href']) for key in keys)]
    self.build_links(links)

# (The remainder of the class was truncated in the paste.)