My code is:
import re import urllib import urllib2 webURL="http://www.sc.iitb.ac.in/~bijnan/personal-details.htm" #the website is connect=urllib.urlopen(webURL) #connect to this website htmlDoc=connect.read()#get the html document from this website patternIN="Permanent Address" # Where to begin to keep the text patternOUT="</tr>" # Where to end to keep the text (after the begining) keepText=False # Do we keep the text ? address="" # We init the address # Now, we read the file to keep the text for line in htmlDoc: if keepText: address+=line.strip() # We store the line, stripping the \n if patternOUT in line: # Next line won't be kept any more keepText=False if patternIN in line: # Starting from next line, we keep the text keepText=True # Now, it's time to clean all this rTags=re.compile("<.*?>") # the regexp to recognise any tag address=rTags.sub(":", address) # we replace the tags with ":" (I could have chosen anything else, # especially if there is some ":" in the address rSep=re.compile(":+") # Now, we replace any number of ":" with a \n address=rSep.sub("\n", address) print address
For line 15...whats wrong there?why i cannot do the for loop in the html file?