Data Scraping Using Urllib With Multiple Option Select Param

Question

blahman 0 Newbie Poster

13 Years Ago

I am running into an issue with scraping data. If I hardcode the value for the key 'lboRace' in the code below, it is able to scrape the data, but if I set the key 'lboRace' to a variable that is read in from a file, it doesn't seem to scrape the data correctly. I tried adding a time delay to slow the requests down, but that doesn't seem to be the issue. Would I use threading to solve this problem?
Thanks!

import urllib.parse
import urllib.request
import csv
import time

def parseTable(html):
    """Collect the cell text of every table row found in an HTML string.

    The string is scanned tag by tag. Only the first word of each tag
    (lower-cased) is examined, so attributes are ignored. Text between tags
    is accumulated while the scanner is inside a table, a row and a cell.

    Args:
        html: The HTML document as a str.

    Returns:
        A list of rows. Each row is a list of strings, one per <td>/<th>
        cell, with leading and trailing whitespace removed. A row or cell
        that is never closed by </tr> or </td>/</th> is not included.
    """
    #Each "row" of the HTML table will be a list, and the items
    #in that list will be the TD data items.
    ourTable = []

    #We keep these set to None when not actively building a
    #row of data or a data item.
    ourTD = None    #Stores one table data item
    ourTR = None    #List to store each of the TD items in.

    #State we keep track of
    inTable = False
    inTR = False
    inTD = False

    #Start looking for a start tag at the beginning!
    tagStart = html.find("<", 0)

    while tagStart != -1:
        tagEnd = html.find(">", tagStart)

        if tagEnd == -1:    #We are done, return the data!
            return ourTable

        #Only look at the text immediately following the <
        tagList = html[tagStart+1:tagEnd].split()
        #An empty tag such as "<>" has no name; treat it as an unknown
        #tag instead of failing on tagList[0].
        tag = tagList[0].lower() if tagList else ""

        #Watch out for TABLE (start/stop) tags!
        if tag == "table":      #We entered the table!
            inTable = True
        elif tag == "/table":   #We exited a table.
            inTable = False

        #Detect/Handle Table Rows (TR's)
        elif tag == "tr":
            inTR = True
            ourTR = []      #Started a new Table Row!

        #If we are at the end of a row, add the data we collected
        #so far to the main list of table data.
        elif tag == "/tr":
            inTR = False
            #A </tr> with no matching <tr> has nothing to add; appending
            #None here would break callers that iterate over each row.
            if ourTR is not None:
                ourTable.append(ourTR)
            ourTR = None

        #We are starting a Data item!
        elif tag == "td" or tag == "th":
            inTD = True
            ourTD = ""      #Start with an empty item!

        #We are ending a data item!
        elif tag == "/td" or tag == "/th":
            inTD = False
            if ourTD is not None and ourTR is not None:
                ourTR.append(ourTD.strip())     #Remove extra spaces
            ourTD = None

        #Look for the NEXT start tag. Anything between the current
        #end tag and the next Start Tag is potential data!
        tagStart = html.find("<", tagEnd+1)

        #If we are in a Table, and in a Row and also in a TD,
        #save anything that's not a tag! (between tags)
        #
        #Note that this may happen multiple times if the table
        #data has tags inside of it!
        #e.g. <td>some <b>bold</b> text</td>
        #
        #Because of this, we need to be sure to put a space between each
        #item that may have tags separating them. We remove any extra
        #spaces (above) before we append the ourTD data to the ourTR list.
        if inTable and inTR and inTD:
            #When no further tag exists, take the rest of the string
            #rather than slicing up to index -1.
            textEnd = tagStart if tagStart != -1 else len(html)
            ourTD = ourTD + html[tagEnd+1:textEnd] + " "

    #If we end the while loop looking for the next start tag, we
    #are done, return ourTable of data.
    return ourTable



# Driver: for every race name listed in the input file, POST the election
# history form and write the table(s) found in the response to a CSV file.
url = "http://elections.sos.state.tx.us/elchist.exe"

values = {'election' : "1992 Democratic Primary Election",
        'lboRace' : "",
        'btnSubmit' : "Submit"}

with open('1992DemocraticPrimaryElection.txt', 'r') as files:
    for line in files:
        # Lines read from a file keep their trailing newline. Sending it as
        # part of the form value makes the request differ from the
        # hard-coded one, so strip it before posting.
        # NOTE(review): surrounding double quotes are stripped as well,
        # mirroring the file-name clean-up below - confirm the input format.
        race = line.strip().strip('"').strip()
        if not race:
            continue    # skip blank lines instead of posting an empty race

        values['lboRace'] = race
        for k, v in values.items():
            print(k, v)

        # File-name fragment: the race name without spaces, newlines, quotes.
        linenew = line.replace(' ', '').replace('\n', '').replace('"', '')

        data = urllib.parse.urlencode(values).encode('ascii')
        req = urllib.request.Request(url, data)

        with urllib.request.urlopen(req) as response:
            html_bytes = response.read()
            charset = response.headers.get_content_charset() or 'iso-8859-1'

        # str(html_bytes) would produce the "b'...'" repr with escape
        # sequences spelled out as text; decode the bytes instead.
        html = html_bytes.decode(charset, errors='replace')

        dataTable = parseTable(html)

        # newline='' lets the csv module control line endings itself.
        with open('1992DemocraticPrimaryElection.'+linenew+'.csv', 'w',
                  newline='') as file:
            writer = csv.writer(file)
            for item in dataTable:
                writer.writerow(item)

python

Edited 13 Years Ago by Ezzaral because: Code formatting.

1 Contributor
1 Reply
542 Views
3 Days Discussion Span
Latest Post 13 Years Ago Latest Post by blahman

Reply to this topic

Be a part of the DaniWeb community

We're a friendly, industry-focused community of developers, IT pros, digital marketers, and technology enthusiasts meeting, networking, learning, and sharing knowledge.

blahman 0 Newbie Poster · Answer 1 · 2012-04-01T20:00:19+00:00

So it seems to be a problem with response.read(). Any idea why it would only read part of the HTML data? It works fine if I do just one request at a time. I also put some of the code into a function, just in case, so that the variables are reset:

Code blocks are created by indenting at least 4 spaces
... and can span multiple lines

import urllib.parse

import urllib.request
import csv
import time
import http.cookiejar

def parseTable(html):
    """Collect the cell text of every table row found in an HTML string.

    The string is scanned tag by tag. Only the first word of each tag
    (lower-cased) is examined, so attributes are ignored. Text between tags
    is accumulated while the scanner is inside a table, a row and a cell.

    Args:
        html: The HTML document as a str.

    Returns:
        A list of rows. Each row is a list of strings, one per <td>/<th>
        cell, with leading and trailing whitespace removed. A row or cell
        that is never closed by </tr> or </td>/</th> is not included.
    """
    #Each "row" of the HTML table will be a list, and the items
    #in that list will be the TD data items.
    ourTable = []

    #We keep these set to None when not actively building a
    #row of data or a data item.
    ourTD = None    #Stores one table data item
    ourTR = None    #List to store each of the TD items in.

    #State we keep track of
    inTable = False
    inTR = False
    inTD = False

    #Start looking for a start tag at the beginning!
    tagStart = html.find("<", 0)

    while tagStart != -1:
        tagEnd = html.find(">", tagStart)

        if tagEnd == -1:    #We are done, return the data!
            return ourTable

        #Only look at the text immediately following the <
        tagList = html[tagStart+1:tagEnd].split()
        #An empty tag such as "<>" has no name; treat it as an unknown
        #tag instead of failing on tagList[0].
        tag = tagList[0].lower() if tagList else ""

        #Watch out for TABLE (start/stop) tags!
        if tag == "table":      #We entered the table!
            inTable = True
        elif tag == "/table":   #We exited a table.
            inTable = False

        #Detect/Handle Table Rows (TR's)
        elif tag == "tr":
            inTR = True
            ourTR = []      #Started a new Table Row!

        #If we are at the end of a row, add the data we collected
        #so far to the main list of table data.
        elif tag == "/tr":
            inTR = False
            #A </tr> with no matching <tr> has nothing to add; appending
            #None here would break callers that iterate over each row.
            if ourTR is not None:
                ourTable.append(ourTR)
            ourTR = None

        #We are starting a Data item!
        elif tag == "td" or tag == "th":
            inTD = True
            ourTD = ""      #Start with an empty item!

        #We are ending a data item!
        elif tag == "/td" or tag == "/th":
            inTD = False
            if ourTD is not None and ourTR is not None:
                ourTR.append(ourTD.strip())     #Remove extra spaces
            ourTD = None

        #Look for the NEXT start tag. Anything between the current
        #end tag and the next Start Tag is potential data!
        tagStart = html.find("<", tagEnd+1)

        #If we are in a Table, and in a Row and also in a TD,
        #save anything that's not a tag! (between tags)
        #
        #Note that this may happen multiple times if the table
        #data has tags inside of it!
        #e.g. <td>some <b>bold</b> text</td>
        #
        #Because of this, we need to be sure to put a space between each
        #item that may have tags separating them. We remove any extra
        #spaces (above) before we append the ourTD data to the ourTR list.
        if inTable and inTR and inTD:
            #When no further tag exists, take the rest of the string
            #rather than slicing up to index -1.
            textEnd = tagStart if tagStart != -1 else len(html)
            ourTD = ourTD + html[tagEnd+1:textEnd] + " "

    #If we end the while loop looking for the next start tag, we
    #are done, return ourTable of data.
    return ourTable

def scrape(values, line):
    """POST one form request and save the tables in the response as CSV.

    Args:
        values: Mapping of form field names to values; it is URL-encoded
            and sent as the POST body exactly as given.
        line: The race name as read from the input file. With spaces,
            newlines and double quotes removed it becomes part of the
            output file name
            '1992DemocraticPrimaryElection.<name>.csv'.

    Returns:
        None. The rows returned by parseTable are written to the CSV file
        and the decoded HTML is printed.
    """
    url = "http://elections.sos.state.tx.us/elchist.exe"

    # File-name fragment: the race name without spaces, newlines, quotes.
    linenew = line.replace(' ', '').replace('\n', '').replace('"', '')

    data = urllib.parse.urlencode(values).encode('ascii')
    req = urllib.request.Request(url, data)

    with urllib.request.urlopen(req) as response:
        html_bytes = response.read()
        charset = response.headers.get_content_charset() or 'iso-8859-1'

    # str(html_bytes) would produce the "b'...'" repr with escape sequences
    # spelled out as text; decode the bytes instead.
    html = html_bytes.decode(charset, errors='replace')
    print(html)

    dataTable = parseTable(html)

    # newline='' lets the csv module control line endings itself.
    with open('1992DemocraticPrimaryElection.'+linenew+'.csv', 'w',
              newline='') as file:
        writer = csv.writer(file)
        for item in dataTable:
            writer.writerow(item)

# Driver: call scrape() once for every race name listed in the input file.
values = {'election' : "1992 Democratic Primary Election",
          'lboRace' : "",
          'btnSubmit' : "Submit"}

with open('1992DemocraticPrimaryElection.txt', 'r') as files:
    for line in files:
        # Lines read from a file keep their trailing newline. Sending it as
        # part of the form value makes the request differ from the
        # hard-coded one, so strip it before posting.
        # NOTE(review): surrounding double quotes are stripped as well,
        # mirroring the file-name clean-up in scrape() - confirm the
        # input format.
        race = line.strip().strip('"').strip()
        if not race:
            continue    # skip blank lines instead of posting an empty race

        values['lboRace'] = race
        for k, v in values.items():
            print(k, v)
        scrape(values, line)