I am trying to get some data off a Brazilian government website.

The data is accessible through a form with some javascript. I am able to get the form and fill it out, but have trouble submitting it (a button needs to be clicked). I am using the library mechanize (which includes clientform) but of course would be happy to try others.

Below is the website and the code so far. Any help or pointers would be highly appreciated.

Here is the website:
http://www.tse.gov.br/spce2008DivHtml/pesquisaCandidato.jsp

And here is my code so far:

import mechanize, urllib, urllib2

# Start Browser
br = mechanize.Browser(factory=mechanize.RobustFactory())

# User-Agent (this is cheating, ok?)
# The header value has to be one single-line string literal; a raw line
# break inside the quotes is a syntax error.
br.addheaders = [('User-agent',
                  'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.6')]
# already choose state here for now (sgUf=AC in the query string)
br.open('http://www.tse.gov.br/spce2008DivHtml/pesquisaCandidato.jsp?sgUf=AC')
html = br.response().read()
# print html

# Select the form
br.select_form(nr=0)  # since there is only 1 form on the site
print(br.form)

# get all values for states and munis
# to eventually loop over them
states = br.form.possible_items("sgUf")
munis = br.form.possible_items("sgUe")

# Enter info in web form
br.form.set_all_readonly(False)             # make all form items changeable
#br.form['acao'] = 'Pesquisar'              # send action "pesquisar"??
br.form.set(True, states[1], "sgUf")        # state
br.form.set(True, munis[1], "sgUe")         # municipality
br.form.set(True, "11", "candidatura")      # post (prefeito-11 or vereador-13)
br.form.set(True, "2", "parcial")           # parcial 1 or 2 (choose 2)

# Submit the form  -- does not work yet
request2 = br.form.click()
#request2 = br.submit()

try:
    response2 = urllib2.urlopen(request2)
except urllib2.HTTPError as err:
    # The error object is used as the response below (geturl/info/read),
    # so an HTTP error status still gets printed instead of aborting.
    response2 = err
print(response2.geturl())
print(response2.info())  # headers
print(response2.read())  # body

Thanks for any help in advance!
-Thomas

Recommended Answers

All 2 Replies

I am trying to get some data off a Brazilian government website.

The data is accessible through a form with some javascript. I am able to get the form and fill it out, but have trouble submitting it (a button needs to be clicked). I am using the library mechanize (which includes clientform) but of course would be happy to try others.

Below is the website and the code so far. Any help or pointers would be highly appreciated.

Here is the website:
http://www.tse.gov.br/spce2008DivHtml/pesquisaCandidato.jsp

And here is my code so far:

import mechanize, urllib, urllib2

# Start Browser
br = mechanize.Browser(factory=mechanize.RobustFactory())

# User-Agent (this is cheating, ok?)
# The header value has to be one single-line string literal; a raw line
# break inside the quotes is a syntax error.
br.addheaders = [('User-agent',
                  'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.6')]
# already choose state here for now (sgUf=AC in the query string)
br.open('http://www.tse.gov.br/spce2008DivHtml/pesquisaCandidato.jsp?sgUf=AC')
html = br.response().read()
# print html

# Select the form
br.select_form(nr=0)  # since there is only 1 form on the site
print(br.form)

# get all values for states and munis
# to eventually loop over them
states = br.form.possible_items("sgUf")
munis = br.form.possible_items("sgUe")

# Enter info in web form
br.form.set_all_readonly(False)             # make all form items changeable
#br.form['acao'] = 'Pesquisar'              # send action "pesquisar"??
br.form.set(True, states[1], "sgUf")        # state
br.form.set(True, munis[1], "sgUe")         # municipality
br.form.set(True, "11", "candidatura")      # post (prefeito-11 or vereador-13)
br.form.set(True, "2", "parcial")           # parcial 1 or 2 (choose 2)

# Submit the form  -- does not work yet
request2 = br.form.click()
#request2 = br.submit()

try:
    response2 = urllib2.urlopen(request2)
except urllib2.HTTPError as err:
    # The error object is used as the response below (geturl/info/read),
    # so an HTTP error status still gets printed instead of aborting.
    response2 = err
print(response2.geturl())
print(response2.info())  # headers
print(response2.read())  # body

Thanks for any help in advance!
-Thomas

Never used that module, but this is how I post back data:

import urllib, urllib2, cookielib
 
class WebForm:
    """Small urllib2 helper: build a cookie-aware opener, then GET or POST."""

    def __init__(self):
        pass

    def Opener(self, ref):
        """Create an opener that stores cookies and sends spoofed headers.

        ref is sent as the Referer header and is also kept on
        self.refrence (spelling kept so existing callers keep working).
        Returns the opener, which is stored on self.opener as well.
        """
        self.refrence = ref
        cj = cookielib.CookieJar()
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        # Assign the header list instead of appending to it: the opener
        # starts out with its own Python-urllib User-agent entry, and
        # urllib2 does not add a header the request already carries, so an
        # appended User-agent would never replace the default one.
        self.opener.addheaders = [('User-agent', 'Mozilla/4.0'),
                                  ('Referer', ref)]
        return self.opener

    def GET(self, opnr, url):
        """Fetch url through opnr with no data argument; return the body."""
        get_req = opnr.open(url)
        return get_req.read()

    def POST(self, opnr, url, data):
        """Send data to url through opnr and return the response body.

        data is either a dictionary like login_data (url-encoded here) or
        a string that has already been url-encoded (sent unchanged).
        """
        # Encoding an already-encoded string again would make urlencode()
        # raise TypeError, so strings are passed straight through.
        if isinstance(data, (bytes, type(u""))):
            enData = data
        else:
            enData = urllib.urlencode(data)
        post_req = opnr.open(url, enData)
        return post_req.read()
 
def main():
    """Example: POST a small form to a placeholder site and save the reply."""
    s = WebForm()

    # Keep the fields as a plain dict: WebForm.POST url-encodes a dict
    # itself, so there is no need to encode it here first.
    postData = {'entryName': 'value', 'submit': 'Check'}

    url_open = s.Opener('http://www.somesite.com/index.html')
    page = s.POST(url_open, 'http://www.somesite.com/index.html', postData)
    # The with-block closes the file even if the write fails.
    with open('LogInTest.html', 'w') as f:
        f.write(page)


if __name__ == "__main__":
    main()

The postData came from looking at the source of a page.
example:

<form id='loginform' method='post' action='index.php'>
<div style="text-align: center;">
Username<br />
<input type='text' name='user_name' class='textbox' style='width:100px' /><br />
Password<br />

<input type='password' name='user_pass' class='textbox' style='width:100px' /><br />
<input type='checkbox' name='remember_me' value='y' />Remember Me<br /><br />
<input type='submit' name='login' value='Login' class='button' /><br />

Its postData would look like:

postData = urllib.urlencode({'user_name':'MyNickName','user_pass':'MyPassword','login':'Login'})

It looks like the browser emulation is overkill. In the end, the following script worked:

Python code: Using mechanize

import mechanize, urllib, urllib2

# Page that carries the search form, and the servlet the search posts to.
SEARCH_URL = 'http://www.tse.gov.br/spce2008DivHtml/pesquisaCandidato.jsp'
SERVLET_URL = 'http://www.tse.gov.br/spce2008DivHtml/candidatoServlet.do'

# Start Browser
br = mechanize.Browser(factory=mechanize.RobustFactory())

# User-Agent (this is cheating, ok?)
br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.6')]
# Open the search page without a state; it is only used to list the states.
br.open(SEARCH_URL)

# Select the form
br.select_form(nr=0)  # since there is only 1 form on the site

# get all values for states to loop over them
# ([1:] skips the first entry -- presumably the empty placeholder; confirm)
states = br.form.possible_items("sgUf")[1:]

# Fields that are the same for every request are set once, up front.
params = dict(acao='pesquisar', dsNrTituloEleitor='dsNrTituloEleitor')
params['candidatura'] = "11"
params['parcial'] = "2"

for state in states:
    # Re-open the page for this state so the form lists its municipalities.
    br.open(SEARCH_URL + '?sgUf=' + state)
    br.select_form(nr=0)
    munis = br.form.possible_items("sgUe")[1:]
    params['sgUf'] = state
    for muni in munis:
        params['sgUe'] = muni
        # Post straight to the servlet with urllib2; mechanize is only used
        # above to read the option lists out of the form.
        try:
            response2 = urllib2.urlopen(SERVLET_URL, urllib.urlencode(params))
        except urllib2.HTTPError as err:
            # The error object is used as the response below, so an HTTP
            # error status is printed and the loop carries on.
            response2 = err
        print(response2.geturl())
        print(response2.info())  # headers
        print(response2.read())  # body

Thank you for the response. Problem solved.

Be a part of the DaniWeb community

We're a friendly, industry-focused community of developers, IT pros, digital marketers, and technology enthusiasts meeting, networking, learning, and sharing knowledge.