On line 72 of the code I do a findAll to retrieve all 'a' tags that have a 'horariosCarteleraUnderline' class and whose href URL contains ?ic=[code]&, where code is a common code used to identify the movie start time.

It should retrieve all movie times, but it misses some of them.

This is the full code:

from urllib2 import urlopen
from BeautifulSoup import BeautifulSoup
import re

class cuapi():
  def __init__(self):
    self.url = ""
    self.urlCartelera = ""
    self.citiesid = "ctl00_ddlCiudad"
    self.soup = 0
    self.currentUrl = ''
  def cureHTML(self):
    #print "occurrence found !"
    return 'target="_blank"'

  def getSoup(self, url):
    '''opens the url given and using the html in it makes a soup, this soup is
    returned as a beautifulsoup object'''
    if url != self.currentUrl:
      print "new url %s" % url
      self.currentUrl = url
      page = urlopen(url)
      html =
      myMassage = [(re.compile(r'target\"_blank\"'), self.cureHTML())]
      self.soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES, markupMassage=myMassage)
    return self.soup

  def getCities(self):
    '''Returns a dictionary, where the cities names are the keys, and the value
    is a city code used to display the correct schedule and movie listing'''
    soup = self.getSoup(self.url)
    citiesCombo = soup.findAll('select',{'id':self.citiesid})[0]
    cities = citiesCombo.findAll("option")
    citiesDict = {}
    for city in cities:
      name = city.contents[0]
      value = city.attrs[0][1]
      citiesDict[name] = value 
      #print "Agregada la ciudad de %s con clave %s " % (name,value)
    for city in sorted(citiesDict.keys()):
      print city
    return citiesDict

  def getComplejos(self, cityCode='43'):
    '''give a city code and it will return a dictionary where the complejos are
    the keys and the value is an ID created to distinguish between them'''
    #page = urlopen(self.urlCartelera + str(cityCode))
    soup = self.getSoup(self.urlCartelera + str(cityCode))
    complejos = soup.findAll('span',{'class':'TitulosBlanco'})
    compDict = {}

    n = 0
    for comp in complejos:
      name = comp.contents[0]
      compDict[name] = n
      #print "Complejo %s agregado áé" % name
      n += 1
    print compDict
    return compDict

  def getHorarios(self,cityCode='43', compCode='152'):
    #TODO: get html
    soup = self.getSoup(self.urlCartelera + str(cityCode))
    movies = soup.findAll('a',{'class':'peliculaCartelera'})
    moviesDict = {}
    for movie in movies:
      #qp es la pelicula en cuestion, aqui deberias iterar
      name = movie.contents[0]
      #print name
      papi = movie.findParent().findParent().findParent()
      regex = re.compile("\?ic=%s&" % compCode)
      hor = papi.findAll('a',{'class':'horariosCarteleraUnderline','href':regex})
      #print len(hor)
      if len(hor) > 0:
        moviesDict[name] = [h.string for h in hor]
        print name, moviesDict[name]
    #for mov in moviesDict.keys():
      #print mov,moviesDict[mov]

c = cuapi()  # module-level instance, created at import time for interactive use

The first movie is OK, but the second one, which only returns 4 times, should be returning 5. It tends to miss 1 or 2 of the first ones.


new url [url][/url]
¿Qué Pasó Ayer? 2 Dob [u'5:45pm', u'7:50pm', u'9:55pm']
<---- this down here should have 5 ---->
El Defensor [u'3:00pm', u'5:25pm', u'8:00pm', u'10:30pm']
Kung Fu Panda 2 Dig 3D Dob [u'4:00pm', u'6:10pm', u'8:20pm', u'10:25pm']
Kung Fu Panda 2 Dob [u'3:10pm', u'5:10pm', u'7:20pm', u'9:20pm']
La Chica de la Capa Roja [u'4:50pm', u'6:55pm']
La Noche del Demonio [u'3:50pm', u'6:00pm', u'8:10pm', u'10:20pm']
Nunca Me Abandones [u'4:10pm', u'6:20pm', u'8:30pm', u'10:40pm']
Piratas del Caribe 4 Dob [u'2:45pm']
X-Men: Primera Generación  Dob [u'4:30pm', u'7:20pm', u'10:05pm']
X-Men: Primera Generación  Sub [u'9:00pm']

I was wrong; I was comparing my results with a page that was loaded this morning, so of course my results will differ.

So here's some advice for everyone starting with web apps: always refresh the page before doing a comparison.


This question has already been answered. Start a new discussion instead.