Gribouillis 1,391 Programming Explorer Team Colleague

The following program downloads the Python programs contained in a thread of the Python forum. Just start the program: it will prompt you for the thread number and create a directory with the code extracted from the thread. I used it to download all the wx examples. Note that the algorithm is very primitive: only the code written in language tags is extracted (raw code is not), and data attached to posts is not downloaded. It is also not robust: if the formatting on DaniWeb changes tomorrow, it won't work anymore :)

#!/usr/bin/env python
# danidown.py
from htmllib import HTMLParser 
from formatter import AbstractFormatter ,AbstractWriter ,NullWriter 
import re 
from os import mkdir 
from os .path import isdir ,join as pjoin 
from urllib2 import urlopen 

class aThread(object):
    """One forum thread whose code listings can be saved to disk.

    The thread is identified by its number.  Each page is fetched over
    HTTP and fed through ``HTMLParser`` with a ``Writer1`` instance
    collecting ``(reply number, author, code)`` triples; ``doDownload``
    writes those triples below a ``thread<number>`` folder.
    """

    # Pager text such as "Page 1 of 7"; group 1 is the total page count.
    itsCntPattern = re.compile(r"Page \d+ of (\d+)")

    def __init__(self, theThreadNumber):
        """Store the thread number and reset the download bookkeeping."""
        self.itsNumber = theThreadNumber
        self._itsPageCnt = None   # page-count cache, filled on first access
        self.itsReply = 0         # highest reply number handled so far
        self.itsCodeIndex = 0     # listings written since the last new reply

    def itsUrl(self, thePage=1):
        """Return the URL of page ``thePage``; page 1 has no page suffix."""
        theSuffix = "" if thePage == 1 else "-%d" % thePage
        return "http://www.daniweb.com/forums/thread%d%s.html" % (
            self.itsNumber, theSuffix)

    def itsContent(self, thePage=1):
        """Fetch page ``thePage`` and return the raw response body."""
        theResponse = urlopen(self.itsUrl(thePage))
        # try/finally so the connection is released even when read() fails
        # (urllib2 responses cannot be used in a ``with`` statement).
        try:
            return theResponse.read()
        finally:
            theResponse.close()

    @property
    def itsPageCnt(self):
        """Number of pages in the thread, fetched once and then cached.

        Falls back to 1 when page 1 carries no "Page x of y" text.  The
        cache is only filled after a successful fetch, so a failed
        download is retried on the next access instead of leaving a
        bogus count of 1 behind.
        """
        if self._itsPageCnt is None:
            theMatch = self.itsCntPattern.search(self.itsContent(1))
            if theMatch is None:
                self._itsPageCnt = 1
            else:
                self._itsPageCnt = int(theMatch.group(1))
        return self._itsPageCnt

    @property
    def itsTriples(self):
        """Yield ``(reply number, author, code)`` for every listing found.

        Pages are processed in order, each with a fresh ``Writer1``.
        """
        theCnt = self.itsPageCnt
        printMessage("The thread contains %d pages..." % theCnt)
        for i in xrange(1, theCnt + 1):
            printMessage("Page %d..." % i)
            theWriter = Writer1()
            theParser = HTMLParser(AbstractFormatter(theWriter))
            theParser.feed(self.itsContent(i))
            theParser.close()
            for theTriple in theWriter.itsTriples:
                yield theTriple

    @property
    def itsFolder(self):
        """Name of the folder that receives the whole thread."""
        return "thread%d" % self.itsNumber

    def itsReplyFolder(self, n):
        """Return the folder, inside the thread folder, used for reply ``n``."""
        return pjoin(self.itsFolder, "reply%d" % n)

    def _itsMakeFolders(self, theReplyFolder):
        """Create the thread folder and ``theReplyFolder`` when missing."""
        for thePath in (self.itsFolder, theReplyFolder):
            if not isdir(thePath):
                mkdir(thePath)

    @staticmethod
    def _itsWrite(thePath, theText):
        """Write ``theText`` to ``thePath``, closing the file even on error."""
        with open(thePath, "w") as theFile:
            theFile.write(theText)

    def doDownload(self):
        """Save every listing of the thread as ``reply<n>/prog<k>.py``.

        A reply number larger than any seen before starts a new reply:
        its author is written to ``reply<n>/author`` and the listing
        counter restarts at 1.
        """
        for theReply, theAuthor, theCode in self.itsTriples:
            theFolder = self.itsReplyFolder(theReply)
            isNewReply = theReply > self.itsReply
            if isNewReply:
                printMessage("reply %d..." % theReply)
                self.itsReply = theReply
                self.itsCodeIndex = 0
            # Ensure the target exists for every listing, not only for new
            # replies: a listing whose reply number is 0 or not above the
            # last one would otherwise hit a missing folder and raise IOError.
            self._itsMakeFolders(theFolder)
            if isNewReply:
                self._itsWrite(pjoin(theFolder, "author"), theAuthor + "\n")
            self.itsCodeIndex += 1
            self._itsWrite(
                pjoin(theFolder, "prog%d.py" % self.itsCodeIndex), theCode)
        # Routed through printMessage like every other progress message.
        printMessage("done.")

class Writer1(NullWriter):
    """Formatter writer that collects (reply number, author, code) triples.

    The writer is a small state machine driven by the ``send_*`` events
    it receives.  A label ``"1."`` opens a code listing, later labels
    ``"2."``, ``"3."``, ... open its following lines, and the next
    paragraph event closes the listing and appends a triple to
    ``itsTriples``.  Author and reply number are picked out of the
    flowing text: the chunk after one starting with ``" Solved Threads:"``
    is taken as the author, and the chunk after the next ``" #"`` as the
    reply number.  (These markers presumably mirror the page layout of
    the forum at the time the script was written.)
    """

    def __init__(self):
        NullWriter.__init__(self)
        self.isInCode = False        # True while a listing is being read
        self.itsCode = None          # lines of the current listing; each is a
                                     # list of fragments until a line break
                                     # joins it into one string
        self.itsAuthor = "unknown"   # author of the reply being read
        self.itsAnswer = 0           # number of the reply being read
        self.justReadAuthor = False  # author seen, waiting for " #"
        self.nextIsAuthor = False    # next flowing chunk is the author
        self.nextIsNumber = False    # next flowing chunk is the reply number
        self.itsTriples = []         # finished (number, author, code) triples

    def send_label_data(self, data):
        """Open a listing on label ``"1."`` or its next line on ``"<n>."``.

        Raises:
            ValueError: inside a listing, when ``data`` is not the label of
                the next line number.  Explicit checks are used instead of
                ``assert`` so the validation also runs under ``python -O``.
        """
        if self.isInCode:
            if not data.endswith("."):
                raise ValueError(
                    "unexpected label %r inside a code listing" % (data,))
            theNumber = int(data[:-1])   # ValueError when not numeric
            theExpected = len(self.itsCode) + 1
            if theNumber != theExpected:
                raise ValueError(
                    "expected code line %d, got label %r"
                    % (theExpected, data))
            self.itsCode.append([])
        elif data == "1.":
            self.isInCode = True
            self.itsCode = [[]]

    def send_literal_data(self, data):
        """Append ``data`` to the current code line; a lone ``"\\xa0"`` is dropped."""
        if self.isInCode and data != "\xa0":
            theLine = self.itsCode[-1]
            if not isinstance(theLine, list):
                # The line was already joined by a line break but no new
                # label followed: reopen it so the text continues the same
                # numbered line instead of failing on str.append.
                theLine = self.itsCode[-1] = [theLine]
            theLine.append(data)

    def send_line_break(self):
        """Join the fragments of the current code line into one string."""
        if self.isInCode:
            self.itsCode[-1] = "".join(self.itsCode[-1])

    def send_paragraph(self, data):
        """Close the current listing, if any, and record its triple."""
        if self.isInCode:
            # "".join leaves already-joined lines unchanged and also joins a
            # last line that never received its line break.
            theCode = "\n".join("".join(theLine) for theLine in self.itsCode)
            self.itsCode = None
            self.isInCode = False
            self.itsTriples.append((self.itsAnswer, self.itsAuthor, theCode))

    def send_flowing_data(self, data):
        """Track author and reply number from the flowing text.

        Sequence: a chunk starting with ``" Solved Threads:"``, then the
        author, then (after any other chunks) ``" #"``, then the number.
        """
        if self.nextIsNumber:
            self.itsAnswer = int(data)
            self.nextIsNumber = False
        elif self.justReadAuthor:
            if data == " #":
                self.nextIsNumber = True
                self.justReadAuthor = False
        elif self.nextIsAuthor:
            self.itsAuthor = data
            self.justReadAuthor = True
            self.nextIsAuthor = False
        elif data.startswith(" Solved Threads:"):
            self.nextIsAuthor = True

def printMessage(msg):
    """Print the progress message ``msg`` on standard output."""
    # Parenthesised form: a valid print statement on Python 2 and a valid
    # print() call on Python 3, with the same output for a single argument.
    print(msg)

if __name__ == "__main__":
    # Ask for the thread number and download that thread.  Input that is
    # not a whole number ends the script with a short message instead of
    # an int() traceback.
    theAnswer = raw_input("Enter thread number: ")
    try:
        n = int(theAnswer)
    except ValueError:
        raise SystemExit("Not a valid thread number: %r" % (theAnswer,))
    theThread = aThread(n)
    theThread.doDownload()
Be a part of the DaniWeb community

We're a friendly, industry-focused community of developers, IT pros, digital marketers, and technology enthusiasts meeting, networking, learning, and sharing knowledge.