DaniWeb IT Discussion Community

DaniWeb IT Discussion Community (http://www.daniweb.com/forums/index.php)
-   Python (http://www.daniweb.com/forums/forum114.html)
-   -   Downloading thread code. (http://www.daniweb.com/forums/thread160116.html)

Gribouillis Nov 30th, 2008 1:23 pm
Downloading thread code.
 
The following program downloads the Python programs contained in a thread of the Python forum. Just start the program; it will prompt you for the thread number and create a directory containing the code extracted from the thread. I used it to download all the wx examples. Note that the algorithm is very primitive: only code written inside language tags is extracted (raw code is not), and data attached to posts is not downloaded. It is also not robust: if the formatting on DaniWeb changes tomorrow, it won't work anymore :)
#!/usr/bin/env python
# danidown.py
from htmllib import HTMLParser
from formatter import AbstractFormatter ,AbstractWriter ,NullWriter
import re
from os import mkdir
from os .path import isdir ,join as pjoin
from urllib2 import urlopen

class aThread (object ):
  itsCntPattern =re .compile (r"Page \d+ of (\d+)")
  def __init__ (o ,theThreadNumber ):
    o .itsNumber =theThreadNumber
    o ._itsPageCnt =None
    o .itsReply =0
    o .itsCodeIndex =0
  def itsUrl (o ,thePage =1 ):
    x =""if (thePage ==1 )else "-%d"%thePage
    return "http://www.daniweb.com/forums/thread%d%s.html"%(
    o .itsNumber ,x )
  def itsContent (o ,thePage =1 ):
    theUrl =o .itsUrl (thePage )
    f =urlopen (theUrl )
    s =f .read ()
    f .close ()
    return s
  @property
  def itsPageCnt (o ):
    if o ._itsPageCnt is None :
      o ._itsPageCnt =1
      theContent =o .itsContent (1 )
      theMatch =o .itsCntPattern .search (theContent )
      if theMatch is not None :
        o ._itsPageCnt =int (theMatch .group (1 ))
    return o ._itsPageCnt
  @property
  def itsTriples (o ):
    theCnt =o .itsPageCnt
    printMessage ("The thread contains %d pages..."%theCnt )
    for i in xrange (1 ,theCnt +1 ):
      printMessage ("Page %d..."%i )
      theWriter =Writer1 ()
      theParser =HTMLParser (AbstractFormatter (theWriter ))
      theContent =o .itsContent (i )
      theParser .feed (theContent )
      theParser .close ()
      for theTriple in theWriter .itsTriples :
        yield theTriple

  @property
  def itsFolder (o ):
    return "thread%d"%o .itsNumber

  def itsReplyFolder (o ,n ):
    return pjoin (o .itsFolder ,"reply%d"%n )

  def doDownload (o ):
    for theReply ,theAuthor ,theCode in o .itsTriples :
      theFolder =o .itsReplyFolder (theReply )
      if theReply >o .itsReply :
        printMessage ("reply %d..."%theReply )
        if o .itsReply ==0 :
          if not isdir (o .itsFolder ):
            mkdir (o .itsFolder )
        o .itsReply =theReply
        o .itsCodeIndex =0
        if not isdir (theFolder ):
          mkdir (theFolder )
        f =open (pjoin (theFolder ,"author"),"w")
        f .write (theAuthor +"\n")
        f .close ()
      o .itsCodeIndex +=1
      f =open (pjoin (theFolder ,"prog%d.py"%o .itsCodeIndex ),"w")
      f .write (theCode )
      f .close ()
    print "done."

class Writer1 (NullWriter ):
  def __init__ (o ):
    NullWriter .__init__ (o )
    o .isInCode =False
    o .itsCode =None
    o .itsAuthor ="unknown"
    o .itsAnswer =0
    o .justReadAuthor =False
    o .nextIsAuthor =False
    o .nextIsNumber =False
    o .itsTriples =[]
  def send_label_data (o ,data ):
#print "send_label_data(%s)" % repr(data)
    if o .isInCode :
      assert (data [-1 ]==".")
      n =int (data [:-1 ])
      o .itsCode .append ([])
      assert (len (o .itsCode )==n )
    elif data =="1.":
      o .isInCode =True
      o .itsCode =[[]]
  def send_literal_data (o ,data ):
#print "send_literal_data(%s)" % repr(data)
    if o .isInCode and data !="\xa0":
      o .itsCode [-1 ].append (data )
  def send_line_break (o ):
    if o .isInCode :
      o .itsCode [-1 ]="".join (o .itsCode [-1 ])
  def send_paragraph (o ,data ):
#print "send_paragraph(%s)" % repr(data)
    if o .isInCode :
      theCode ="\n".join (o .itsCode )
      o .itsCode =None
      o .isInCode =False
      o .itsTriples .append ((o .itsAnswer ,o .itsAuthor ,theCode ))
  def send_flowing_data (o ,data ):
    if o .nextIsNumber :
      o .itsAnswer =int (data )
      o .nextIsNumber =False
    elif o .justReadAuthor :
      if data ==" #":
        o .nextIsNumber =True
        o .justReadAuthor =False
    elif o .nextIsAuthor :
      o .itsAuthor =data
      o .justReadAuthor =True
      o .nextIsAuthor =False
    elif data .startswith (" Solved Threads:"):
      o .nextIsAuthor =True

def printMessage (msg ):
  print msg

if __name__ =="__main__":
  n =int (raw_input ("Enter thread number: "))
  theThread =aThread (n )
  theThread .doDownload ()


All times are GMT -4. The time now is 12:08 pm.

Forum system based on vBulletin Copyright ©2000 - 2009, Jelsoft Enterprises Ltd.
©2003 - 2009 DaniWeb® LLC