| | |
Downloading thread code.
![]() |
The following program downloads the Python programs contained in a thread of the Python forum. Just start the program; it will prompt you for the thread number and create a directory with the code extracted from the thread. I used it to download all the wx examples. Note that the algorithm is very primitive: only the code written in language tags is extracted (raw code is not), and data attached to posts is not downloaded. It is also not robust: if the formatting in DaniWeb changes tomorrow, it won't work anymore.

python Syntax (Toggle Plain Text)
#!/usr/bin/env python # danidown.py from htmllib import HTMLParser from formatter import AbstractFormatter ,AbstractWriter ,NullWriter import re from os import mkdir from os .path import isdir ,join as pjoin from urllib2 import urlopen class aThread (object ): itsCntPattern =re .compile (r"Page \d+ of (\d+)") def __init__ (o ,theThreadNumber ): o .itsNumber =theThreadNumber o ._itsPageCnt =None o .itsReply =0 o .itsCodeIndex =0 def itsUrl (o ,thePage =1 ): x =""if (thePage ==1 )else "-%d"%thePage return "http://www.daniweb.com/forums/thread%d%s.html"%( o .itsNumber ,x ) def itsContent (o ,thePage =1 ): theUrl =o .itsUrl (thePage ) f =urlopen (theUrl ) s =f .read () f .close () return s @property def itsPageCnt (o ): if o ._itsPageCnt is None : o ._itsPageCnt =1 theContent =o .itsContent (1 ) theMatch =o .itsCntPattern .search (theContent ) if theMatch is not None : o ._itsPageCnt =int (theMatch .group (1 )) return o ._itsPageCnt @property def itsTriples (o ): theCnt =o .itsPageCnt printMessage ("The thread contains %d pages..."%theCnt ) for i in xrange (1 ,theCnt +1 ): printMessage ("Page %d..."%i ) theWriter =Writer1 () theParser =HTMLParser (AbstractFormatter (theWriter )) theContent =o .itsContent (i ) theParser .feed (theContent ) theParser .close () for theTriple in theWriter .itsTriples : yield theTriple @property def itsFolder (o ): return "thread%d"%o .itsNumber def itsReplyFolder (o ,n ): return pjoin (o .itsFolder ,"reply%d"%n ) def doDownload (o ): for theReply ,theAuthor ,theCode in o .itsTriples : theFolder =o .itsReplyFolder (theReply ) if theReply >o .itsReply : printMessage ("reply %d..."%theReply ) if o .itsReply ==0 : if not isdir (o .itsFolder ): mkdir (o .itsFolder ) o .itsReply =theReply o .itsCodeIndex =0 if not isdir (theFolder ): mkdir (theFolder ) f =open (pjoin (theFolder ,"author"),"w") f .write (theAuthor +"\n") f .close () o .itsCodeIndex +=1 f =open (pjoin (theFolder ,"prog%d.py"%o .itsCodeIndex ),"w") f .write (theCode ) f .close () 
print "done." class Writer1 (NullWriter ): def __init__ (o ): NullWriter .__init__ (o ) o .isInCode =False o .itsCode =None o .itsAuthor ="unknown" o .itsAnswer =0 o .justReadAuthor =False o .nextIsAuthor =False o .nextIsNumber =False o .itsTriples =[] def send_label_data (o ,data ): #print "send_label_data(%s)" % repr(data) if o .isInCode : assert (data [-1 ]==".") n =int (data [:-1 ]) o .itsCode .append ([]) assert (len (o .itsCode )==n ) elif data =="1.": o .isInCode =True o .itsCode =[[]] def send_literal_data (o ,data ): #print "send_literal_data(%s)" % repr(data) if o .isInCode and data !="\xa0": o .itsCode [-1 ].append (data ) def send_line_break (o ): if o .isInCode : o .itsCode [-1 ]="".join (o .itsCode [-1 ]) def send_paragraph (o ,data ): #print "send_paragraph(%s)" % repr(data) if o .isInCode : theCode ="\n".join (o .itsCode ) o .itsCode =None o .isInCode =False o .itsTriples .append ((o .itsAnswer ,o .itsAuthor ,theCode )) def send_flowing_data (o ,data ): if o .nextIsNumber : o .itsAnswer =int (data ) o .nextIsNumber =False elif o .justReadAuthor : if data ==" #": o .nextIsNumber =True o .justReadAuthor =False elif o .nextIsAuthor : o .itsAuthor =data o .justReadAuthor =True o .nextIsAuthor =False elif data .startswith (" Solved Threads:"): o .nextIsAuthor =True def printMessage (msg ): print msg if __name__ =="__main__": n =int (raw_input ("Enter thread number: ")) theThread =aThread (n ) theThread .doDownload ()
Last edited by Gribouillis; Nov 30th, 2008 at 1:24 pm.
![]() |
Similar Threads
- Softwares worth downloading (Windows NT / 2000 / XP)
- Error loading cmicnfg.cpl -- SOLUTION HERE! (Windows NT / 2000 / XP)
- A multilingual coded Hello World! thread (Legacy and Other Languages)
- Installing GTK and MinGW in Windows (C)
- Fake IP on torrent network? (Network Security)
- updating a JList (Java)
- downloading a file (C++)
- [Revised] vBulletin Mod_rewrite Tutorial (PHP)
Other Threads in the Python Forum
- Previous Thread: Python Multimedia Framework
- Next Thread: ListCtrl - Get All selected Items
| Thread Tools | Search this Thread |
abrupt ansi anti apache approximation array assignment avogadro backend beginner binary bluetooth book builtin calculator character code converter countpasswordentry curved customdialog dan08 dictionaries dictionary dynamic examples exe file float format function gnu graphics gui heads homework ideas import inches input java launcher library line lines linux list lists loop mouse mysqlquery number numbers numeric output parsing path phonebook plugin pointer port prime programming progressbar projects py2exe pygame python random recursion redirect scrolledtext software statictext statistics string strings sum table terminal text textarea thread threading time tlapse trick tricks tuple tutorial twoup ubuntu unicode urllib urllib2 variable wordgame write wxpython xlib





