| | |
Downloading thread code.
![]() |
The following program downloads the Python programs contained in a thread of the Python forum. Just start the program; it will prompt you for the thread number and create a directory containing the code extracted from the thread. I used it to download all the wx examples. Note that the algorithm is very primitive: only code written inside language tags is extracted (raw code is not), and data attached to posts is not downloaded. It is also not robust: if the formatting on DaniWeb changes tomorrow, it won't work anymore.

python Syntax (Toggle Plain Text)
#!/usr/bin/env python
# danidown.py
"""Download the code posted in a DaniWeb forum thread (Python 2 script).

When run as a script, it prompts for a thread number, fetches every page of
that thread and writes each extracted code listing to

    thread<number>/reply<n>/prog<k>.py

together with an ``author`` file in each reply folder.
"""
from htmllib import HTMLParser
from formatter import AbstractFormatter, AbstractWriter, NullWriter
import re
from os import mkdir
from os.path import isdir, join as pjoin
from urllib2 import urlopen


def _writeText(thePath, theText):
    """Write theText to thePath, closing the file even if the write fails."""
    f = open(thePath, "w")
    try:
        f.write(theText)
    finally:
        f.close()


class aThread(object):
    """One forum thread, identified by its number.

    Attributes:
        itsNumber: the thread number used to build URLs and the folder name.
        itsReply: highest reply number handled so far by doDownload (0 = none).
        itsCodeIndex: number of code files written for the current reply.
    """

    # The page count is read from text of the form "Page 1 of 7".
    itsCntPattern = re.compile(r"Page \d+ of (\d+)")

    def __init__(self, theThreadNumber):
        self.itsNumber = theThreadNumber
        self._itsPageCnt = None  # cache for the itsPageCnt property
        self.itsReply = 0
        self.itsCodeIndex = 0

    def itsUrl(self, thePage=1):
        """Return the URL of page thePage; page 1 has no "-<page>" suffix."""
        x = "" if (thePage == 1) else "-%d" % thePage
        return "http://www.daniweb.com/forums/thread%d%s.html" % (
            self.itsNumber, x)

    def itsContent(self, thePage=1):
        """Fetch and return the raw content of page thePage."""
        f = urlopen(self.itsUrl(thePage))
        try:
            return f.read()
        finally:
            # Close the connection even when read() raises.
            f.close()

    @property
    def itsPageCnt(self):
        """Number of pages in the thread (1 if no "Page x of y" text is found).

        The value is computed from page 1 on first access and then cached.
        """
        if self._itsPageCnt is None:
            theCnt = 1
            theMatch = self.itsCntPattern.search(self.itsContent(1))
            if theMatch is not None:
                theCnt = int(theMatch.group(1))
            # Cache only after a successful fetch, so a failed download does
            # not leave a bogus page count behind.
            self._itsPageCnt = theCnt
        return self._itsPageCnt

    @property
    def itsTriples(self):
        """Yield (reply number, author, code) triples for every page in order."""
        theCnt = self.itsPageCnt
        printMessage("The thread contains %d pages..." % theCnt)
        for i in xrange(1, theCnt + 1):
            printMessage("Page %d..." % i)
            # A fresh writer per page: it accumulates that page's triples.
            theWriter = Writer1()
            theParser = HTMLParser(AbstractFormatter(theWriter))
            theContent = self.itsContent(i)
            theParser.feed(theContent)
            theParser.close()
            for theTriple in theWriter.itsTriples:
                yield theTriple

    @property
    def itsFolder(self):
        """Name of the top-level output folder for this thread."""
        return "thread%d" % self.itsNumber

    def itsReplyFolder(self, n):
        """Return the output folder for reply number n."""
        return pjoin(self.itsFolder, "reply%d" % n)

    def doDownload(self):
        """Extract all code of the thread and write it to disk.

        For each triple, the code goes to ``prog<k>.py`` in the reply's
        folder; ``k`` restarts at 1 whenever a higher reply number is seen.
        The ``author`` file is written when a new reply starts or when its
        folder had to be created.
        """
        for theReply, theAuthor, theCode in self.itsTriples:
            theFolder = self.itsReplyFolder(theReply)
            isNewReply = theReply > self.itsReply
            if isNewReply:
                printMessage("reply %d..." % theReply)
                self.itsReply = theReply
                self.itsCodeIndex = 0
            # Create missing folders unconditionally: a reply number that is
            # 0 or lower than an earlier one never counts as "new", and
            # writing into a folder that does not exist would fail.
            if not isdir(self.itsFolder):
                mkdir(self.itsFolder)
            isNewFolder = not isdir(theFolder)
            if isNewFolder:
                mkdir(theFolder)
            if isNewReply or isNewFolder:
                _writeText(pjoin(theFolder, "author"), theAuthor + "\n")
            self.itsCodeIndex += 1
            _writeText(
                pjoin(theFolder, "prog%d.py" % self.itsCodeIndex), theCode)
        print("done.")


class Writer1(NullWriter):
    """Formatter writer that collects (reply number, author, code) triples.

    The formatter reports page content through the send_* callbacks below;
    this class keeps a small amount of state between callbacks and appends
    finished triples to itsTriples.
    """

    def __init__(self):
        NullWriter.__init__(self)
        self.isInCode = False        # True while inside a code listing
        self.itsCode = None          # list of lines of the current listing
        self.itsAuthor = "unknown"   # most recently seen author
        self.itsAnswer = 0           # most recently seen reply number
        self.justReadAuthor = False  # the previous flowing data was the author
        self.nextIsAuthor = False    # the next flowing data is the author
        self.nextIsNumber = False    # the next flowing data is the reply number
        self.itsTriples = []

    def send_label_data(self, data):
        """Handle a label; the label "1." opens a code listing.

        While inside a listing, every further label starts a new line and
        must be "<n>." with n equal to the resulting number of lines.
        (Presumably these labels are the line numbers of the numbered code
        listing -- confirm against the page markup.)
        """
        if self.isInCode:
            assert (data[-1] == ".")
            n = int(data[:-1])
            self.itsCode.append([])
            assert (len(self.itsCode) == n)
        elif data == "1.":
            self.isInCode = True
            self.itsCode = [[]]

    def send_literal_data(self, data):
        """Append literal text to the current code line, skipping "\\xa0"."""
        if self.isInCode and data != "\xa0":
            self.itsCode[-1].append(data)

    def send_line_break(self):
        """Finish the current code line by joining its fragments."""
        if self.isInCode:
            self.itsCode[-1] = "".join(self.itsCode[-1])

    def send_paragraph(self, data):
        """A paragraph break ends the listing: record it as a triple."""
        if self.isInCode:
            theCode = "\n".join(self.itsCode)
            self.itsCode = None
            self.isInCode = False
            self.itsTriples.append((self.itsAnswer, self.itsAuthor, theCode))

    def send_flowing_data(self, data):
        """Track author and reply number from the running text.

        Expected sequence: a chunk starting with " Solved Threads:", then the
        author name, then " #", then the reply number.
        """
        if self.nextIsNumber:
            self.itsAnswer = int(data)
            self.nextIsNumber = False
        elif self.justReadAuthor:
            if data == " #":
                self.nextIsNumber = True
            # NOTE(review): the original indentation was lost; this flag is
            # assumed to be cleared whether or not " #" followed -- confirm.
            self.justReadAuthor = False
        elif self.nextIsAuthor:
            self.itsAuthor = data
            self.justReadAuthor = True
            self.nextIsAuthor = False
        elif data.startswith(" Solved Threads:"):
            self.nextIsAuthor = True


def printMessage(msg):
    """Print a progress message."""
    print(msg)


if __name__ == "__main__":
    n = int(raw_input("Enter thread number: "))
    theThread = aThread(n)
    theThread.doDownload()
Last edited by Gribouillis; Nov 30th, 2008 at 1:24 pm.
![]() |
Similar Threads
- Softwares worth downloading (Windows NT / 2000 / XP)
- Error loading cmicnfg.cpl -- SOLUTION HERE! (Windows NT / 2000 / XP)
- A multilingual coded Hello World! thread (Legacy and Other Languages)
- Installing GTK and MinGW in Windows (C)
- Fake IP on torrent network? (Network Security)
- updating a JList (Java)
- downloading a file (C++)
- [Revised] vBulletin Mod_rewrite Tutorial (PHP)
Other Threads in the Python Forum
- Previous Thread: Python Multimedia Framework
- Next Thread: ListCtrl - Get All selected Items
| Thread Tools | Search this Thread |
abrupt accessdenied advanced ansi anti apache application approximation argv array backend beginner binary builtin calculator change command converter countpasswordentry csv curved dan08 def dictionary edit event file float format function google heads homework inches input jaunty java keyboard lapse library line lines linux list lists loop microphone mouse movingimageswithpygame mysqlquery newb number numbers numeric obexftp output parameters parsing path phonebook pointer prime programming py2exe pygame pyopengl python random recursion redirect remote return reverse scrolledtext session software sprite statictext statistics string strings syntax terminal text thread threading time tlapse tuple twoup ubuntu unicode unit urllib urllib2 variable voip wordgame write wxpython





