Urllib2 help

natehome 0 Newbie Poster

12 Years Ago

hi
I'm trying to make a program that will go to 4chan and download all the images on a thread (e.g. http://4chan.org/b). The program works the first time, but when I run it again it tries to download the same URLs as it did on the first run, and those have since 404'ed. Please help. Also, 4chan is NSFW (most of you probably know about it, but just in case).

Edit:
I just tested the code on 4chan.org/s and it works perfectly, so I don't know why it's not working on 4chan.org/b.

Here's my code:

import os
import sys
import urllib
import urllib2

def geturl(url, dst):
    """Download ``url`` and save it under the file name ``dst``.

    When standard output is attached to a terminal the file is stored inside
    the ``chan/`` folder, which is created if it does not exist yet.
    Otherwise the file is written to ``dst`` exactly as given.
    """
    if sys.stdout.isatty():
        # urlretrieve does not create missing directories, so make sure the
        # target folder is there before writing into it.
        if not os.path.isdir('chan'):
            os.makedirs('chan')
        dst = 'chan/' + str(dst)
    urllib.urlretrieve(url, dst)





# Fetch the board page and keep it as a list of lines with every newline
# character removed.
url = 'http://www.4chan.org/s'
f = urllib2.urlopen(url)
siteinfo = [raw_line.replace('\n', '') for raw_line in f]
# Scan every line of the page for a .jpg link and record the link in ``urls``
# and its file name in ``imagename``.  ``base_url`` is filled in afterwards.
urls = []
imagename = []
base_url = []
for line in siteinfo:
    if '.jpg' not in line:
        continue

    start = line.find('http')
    if start == -1:
        # The line mentions .jpg but holds no absolute link.
        continue
    image = line[start:]

    end = image.rfind('jpg')
    if end == -1:
        # The only 'jpg' on the line sat in front of the 'http'.
        continue
    # Cut right after the last 'jpg' on the line.
    image = image[:end + 3]

    # Keep only the text in front of the first space, if there is one.
    image = image.split(' ', 1)[0]

    # A '.jpg"' inside the candidate means an earlier link was captured
    # together with what follows it; cut at the first double quote.
    if '.jpg"' in image:
        image = image.split('"', 1)[0]

    if '/' not in image:
        # No slash means no file name can be split off.  Skipping the
        # candidate keeps urls and imagename the same length.
        continue

    urls.append(image)
    imagename.append(image[image.rindex('/') + 1:])





    
# For every collected link, store the part up to and including its final
# slash in base_url.
for link in urls:
    slash_at = link.rindex('/')
    base_url.append(link[:slash_at + 1])

temp = 0
for i in urls:
    print i
    if '.jpg' in i:
        print 'good'
    else:
        del urls[temp]
        del imagename[temp]
        del base_url[temp]
    temp+=1
        

folder_files=[]
path="chan"
dirList=os.listdir(path)
for fname in dirList:
    folder_files.append(fname)
temp = 0
while temp<imagename:
    try:
        file_name = imagename[temp]
        if file_name not in imagename[temp+1:]:
            #if file_name not in folder_files:
            url = base_url[temp]
            full_url = str(url)+str(file_name)
            geturl(full_url, file_name)
            print'Downloaded:', file_name

        temp+=1
    except:
        break
print'\nDownloads Complete'

f.close()

Edited 12 Years Ago by natehome because: n/a

1 Contributor
0 Replies
190 Views

Be the first to reply

Be a part of the DaniWeb community

We're a friendly, industry-focused community of developers, IT pros, digital marketers, and technology enthusiasts meeting, networking, learning, and sharing knowledge.