Monday, December 28, 2009

How to download images from craigslist

Code

from BeautifulSoup import BeautifulSoup
from urllib2 import urlopen

site = "http://sfbay.craigslist.org/rea/"
table = BeautifulSoup(urlopen(site))
items = table('p')
linkdict = {}
i = 0
for item in items[:-1]:
i=i+1
itempostlink = item('a')[0]['href']
itemlink = site[:-5] + itempostlink
linkdict[i] = itemlink

for i, link in linkdict.iteritems():
print i, linkdict[i]
images = BeautifulSoup(urlopen(linkdict[i]))('img')
imagecounter = 0
for image in images:
imagecounter +=1
image_url =image['src']
print image_url
image = urlopen(image_url)
local = open("c:\\temp\\"+str(i)+"."+str(imagecounter)+".jpg",'wb')
local.write(image.read())
local.close()
print

How to get BeautifulSoup to filter Craigslist HTML

Code

from BeautifulSoup import BeautifulSoup #1
from urllib2 import urlopen #2

site = "http://sfbay.craigslist.org/rea/" #3
table = BeautifulSoup(urlopen(site)) #4
items = table('p') #5
for item in items[:-1]: #6
itempostlink = item('a')[0]['href'] #7
itemlink = site[:-5] + itempostlink #8
print itemlink #9
soup = BeautifulSoup(urlopen(itemlink)) #10
body = soup('div',{'id':"userbody"})[0].contents[0] #11
print body #12


How it works - Line by line
from BeautifulSoup import BeautifulSoup                       #1
from urllib2 import urlopen #2

site = "http://sfbay.craigslist.org/rea/" #3
table = BeautifulSoup(urlopen(site)) #4
items = table('p') #5

Please see my postings on how to screen scrape craigslist for details on lines #1-#5.


for item in items[:-1]:                                       #6

The last item of this result set is disregarded, as it is not a posting.

    itempostlink = item('a')[0]['href']                       #7

This gets the link for each item from the href attribute of the item's first <a> tag.

    itemlink = site[:-5] + itempostlink                       #8

This appends the link from #7 to the base site URL with its last five characters (the trailing "/rea/") removed.

    print itemlink                                            #9

This displays the full link to the craigslist item

    soup = BeautifulSoup(urlopen(itemlink))                   #10

This loads the craigslist item link into BeautifulSoup

    body = soup('div',{'id':"userbody"})[0].contents[0]       #11

This extracts the first child of the first <div id="userbody"> element, i.e. the start of the posting body, from the item page's HTML.

    print body                                                #12

This prints the body contents

How to get the list of files in an FTP directory

Code

from ftplib import FTP #1
ftp = FTP(host='ftp.bls.gov') #2
ftp.login() #3
ftp.cwd('pub/time.series/ap') #4
print ftp.nlst() #5


How this works, line by line
from ftplib import FTP            #1

This imports FTP from the standard ftplib module

ftp = FTP(host='ftp.bls.gov')     #2

This defines the lowercase name ftp as an FTP connection to the host.

ftp.login()                       #3

This statement logs in. With no arguments, ftplib logs in as the anonymous user.

ftp.cwd('pub/time.series/ap')     #4

This changes the current working directory on the FTP server.

print ftp.nlst()                  #5

This prints a list of the file names in the current directory.

Friday, December 25, 2009

How to get today's date in YYYYMMDD format

datetime_yyyymmdd
In [1]:
#example using the time module
#with no time argument, strftime formats the current local time;
#'%Y%m%d' is four-digit year, two-digit month, two-digit day
import time

print time.strftime('%Y%m%d') 
20141213

In [2]:
#alternatively, an example using the datetime module
import datetime

#binds the name "today" to a datetime object for the current local date and time
today = datetime.datetime.today() 
In [3]:
#today is a datetime object; evaluating the bare name makes the notebook echo its repr
today 
Out[3]:
datetime.datetime(2014, 12, 13, 12, 32, 51, 292545)
In [4]:
#print the datetime object formatted as a YYYYMMDD string
print today.strftime('%Y%m%d') 
20141213