from BeautifulSoup import BeautifulSoup
from urllib2 import urlopen
site = "http://sfbay.craigslist.org/rea/"
table = BeautifulSoup(urlopen(site))
items = table('p')
linkdict = {}
i = 0
for item in items[:-1]:
i=i+1
itempostlink = item('a')[0]['href']
itemlink = site[:-5] + itempostlink
linkdict[i] = itemlink
for i, link in linkdict.iteritems():
print i, linkdict[i]
images = BeautifulSoup(urlopen(linkdict[i]))('img')
imagecounter = 0
for image in images:
imagecounter +=1
image_url =image['src']
print image_url
image = urlopen(image_url)
local = open("c:\\temp\\"+str(i)+"."+str(imagecounter)+".jpg",'wb')
local.write(image.read())
local.close()
Monday, December 28, 2009
How to download images from craigslist
Code
How to get BeautifulSoup to filter Craigslist HTML
Code
How it works - Line by line
Please see my postings on how to screen scrape craigslist for details on lines #1-#5
The last item of this resultset is disregarded as it is not a posting.
This will get the link for each item from the href tag
This appends the href from #7 to the base site link with its last five characters ("/rea/") removed
This displays the full link to the craigslist item
This loads the craigslist item link into BeautifulSoup
This extracts the body contents from the item link html
This prints the body contents
from BeautifulSoup import BeautifulSoup #1
from urllib2 import urlopen #2
site = "http://sfbay.craigslist.org/rea/" #3
table = BeautifulSoup(urlopen(site)) #4
items = table('p') #5
for item in items[:-1]: #6
itempostlink = item('a')[0]['href'] #7
itemlink = site[:-5] + itempostlink #8
print itemlink #9
soup = BeautifulSoup(urlopen(itemlink)) #10
body = soup('div',{'id':"userbody"})[0].contents[0] #11
print body #12
How it works - Line by line
from BeautifulSoup import BeautifulSoup #1
from urllib2 import urlopen #2
site = "http://sfbay.craigslist.org/rea/" #3
table = BeautifulSoup(urlopen(site)) #4
items = table('p') #5
Please see my postings on how to screen scrape craigslist for details on lines #1-#5
for item in items[:-1]: #6
The last item of this resultset is disregarded as it is not a posting.
itempostlink = item('a')[0]['href'] #7
This will get the link for each item from the href tag
itemlink = site[:-5] + itempostlink #8
This appends the href from #7 to the base site link with its last five characters ("/rea/") removed
print itemlink #9
This displays the full link to the craigslist item
soup = BeautifulSoup(urlopen(itemlink)) #10
This loads the craigslist item link into BeautifulSoup
body = soup('div',{'id':"userbody"})[0].contents[0] #11
This extracts the body contents from the item link html
print body #12
This prints the body contents
How to get the list of files in a ftp directory
Code
How this works, line by line
This imports FTP from the standard ftplib module
This defines lowercase ftp as an FTP connection to the host
This statement logs in anonymously, since no user name or password is given.
This changes the current directory on the FTP site
This prints a list of the files in the directory
from ftplib import FTP #1
ftp = FTP(host='ftp.bls.gov') #2
ftp.login() #3
ftp.cwd('pub/time.series/ap') #4
print ftp.nlst() #5
How this works, line by line
from ftplib import FTP #1
This imports FTP from the standard ftplib module
ftp = FTP(host='ftp.bls.gov') #2
This defines lowercase ftp as an FTP connection to the host
ftp.login() #3
This statement logs in anonymously, since no user name or password is given.
ftp.cwd('pub/time.series/ap') #4
This changes the current directory on the FTP site
print ftp.nlst() #5
This prints a list of the files in the directory
Friday, December 25, 2009
How to get today's date in YYYYMMDD format
In [1]:
#example using the time module
import time
print time.strftime('%Y%m%d')
20141213
In [2]:
#alternatively, an example using the datetime module
import datetime
#sets the variable "today" to a datetime object holding the current local date and time.
today = datetime.datetime.today()
In [3]:
#today is a datetime object
today
Out[3]:
datetime.datetime(2014, 12, 13, 12, 32, 51, 292545)
In [4]:
#print the string of the datetime object in the desired format
print today.strftime('%Y%m%d')
20141213
Subscribe to:
Posts (Atom)