from BeautifulSoup import BeautifulSoup #1
from urllib2 import urlopen #2
# Print the link and body text of every posting on a craigslist listing page.
# (Python 2 script: relies on urllib2 and the BeautifulSoup 3 package.)
site = "http://sfbay.craigslist.org/rea/" #3
table = BeautifulSoup(urlopen(site)) #4
# Calling the soup object collects the matching tags, here every <p>.
items = table('p') #5
# The last <p> is not a posting, so the slice leaves it out.
for item in items[:-1]: #6
    # href attribute of the first <a> inside this <p>.
    itempostlink = item('a')[0]['href'] #7
    # site[:-5] drops the trailing "/rea/" so the href is appended to the
    # host part of the URL only.
    itemlink = site[:-5] + itempostlink #8
    # Single-argument print in parentheses: same output under Python 2.
    print(itemlink) #9
    soup = BeautifulSoup(urlopen(itemlink)) #10
    # First child node of the first <div id="userbody"> on the posting page.
    body = soup('div',{'id':"userbody"})[0].contents[0] #11
    print(body) #12
How it works - Line by line
from BeautifulSoup import BeautifulSoup #1
from urllib2 import urlopen #2
site = "http://sfbay.craigslist.org/rea/" #3
table = BeautifulSoup(urlopen(site)) #4
items = table('p') #5
Please see my postings on how to screen scrape craigslist for details on lines #1-#5
for item in items[:-1]: #6
The last item of this resultset is disregarded as it is not a posting.
itempostlink = item('a')[0]['href'] #7
This gets the link for each item from the href attribute of its first <a> tag.
itemlink = site[:-5] + itempostlink #8
This appends the link from #7 to the base site link with its last five characters ("/rea/") removed.
print itemlink #9
This displays the full link to the craigslist item
soup = BeautifulSoup(urlopen(itemlink)) #10
This loads the craigslist item link into BeautifulSoup
body = soup('div',{'id':"userbody"})[0].contents[0] #11
This extracts the body contents from the item page's HTML: the first child of the div whose id is "userbody".
print body #12
This prints the body contents
No comments:
Post a Comment