Code:
import re
import sys
import urllib2
from urlparse import urljoin
from BeautifulSoup import BeautifulSoup
def searchUrl(cururl, sfile, category, begin = False):
try:
response = urllib2.urlopen(cururl)
html = response.read()
soup = BeautifulSoup(html)
emails = soup.findAll('a', href=re.compile('^mailto:'))
if len(emails) > 0:
for email in emails:
email = email['href'][email['href'].index(':')+1:email['href'].index('?')]
sfile.write(email+'\n')
count = 0
else:
links = soup.findAll('a', href=re.compile('^(/(.+/)?%s/.+)' % (category,)))
count = len(links)
if "index" in cururl:
print "Parsing %s (%d postings)" % (cururl, count)
for link in links:
searchUrl(urljoin(cururl, link['href']), sfile, category)
response.close()
return count
except ValueError:
print "Could not parse %s, skipping" % (cururl,)
#Initialization
parameterflag = False
arguments = len(sys.argv)
page = 0
begin = True
if arguments > 1:
starturl = sys.argv[1]
folder = starturl.split('/')
rcategory = folder[3]
if len(rcategory) == 0:
parameterflag = True
else:
if arguments > 2:
sfile = sys.argv[2]
else:
sfile = "emails"
else:
parameterflag = True
print "ScrapCL 0.1.2 by Karl Blessing (www.karlblessing.com)\n--------------------------------------------------\n"
if parameterflag:
print "Usage: scrapcl start_url [save_file_name]\n"
print "start_url must be a craigslist url with a preceeding category,\nresults are restricted to this category\n"
print "\thttp://city.craigslist.org/category/\n"
print "save_file_name is an optional parameter, if no name is provided\n emails will be defaulted.\n"
print "\tscrapcl http://city.craigslist.org/sys/ syse\n\twill save a file called syse.txt with the emails (one per line)"
else:
print "Starting from: %s\nSaving emails to %s.txt (this may take several minutes)" % (starturl, sfile)
outputfile = open(sfile+".txt", "a")
while page < 50000:
if page > 0:
url = "%sindex%d.html" % (starturl, page)
else:
url = "%sindex.html" % (starturl,)
begin = False
count = searchUrl(url, outputfile, rcategory, begin)
if count < 100:
break;
else:
page += 100
print "Finished Collecting emails into %s.txt" % (sfile,)
outputfile.close()
Requires Python 2.4 or above, and BeautifulSoup 3.0.x (do not try to use BeautifulSoup 3.1 with Python 2.6 or lower).
Usage
python scrapcl.py http://city.craigslist.org/category/ emailfile
emailfile is optional; if not provided, it defaults to emails.txt, and if you type sys instead, it saves sys.txt. The write mode is set to append, so you can run the command again on another city, and it will append those emails onto the list if the file already exists.