import re
import sys
import urllib2
from urlparse import urljoin
from BeautifulSoup import BeautifulSoup
def searchUrl(cururl, sfile, category, begin = False):
    """Collect e-mail addresses reachable from ``cururl`` and write them to ``sfile``.

    The page is fetched and parsed. If it contains ``mailto:`` anchors, each
    address (the part between ``mailto:`` and an optional ``?query``) is
    written to ``sfile``, one per line. Otherwise every anchor whose href
    matches ``/<anything>/<category>/<something>`` is followed recursively.

    Parameters:
        cururl   -- absolute URL of the page to fetch.
        sfile    -- writable file-like object receiving one address per line.
        category -- path segment that followed links must contain.
        begin    -- unused; kept so existing callers keep working.

    Returns the number of category links found on the page. The result is 0
    when the page held e-mail addresses instead, or when the page was skipped
    because it could not be fetched or parsed.
    """
    try:
        response = urllib2.urlopen(cururl)
        try:
            html = response.read()
        finally:
            # Release the connection right away: the recursive calls below
            # may fetch many pages and must not keep this one open.
            response.close()

        soup = BeautifulSoup(html)
        emails = soup.findAll('a', href=re.compile('^mailto:'))
        if emails:
            for email in emails:
                href = email['href']
                # Strip the "mailto:" scheme and any "?subject=..." query.
                # Splitting (rather than index('?')) also accepts links
                # that carry no query string at all.
                address = href.split(':', 1)[1].split('?', 1)[0]
                sfile.write(address + '\n')
            return 0

        # Escape the category so it is matched literally inside the pattern.
        link_pattern = re.compile('^(/(.+/)?%s/.+)' % (re.escape(category),))
        links = soup.findAll('a', href=link_pattern)
        count = len(links)
        if "index" in cururl:
            print("Parsing %s (%d postings)" % (cururl, count))
        for link in links:
            searchUrl(urljoin(cururl, link['href']), sfile, category)
        return count
    except (ValueError, urllib2.URLError):
        # ValueError: malformed URL or undecodable content.
        # URLError (and its subclass HTTPError): page unreachable or removed.
        # Either way skip this page and report "nothing found" as a number.
        print("Could not parse %s, skipping" % (cururl,))
        return 0
#Initialization
# The crawl loop below steps through index pages by this offset and stops as
# soon as a page yields fewer links than this.
PAGE_SIZE = 100
# Hard upper bound on the index offset, so the loop always terminates.
MAX_OFFSET = 50000

parameterflag = False   # set when the arguments are missing or unusable
arguments = len(sys.argv)
page = 0
begin = True
if arguments > 1:
    starturl = sys.argv[1]
    # Index URLs are built by plain concatenation, so the start URL has to
    # end with a slash ("http://host/cat" would give "http://host/catindex.html").
    if not starturl.endswith('/'):
        starturl += '/'
    folder = starturl.split('/')
    # "http://host/category/" splits into ['http:', '', host, category, ''];
    # a shorter list means there is no category segment at all.
    if len(folder) > 3:
        rcategory = folder[3]
    else:
        rcategory = ''
    if len(rcategory) == 0:
        parameterflag = True
    else:
        if arguments > 2:
            sfile = sys.argv[2]
        else:
            sfile = "emails"
else:
    parameterflag = True

print("ScrapCL 0.1.2 by Karl Blessing (www.karlblessing.com)\n--------------------------------------------------\n")
if parameterflag:
    print("Usage: scrapcl start_url [save_file_name]\n")
    print("start_url must be a craigslist url with a preceeding category,\nresults are restricted to this category\n")
    print("\thttp://city.craigslist.org/category/\n")
    print("save_file_name is an optional parameter, if no name is provided\n emails will be defaulted.\n")
    print("\tscrapcl http://city.craigslist.org/sys/ syse\n\twill save a file called syse.txt with the emails (one per line)")
else:
    print("Starting from: %s\nSaving emails to %s.txt (this may take several minutes)" % (starturl, sfile))
    outputfile = open(sfile+".txt", "a")
    try:
        while page < MAX_OFFSET:
            # The first index page has no offset in its name.
            if page > 0:
                url = "%sindex%d.html" % (starturl, page)
            else:
                url = "%sindex.html" % (starturl,)
            begin = False
            count = searchUrl(url, outputfile, rcategory, begin)
            # A missing count (page skipped) or a short page ends the crawl.
            if count is None or count < PAGE_SIZE:
                break
            page += PAGE_SIZE
    finally:
        # Close the output file even if the crawl aborts with an error.
        outputfile.close()
    print("Finished Collecting emails into %s.txt" % (sfile,))