[sourcecode language="python"]
import sys
import re
import urllib2
import urlparse
import datetime
import os
class WebCrawler(object):
    """Crawl pages reachable from a start URL and save their HTML to disk.

    Every page that is fetched successfully is written to a text file in a
    caller-chosen directory, its URL is appended to ``hyperlinks.txt`` in
    the current working directory, and the http(s) links found in it are
    queued for crawling.  The crawl is not restricted to the starting host.

    Attributes:
        tocrawl: set of URLs still waiting to be fetched.
        crawled: set of URLs for which a fetch has already been attempted.
    """

    # Captures the quoted href value of an <a> tag.  Other attributes may
    # precede href, and tag/attribute case is ignored.  The whitespace that
    # is required before "href" keeps attributes such as data-href out.
    linkregex = re.compile(
        r'<a\s+(?:[^>]*?\s)?href\s*=\s*[\'"](.*?)[\'"]', re.IGNORECASE)

    # Captures the text between <title ...> and </title>, in any case and
    # across line breaks.
    titleregex = re.compile(
        r'<title(?:\s[^>]*)?>(.*?)</title>', re.IGNORECASE | re.DOTALL)

    def __init__(self):
        # Per-instance sets: class-level sets would be shared (and mutated)
        # by every crawler created in the same process.
        self.tocrawl = set()
        self.crawled = set()

    def setBaseURL(self, url):
        """Replace the pending queue with the single start URL *url*."""
        self.tocrawl = set([url])

    def run(self):
        """Prompt for a start URL and an output directory, then crawl."""
        url = raw_input('Enter an URL to crawl\n')
        self.setBaseURL(url)
        dir_path = raw_input(
            'Where should I put crawled HTML source files?\n')
        self.crawl(dir_path)

    def getTitle(self, html):
        """Return the contents of the first <title> element in *html*.

        Returns None when *html* has no complete <title>...</title> pair.
        """
        match = self.titleregex.search(html)
        if match is None:
            return None
        return match.group(1)

    def writeToFile(self, url):
        """Append *url* as one line to ``hyperlinks.txt``."""
        with open('hyperlinks.txt', 'a') as out:
            out.write(url + '\n')

    def writeHTML(self, fileName, html, dirPath):
        """Write *html* to ``<dirPath>/<sanitised fileName>.txt``.

        *dirPath* is created if it does not exist.  The characters
        ``\\ / : " * ? < > |`` are removed from *fileName* so that a URL
        can be used as a file name.
        """
        self.verifyDir(dirPath)
        # Raw string so the class really contains a backslash; the
        # non-raw form only escaped the slash and let backslashes through.
        safeName = re.sub(r'[\\/:"*?<>|]', '', fileName)
        # os.path.join instead of a hard-coded '\\' so the file lands
        # inside dirPath on every platform.
        with open(os.path.join(dirPath, safeName + '.txt'), 'w') as out:
            out.write(html)

    def verifyDir(self, path):
        """Create directory *path* (including parents) if it is missing."""
        if not os.path.exists(path):
            os.makedirs(path)

    def _resolveLink(self, base, link):
        """Return *link* as an absolute http(s) URL, or None to skip it.

        *link* is resolved against *base* (the URL of the page it was
        found on) and its #fragment is dropped, so in-page anchors map
        back to the page itself.  Links whose resolved scheme is not
        http or https (mailto:, javascript:, ...) yield None.
        """
        absolute = urlparse.urldefrag(
            urlparse.urljoin(base, link.strip()))[0]
        if urlparse.urlparse(absolute)[0] not in ('http', 'https'):
            return None
        return absolute

    def crawl(self, dir_path):
        """Fetch queued URLs until the queue is empty.

        For each page that can be fetched: save its HTML under *dir_path*,
        print its title, log its URL via writeToFile and queue every
        not-yet-attempted http(s) link found in it.  Pages that cannot be
        fetched are skipped.  URLs are taken from the queue in arbitrary
        (set) order.
        """
        while self.tocrawl:
            crawling = self.tocrawl.pop()
            print('\n\nStart Crawling - ' + crawling + '\n')
            # Record the attempt up front so an unreachable URL is not
            # queued again by every page that links to it.
            self.crawled.add(crawling)
            try:
                response = urllib2.urlopen(crawling)
                try:
                    msg = response.read()
                finally:
                    response.close()
            except Exception:
                # Best effort: skip pages that fail to download.  Unlike a
                # bare except, this still lets Ctrl-C stop the crawl.
                continue
            self.writeHTML(crawling, msg, dir_path)
            # Display page title
            print(self.getTitle(msg))
            self.writeToFile(crawling)
            for found in self.linkregex.findall(msg):
                link = self._resolveLink(crawling, found)
                if link is None or link in self.crawled:
                    continue
                print('----' + link)
                self.tocrawl.add(link)
[/sourcecode]
How to run this crawler?
Run the following commands in a Python shell:
import web_crawler
w = web_crawler.WebCrawler()
w.run()