Free Code Snippets: web crawler

Thursday, March 7, 2013

Python Web Crawler Class

[caption id="attachment_775" align="alignnone" width="202"]

web crawler[/caption]

[sourcecode language="python"]
import sys
import re
import urllib2
import urlparse
import datetime
import os

class WebCrawler(object):
    """Simple web crawler that follows ``<a href=...>`` links (Python 2).

    Starting from a base URL, every reachable page is downloaded, its HTML
    is saved to a text file in a user-chosen directory, its title is
    printed, and its URL is appended to ``hyperlinks.txt`` in the current
    working directory.

    Instance attributes:
        tocrawl: set of URLs still waiting to be fetched.
        crawled: set of URLs that were fetched successfully.
    """

    # Captures the href value of anchors whose first attribute is href.
    linkregex = re.compile(r"""<a\s*href=['"](.*?)['"].*?>""")

    # Characters stripped from a URL before it is used as a file name.
    _unsafeFileChars = re.compile(r'[\\/:"*?<>|]')

    def __init__(self):
        # Per-instance sets: as class attributes they would be shared (and
        # mutated) by every crawler object.
        self.tocrawl = set()
        self.crawled = set()

    def setBaseURL(self, url):
        """Reset the crawl queue so that it contains only *url*."""
        self.tocrawl = set([url])

    def run(self):
        """Prompt for a start URL and an output directory, then crawl."""
        url = raw_input('Enter an URL to crawl\n')
        self.setBaseURL(url)
        targetDir = raw_input('Where should I put crawled HTML source files?\n')
        self.crawl(targetDir)

    def getTitle(self, html):
        """Return the text between the first ``<title>`` and ``</title>``.

        The search is case-sensitive. Returns ``None`` when either tag is
        missing.
        """
        openTag = '<title>'
        startPos = html.find(openTag)
        if startPos == -1:
            return None
        startPos += len(openTag)
        endPos = html.find('</title>', startPos)
        if endPos == -1:
            return None
        return html[startPos:endPos]

    def writeToFile(self, url):
        """Append *url* as one line to ``hyperlinks.txt`` in the cwd."""
        with open('hyperlinks.txt', 'a') as out:
            out.write(url + '\n')

    def writeHTML(self, fileName, html, dirPath):
        """Save *html* to ``<dirPath>/<sanitized fileName>.txt``.

        *dirPath* is created if it does not exist. Characters that are not
        allowed in Windows file names are removed from *fileName*.
        """
        self.verifyDir(dirPath)
        safeName = self._unsafeFileChars.sub('', fileName)
        # os.path.join picks the right separator on every platform; a
        # hard-coded backslash only works on Windows.
        with open(os.path.join(dirPath, safeName + '.txt'), 'w') as out:
            out.write(html)

    def verifyDir(self, path):
        """Create directory *path* (including parents) if it is missing."""
        if not os.path.exists(path):
            os.makedirs(path)

    def _resolveLink(self, base, link):
        """Turn an href found on page *base* into an absolute URL.

        Returns ``None`` for links that are not http(s) (mailto:,
        javascript:, ...). The fragment is dropped so that same-page
        anchors map back to the page itself instead of being queued as
        new URLs.
        """
        absolute = urlparse.urldefrag(urlparse.urljoin(base, link.strip()))[0]
        if urlparse.urlparse(absolute)[0] not in ('http', 'https'):
            return None
        return absolute

    def crawl(self, dir_path):
        """Fetch every queued URL until the queue is empty.

        Each fetched page is written into *dir_path*, its title is printed,
        its URL is recorded, and the links it contains are queued unless
        they were already crawled. URLs that cannot be fetched are reported
        and skipped. URLs are taken from the queue in no particular order.
        """
        while self.tocrawl:
            crawling = self.tocrawl.pop()
            print('\n\nStart Crawling - ' + crawling + '\n')
            try:
                response = urllib2.urlopen(crawling)
                try:
                    msg = response.read()
                finally:
                    response.close()
            except Exception as exc:
                # Best effort: one unreachable or malformed URL must not
                # abort the crawl, but Ctrl-C still propagates.
                print('Skipping ' + crawling + ': ' + repr(exc))
                continue
            self.writeHTML(crawling, msg, dir_path)

            # Display page title
            print(self.getTitle(msg))

            self.crawled.add(crawling)
            self.writeToFile(crawling)
            for link in self.linkregex.findall(msg):
                target = self._resolveLink(crawling, link)
                if target is not None and target not in self.crawled:
                    print('----' + target)
                    self.tocrawl.add(target)

[/sourcecode]

How to run this crawler?

Save the code above as web_crawler.py, then run the following commands in the Python shell.


import web_crawler
w = web_crawler.WebCrawler()
w.run()

Thursday, March 7, 2013

Python Web Crawler Class

How to enable CORS in Laravel 5