[sourcecode language="python"]
import sys
import re
import urllib2
import urlparse
import datetime
import os
class WebCrawler:
    """Minimal web crawler (Python 2: relies on ``urllib2`` and ``urlparse``).

    Starting from a base URL, each page is fetched, its HTML is saved into an
    output directory, its URL is appended to ``hyperlinks.txt`` and every
    ``<a href=...>`` link found in it is queued for crawling.
    """

    # Captures the href value of an anchor tag. The character class holds only
    # the two quote characters; a '|' inside [...] is a literal, not an "or".
    linkregex = re.compile(r'<a\s*href=[\'"](.*?)[\'"].*?>')

    def __init__(self):
        # Per-instance state: class-level sets would be shared (and mutated)
        # by every WebCrawler instance.
        self.tocrawl = set()   # URLs waiting to be fetched
        self.crawled = set()   # URLs already fetched successfully

    def setBaseURL(self, url):
        """Reset the crawl queue so that it contains only ``url``."""
        self.tocrawl = set([url])

    def run(self):
        """Ask for a start URL and an output directory, then crawl."""
        url = raw_input('Enter an URL to crawl\n')
        self.setBaseURL(url)
        outputDir = raw_input('Where should I put crawled HTML source files?\n')
        self.crawl(outputDir)

    def getTitle(self, html):
        """Return the text between ``<title>`` and ``</title>`` in ``html``.

        Returns None when either tag is missing. The search is case-sensitive.
        """
        openTag = '<title>'
        startPos = html.find(openTag)
        if startPos == -1:
            return None
        startPos += len(openTag)
        endPos = html.find('</title>', startPos)
        if endPos == -1:
            return None
        return html[startPos:endPos]

    def writeToFile(self, url):
        """Append ``url`` as one line to ``hyperlinks.txt`` in the working directory."""
        with open('hyperlinks.txt', 'a') as out:
            out.write(url + '\n')

    def writeHTML(self, fileName, html, dirPath):
        """Write ``html`` to ``<dirPath>/<sanitised fileName>.txt``.

        ``dirPath`` is created when missing. The characters ``\\ / : " * ? < > |``
        are removed from ``fileName`` before it is used as a file name.
        """
        self.verifyDir(dirPath)
        # Raw string so the regex really sees an escaped backslash; the
        # non-raw form only escaped the '/' and left backslashes in the name.
        safeName = re.sub(r'[\\/:"*?<>|]', '', fileName)
        # os.path.join picks the right separator for the current platform.
        targetPath = os.path.join(dirPath, safeName + '.txt')
        with open(targetPath, 'w') as out:
            out.write(html)

    def verifyDir(self, path):
        """Create directory ``path`` (including parents) if it does not exist.

        An empty ``path`` means the current directory and is left alone.
        """
        if path and not os.path.exists(path):
            os.makedirs(path)

    def _resolveLink(self, pageUrl, link):
        """Return ``link`` as an absolute URL relative to ``pageUrl``.

        Returns None for ``mailto`` links, which cannot be crawled.
        """
        if link.startswith('mailto'):
            return None
        # urljoin keeps the page's scheme and resolves relative paths,
        # root-relative paths and '#fragment' links against the page URL.
        return urlparse.urljoin(pageUrl, link)

    def crawl(self, dir_path):
        """Fetch every queued URL until the queue is empty.

        Each fetched page is saved under ``dir_path``, its title is printed,
        its URL is recorded in ``hyperlinks.txt`` and its links are queued
        unless they were already crawled. URLs that cannot be opened are
        skipped.
        """
        while self.tocrawl:
            crawling = self.tocrawl.pop()
            print('\n\nStart Crawling - ' + crawling + '\n')
            try:
                response = urllib2.urlopen(crawling)
            except Exception:
                # Best effort: skip pages that cannot be opened, but do not
                # swallow KeyboardInterrupt/SystemExit like a bare except.
                continue
            try:
                msg = response.read()
            finally:
                response.close()
            self.writeHTML(crawling, msg, dir_path)
            # Display page title
            print(self.getTitle(msg))
            links = self.linkregex.findall(msg)
            self.crawled.add(crawling)
            self.writeToFile(crawling)
            for link in links:
                link = self._resolveLink(crawling, link)
                if link is None or link in self.crawled:
                    continue
                print('----' + link)
                self.tocrawl.add(link)
[/sourcecode]
How to run this crawler?
Run the following commands in the Python shell.
# Requires the crawler code to be importable as the module web_crawler.
import web_crawler
w = web_crawler.WebCrawler()
# Prompts for the start URL and the output directory, then starts crawling.
w.run()
I blog quite often and I truly appreciate your content.
This great article has truly piqued my interest. I'm going to bookmark your website and keep checking for new details about once a week. I opted in for your RSS feed too.
Hi,
Thanks for commenting. If you really need to create a nice blog, you can use ready-to-use templates. If you're going to host your blog yourself in WordPress, there are no limits: you can select and upload a theme yourself. But if you're going to use a wordpress.com blog like this one, you have fewer options. However, wordpress.com also offers some pre-built templates. This blog uses one such template.
Anyway, I'm not sure whether this clears up your doubts.
Hello, I think your blog could possibly be having internet browser compatibility issues.
Whenever I take a look at your site in Safari, it looks fine, but when opening it in I.E., it's got some overlapping issues.
I just wanted to give you a quick heads up!
Aside from that, excellent website!
Hi,
Thanks for commenting. Following your details, I checked http://codezone4.wordpress.com in Internet Explorer, including some previous versions, but I could not find any compatibility issues, so I cannot agree with this. I'm highly confident that the site works fine even on mobile devices. Can you please mention the IE version you tested it with?
Thanks a bunch for sharing this with all of us; you
actually recognise what you are speaking about! Bookmarked.
Kindly also seek advice from my website =). We can have a link exchange agreement among us