Thursday, March 7, 2013

Python Email Crawler Class

[caption id="attachment_775" align="alignnone" width="202"]Email Crawler Email Crawler[/caption]

[sourcecode language="python"]
import os
import re

class EmailCrawler:
    """Collect e-mail addresses from every file below a directory.

    Typical use is ``EmailCrawler().run()``: it prompts for a directory,
    scans every file underneath it and writes the unique addresses found
    to ``emails.txt`` inside ``output_path`` (echoing them to stdout).

    The class is written to run unchanged on Python 2 and Python 3.
    """

    # Class-level fallback kept for backward compatibility; __init__ gives
    # each instance its own list so results are not shared between crawlers.
    email_list = []
    # The local part accepts dots, plus and hyphen so that an address such as
    # "john.doe@example.com" is not truncated to "doe@example.com".
    # NOTE: the domain part still only accepts letters, underscores and dots.
    emailregex = re.compile(r'[\w.+-]+[@][a-zA-Z_\.]+\.[a-zA-Z]{2,6}')
    # Directory that receives emails.txt (working directory at import time
    # unless overridden on the instance).
    output_path = os.getcwd()

    def __init__(self):
        """Create a crawler with an empty, instance-private result list."""
        self.email_list = []

    def setBaseURL(self, url):
        """Store ``url`` as the only element of the ``tocrawl`` set."""
        self.tocrawl = set([url])

    def run(self):
        """Prompt for a directory and crawl it if it exists."""
        try:
            read_line = raw_input  # Python 2
        except NameError:
            read_line = input  # Python 3
        directory = read_line('Enter the directory path for crawled files\n')
        # Only crawl when the directory check succeeds.
        if self.verifyDir(directory):
            self.crawl(directory)

    def output(self):
        """Write the collected addresses to ``emails.txt`` and print them.

        The file is created inside ``output_path``, one address per line.
        """
        target = os.path.join(self.output_path, "emails.txt")
        with open(target, "w") as out:
            for email in self.email_list:
                # Text mode translates "\n" itself; writing os.linesep here
                # would produce "\r\r\n" line endings on Windows.
                out.write(str(email) + "\n")
                print(email)

    def verifyDir(self, path):
        """Return True if ``path`` is an existing directory.

        Otherwise print a message and return False. The interpreter is not
        terminated, so the check is safe to use from an interactive shell.
        """
        if os.path.isdir(path):
            return True
        print("This directory does not exist")
        return False

    def crawl(self, dir_path):
        """Scan every file under ``dir_path`` and record new addresses.

        Addresses are appended to ``email_list`` in the order they are first
        seen. Files that cannot be read are reported and skipped. When the
        walk is finished the results are written out via ``output()``.
        """
        print("Crawling Email links in " + dir_path + "....\n\n")
        # Use the path as given when it is valid. The escape recovery of
        # get_raw_string() is only a fallback, because it also rewrites
        # quotes and would break a valid path such as "o'brien".
        if os.path.isdir(dir_path):
            root = dir_path
        else:
            root = self.get_raw_string(dir_path)

        seen = set(self.email_list)  # O(1) duplicate check
        for path, _subdirs, files in os.walk(root):
            for filename in files:
                file_path = os.path.join(path, filename)
                try:
                    with open(file_path, 'r') as handle:
                        html = handle.read()
                except (IOError, OSError, UnicodeDecodeError):
                    # One unreadable file must not abort the whole crawl.
                    print("Skipping unreadable file " + file_path)
                    continue
                for email in self.emailregex.findall(html):
                    if email not in seen:
                        seen.add(email)
                        self.email_list.append(email)
        self.output()

    def get_raw_string(self, text):
        """Return ``text`` with control characters and quotes spelled out.

        Each of the characters BEL, BS, FF, LF, CR, TAB and VT is replaced by
        its two-character backslash escape (for example a tab becomes a
        backslash followed by ``t``); single and double quotes are prefixed
        with a backslash. All other characters are copied unchanged.
        """
        escape_dict = {
            '\a': r'\a',
            '\b': r'\b',
            '\f': r'\f',
            '\n': r'\n',
            '\r': r'\r',
            '\t': r'\t',
            '\v': r'\v',
            '\'': r'\'',
            '\"': r'\"',
        }
        return ''.join(escape_dict.get(char, char) for char in text)

[/sourcecode]

How to run this email crawler?
Save the code above as email_crawler.py, then load and run it in the Python shell (in IDLE: Run >> Run Module) and enter:
import email_crawler
e = email_crawler.EmailCrawler()
e.run()

No comments:

Post a Comment

How to enable CORS in Laravel 5

https://www.youtube.com/watch?v=PozYTvmgcVE 1. Add middleware php artisan make:middleware Cors return $next($request) ->header('Acces...