from sys import argv
from os import makedirs, unlink, sep
from os.path import dirname, exists, isdir, splitext
from string import replace, find, lower
from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
from formatter import DumbWriter, AbstractFormatter
from cStringIO import StringIO


class Downloader(object):               # downloads a single Web page

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, defFile='index.htm'):
        # derive a local file path from the URL, falling back to defFile
        # when the URL names a directory rather than a file
        parsedUrl = urlparse(url, 'http:', 0)
        path = parsedUrl[1] + parsedUrl[2]
        ext = splitext(path)
        if ext[1] == '':                # no file extension: use default file
            if path[-1] == '/':
                path += defFile
            else:
                path += '/' + defFile
        localDir = dirname(path)
        if sep != '/':                  # OS-independent path separator
            localDir = replace(localDir, '/', sep)
        if not isdir(localDir):         # create archive directory if needed
            if exists(localDir):
                unlink(localDir)
            makedirs(localDir)
        return path

    def download(self):                 # download the page to the local file
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('***ERROR: invalid URL "%s"' % self.url)
        return retval

    def parseAndGetLinks(self):         # parse the saved HTML, return its links
        self.parser = HTMLParser(
            AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist


class NetCrawler(object):               # manages the entire crawl

    count = 0                           # static downloaded-page counter

    def __init__(self, url):
        self.queue = [url]              # URLs still to be visited
        self.seen = []                  # URLs already processed
        self.dom = urlparse(url)[1]     # crawl is restricted to this domain

    def getPage(self, url):
        dl = Downloader(url)
        retval = dl.download()
        if retval[0] == '*':            # error: do not parse
            print retval, '...skipping parse'
            return
        NetCrawler.count += 1
        print '\n(', NetCrawler.count, ')'
        print 'Url: ', url
        print 'File: ', retval[0]
        self.seen.append(url)

        links = dl.parseAndGetLinks()   # gather and process the page's links
        for eachLink in links:
            # make relative links absolute
            if eachLink[:4] != 'http' and find(eachLink, '://') == -1:
                eachLink = urljoin(url, eachLink)
            print '*', eachLink,

            if find(lower(eachLink), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue

            if eachLink not in self.seen:
                if find(eachLink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachLink not in self.queue:
                        self.queue.append(eachLink)
                        print '... new, added to queue'
                    else:
                        print '... discarded, already in queue'
            else:
                print '... discarded, already processed'

    def run(self):                      # process queued URLs until none remain
        while self.queue:
            url = self.queue.pop()
            self.getPage(url)

def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    netCrawler = NetCrawler(url)
    netCrawler.run()

if __name__ == '__main__':
    main()
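
# Usage sketch (an assumption, not part of the original listing): if this
# module is saved as, say, crawl.py, it can be run under Python 2 with a
# placeholder starting URL on the command line:
#
#     python crawl.py http://www.example.com/
#
# Pages are saved beneath a local directory named after the host, and only
# links within the starting domain are queued for further crawling.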