'''
U{Ruya<http://ruya.sourceforge.net/>} (I{an Arabic name meaning "sight, vision"}) is a Python-based crawler for crawling English and Japanese websites.
B{It is targeted solely towards developers who want crawling functionality in their code}.
Some important features of this tool are:
 - Extensively uses U{kconv<http://apache.noexistent.com/~mak/kconv/kconv/index_jp.html>} to convert all html content into UTF-8 encoding.
 - Provides level-based and scope-based crawling per website.
 - Supports configuration objects to suit a variety of crawling requirements.
 - Can provide contents both as gzipped and bz2 archives, in UTF-8 encoding.
 - Can extract meta description, keywords, links, last-modified and e-tag headers from a url.
 - Supports event-based callbacks that allow the caller to seamlessly integrate with, and control, which urls are crawled.
 - Provides detailed logging support to understand and evaluate the exact crawling process.

Example::
   #!/usr/bin/env python
   #-*- coding: UTF-8 -*-

   import ruya

   def test():
      url= 'http://www.python.org/'

      # Create a Document instance representing the start url
      doc= ruya.Document(ruya.Uri(url))

      # Create a new crawler configuration object
      cfg= ruya.Config(ruya.Config.CrawlConfig(levels= 1, crawldelay= 5), ruya.Config.RedirectConfig(), ruya.Config.LogConfig())

      # Use a single-domain breadth-first crawler with the crawler configuration
      c= ruya.SingleDomainDelayCrawler(cfg)

      # The crawler raises the following events before and after crawling a url.
      # Set up callbacks pointing to custom methods where we can control whether to crawl or ignore a url, e.g. to skip duplicates.
      c.bind('beforecrawl', beforecrawl, None)
      c.bind('aftercrawl', aftercrawl, None)
      c.bind('includelink', includelink, None)

      # Start crawling
      c.crawl(doc)

      # Check whether any errors occurred during the crawl of the start url
      if(None!= doc.error):
         print `doc.error.type`+ ': '+ `doc.error.value`

   # This callback is invoked by the Ruya crawler before a url is included in the list of urls to be crawled.
   # We can choose to ignore the url based on our custom logic.
   def includelink(caller, eventargs):
      uri= eventargs.uri
      level= eventargs.level
      print 'includelink(): Include "%(uri)s" to crawl on level %(level)d?' %locals()

   # Before a url is actually crawled, Ruya invokes this callback to ask whether to crawl the url or not.
   # We can choose to ignore the url based on our custom logic.
   def beforecrawl(caller, eventargs):
      uri= eventargs.document.uri
      print 'beforecrawl(): "%(uri)s" is about to be crawled...' %locals()

   # After a url is crawled, Ruya invokes this callback where we can check the crawled values of the url.
   def aftercrawl(caller, eventargs):
      doc= eventargs.document
      uri= doc.uri

      print 'Url: '+ uri.url
      print 'Title: '+ doc.title
      print 'Description: '+ doc.description
      print 'Keywords: '+ doc.keywords
      print 'Last-modified: '+ doc.lastmodified
      print 'Etag: '+ doc.etag

      # Check if any errors occurred during the crawl of this url
      if(None!= doc.error):
         print 'Error: '+ `doc.error.type`
         print 'Value: '+ `doc.error.value`

      print 'aftercrawl(): "%(uri)s" has finished crawling...' %locals()

   if('__main__'== __name__):
      # Test the Ruya crawler
      test()

For bugs, suggestions, or feedback, please report to the author.
@todo: epydoc-3.0beta1 doesn't support @rtype, @returns for property() yet?
'''

__author__ = 'NAIK Shantibhushan<qqbb65v59@world.ocn.ne.jp>'
__version__ = '1.0'
__date__ = '2007-May-06 1441H'
__copyright__ = 'Copyright (c) 2005 NAIK Shantibhushan<qqbb65v59@world.ocn.ne.jp>'
__license__ = 'Python'

import sys, re, string, os, time, random
import httplib, urllib2, urlparse, sha, kconv, gzip, bz2, htmldata
from os import path
from robotparser import RobotFileParser
from sgmllib import SGMLParser
from StringIO import StringIO
99
101 '''
102 Ruya's configuration object to determine which scope will be used for a website while crawling
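
A scope is normally selected by passing one of these constants to L{Config.CrawlConfig}; the following is a minimal usage sketch (only names defined in this module are used).

Example::
 import ruya

 # Restrict the crawl to urls under the start url's folder
 cc= ruya.Config.CrawlConfig(crawlscope= ruya.CrawlScope.SCOPE_PATH)

 # CrawlConfig falls back to SCOPE_HOST for unknown values, because it
 # validates the requested value with CrawlScope.isvalidscope()
 print ruya.CrawlScope.isvalidscope(ruya.CrawlScope.SCOPE_PATH) # True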
103 '''
104
105
106 SCOPE_ALL= 100000
107
108 SCOPE_HOST= 100001
109
110 SCOPE_DOMAIN= 100002
111
112 SCOPE_PATH= 100003
113
114 @classmethod
116 '''
117 Checks if the scope is valid - one of allowed scopes in L{CrawlScope}
118
119 @type scope: U{number<http://docs.python.org/lib/typesnumeric.html>}
120 @param scope: A valid crawl scope.
121
122 @rtype: U{boolean<http://docs.python.org/lib/truth.html>}
@returns: B{True} if the crawl scope is valid, B{False} otherwise.
124 '''
125 return (scope in (CrawlScope.SCOPE_HOST, CrawlScope.SCOPE_DOMAIN, CrawlScope.SCOPE_PATH))
126
128 '''
129 Ruya's L{Crawler} uses configuration objects to determine various settings during crawl.
130 It covers
131 - What options to use during crawl - L{Config.CrawlConfig}
132 - How to handle redirects while crawl - L{Config.RedirectConfig}
133 - Where to output descriptive log messages during crawl - L{Config.LogConfig}
134
135 This class simply groups them under a single class for ease of maintenance and usage.
136 It enables a developer to have different configuration profiles, and use them best suited to the requirements.
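
Example (a minimal sketch of two such profiles; all values are illustrative only)::
 import ruya

 # A polite profile: shallow crawl, long delay between requests
 politecfg= ruya.Config(ruya.Config.CrawlConfig(levels= 1, crawldelay= 120),
                        ruya.Config.RedirectConfig(),
                        ruya.Config.LogConfig())

 # A faster profile for a site you own: deeper crawl, shorter delay, fewer redirects followed
 fastcfg= ruya.Config(ruya.Config.CrawlConfig(levels= 2, crawldelay= 5),
                      ruya.Config.RedirectConfig(maxredirects= 5),
                      ruya.Config.LogConfig())

 crawler= ruya.SingleDomainDelayCrawler(politecfg)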
137 '''
138
139 - def __init__(self, crawlconfig= None, redirectconfig= None, logconfig= None):
140 '''
141 Constructor.
142
143 @note: Please refer to B{Instance Variables} section for details on each parameter.
144 @rtype: U{None<http://docs.python.org/lib/bltin-null-object.html>}
145 @returns: U{None<http://docs.python.org/lib/bltin-null-object.html>}
146 '''
147 self.crawlconfig= crawlconfig
148 self.redirectconfig= redirectconfig
149 self.logconfig= logconfig
150
152 '''
153 Ruya's crawler configuration object stores settings that are specific during a crawl.
It supports all the settings needed for a polite, well-behaved U{crawler<http://en.wikipedia.org/wiki/Web_crawler>}.
155 '''
156
157 - def __init__(self,
158 useragent= 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.0.3705; .NET CLR 2.0.50727; .NET CLR 1.1.4322)',
159 crawlfrom= '',
160 obeyrobotstxt= True,
161 obeymetarobots= True,
162 acceptencoding= 'gzip, deflate',
163 crawldelay= 120,
164 crawlscope= CrawlScope.SCOPE_HOST,
165 allowedmimes= ['text/html'],
166 allowedextns= ['', '.htm', '.html', '.cgi', '.php', '.jsp', '.cfm', '.asp', '.aspx', '.live', '.do'],
167 levels= 2,
168 maxcontentbytes= 500000,
169 maxcontenttruncate= True,
170 maxretries= 3,
171 retrydelay= 120):
172 '''
173 Constructor.
174 Provides default values for all settings.
175
176 @note: Please refer to B{Instance Variables} section for details on each parameter.
177 @rtype: U{None<http://docs.python.org/lib/bltin-null-object.html>}
178 @returns: U{None<http://docs.python.org/lib/bltin-null-object.html>}
179 '''
180 self.useragent= useragent
181 self.crawlfrom= crawlfrom
182 self.obeyrobotstxt= obeyrobotstxt
183 self.obeymetarobots= obeymetarobots
184 self.acceptencoding= acceptencoding
185 self.crawldelay= crawldelay
186
187 self.crawlscope= CrawlScope.isvalidscope(crawlscope) and crawlscope or CrawlScope.SCOPE_HOST
188 self.allowedmimes= allowedmimes
189 self.allowedextns= allowedextns
190 self.levels= levels
191 self.maxcontentbytes= maxcontentbytes
192 self.maxretries= maxretries
193 self.retrydelay= retrydelay
194
195 self.maxcontenttruncate= maxcontenttruncate
196 self.maxcontentdiscard= not maxcontenttruncate
197
199 '''
200 Ruya's redirect configuration object stores settings specific to handling redirects during a crawl.
201 '''
202 - def __init__(self, allowredirect= True, maxredirects= 10):
203 '''
204 Constructor.
205 Provides default values for all settings.
206
207 @note: Please refer to B{Instance Variables} section for details on each parameter.
208 @rtype: U{None<http://docs.python.org/lib/bltin-null-object.html>}
209 @returns: U{None<http://docs.python.org/lib/bltin-null-object.html>}
210 '''
211 self.allowredirect= allowredirect
212 self.maxredirects= maxredirects
213
215 '''
216 Ruya's logging configuration object stores pointers to user-defined U{logging functions<http://docs.python.org/lib/module-logging.html>}.
217 For each different level of logging, Ruya invokes the method pointer, and outputs descriptive operation messages during a crawl.
218 The different logging functions are based on U{Python's own logging module<http://docs.python.org/lib/module-logging.html>}.
219
Sample log output (when using U{Python's own logging module<http://docs.python.org/lib/module-logging.html>})::
221 2007-04-15 20:05:21,421 ruya.py 3516 2112 stderrlog 10 DEBUG SingleDomainDelayCrawler.crawl(): Started...
222 2007-04-15 20:05:21,421 ruya.py 3516 2112 stderrlog 10 DEBUG SingleDomainDelayCrawler.crawl(): Starting to crawl url "http://webryblog.biglobe.ne.jp/themeindex.html" on level 0 upto max. 2 level(s)...
223 2007-04-15 20:05:21,421 ruya.py 3516 2112 stderrlog 10 DEBUG SingleDomainDelayCrawler.crawlbreadth(): Started...
224 2007-04-15 20:05:21,421 ruya.py 3516 2112 stderrlog 10 DEBUG SingleDomainDelayCrawler.crawlbreadth(): Crawling url "http://webryblog.biglobe.ne.jp/themeindex.html" at level 0...
225 2007-04-15 20:05:21,421 ruya.py 3516 2112 stderrlog 10 DEBUG Crawler.crawl(): Started...
226 2007-04-15 20:05:21,421 ruya.py 3516 2112 stderrlog 10 DEBUG Crawler.beforecrawl(): Started...
227 2007-04-15 20:05:21,421 ruya.py 3516 2112 stderrlog 10 DEBUG Crawler.beforecrawl(): Firing events before crawling of url "http://webryblog.biglobe.ne.jp/themeindex.html" at level 0...
228 2007-04-15 20:05:21,421 links.py 3516 2112 stderrlog 10 DEBUG SiteLinksExtractor.beforecrawl(): Setting document attributes for url "http://webryblog.biglobe.ne.jp/themeindex.html" on level 0...
229 2007-04-15 20:05:21,437 ruya.py 3516 2112 stderrlog 10 DEBUG Crawler.beforecrawl(): HEAD Crawling url "http://webryblog.biglobe.ne.jp/themeindex.html"...
230 2007-04-15 20:05:21,437 ruya.py 3516 2112 stderrlog 10 DEBUG Crawler.beforecrawl(): Obeying "http://webryblog.biglobe.ne.jp/robots.txt" for url "http://webryblog.biglobe.ne.jp/themeindex.html" using "User-agent: <Your user-agent string here>"...
231 2007-04-15 20:05:21,483 ruya.py 3516 2112 stderrlog 10 DEBUG Crawler.beforecrawl(): Crawling allowed for url "http://webryblog.biglobe.ne.jp/themeindex.html" using "User-agent: <Your user-agent string here>" as per rules in "http://webryblog.biglobe.ne.jp/robots.txt"...
232 2007-04-15 20:05:21,500 ruya.py 3516 2112 stderrlog 10 DEBUG Crawler.beforecrawl(): Completed. Returning (httpstatus= 200, cancel= False, ignore= False)...
233 2007-04-15 20:05:21,500 ruya.py 3516 2112 stderrlog 10 DEBUG Crawler.aftercrawl(): Started...
234 2007-04-15 20:05:21,500 ruya.py 3516 2112 stderrlog 10 DEBUG Crawler.aftercrawl(): 200 OK for url "http://webryblog.biglobe.ne.jp/themeindex.html"...
235 2007-04-15 20:05:21,500 ruya.py 3516 2112 stderrlog 10 DEBUG Crawler.aftercrawl(): GET Crawling url "http://webryblog.biglobe.ne.jp/themeindex.html"...
236 ...
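
The callbacks can also be pointed at U{Python's own logging module<http://docs.python.org/lib/module-logging.html>}; a minimal sketch (the logger name "ruya" below is illustrative only)::
 import logging, ruya

 logging.basicConfig(level= logging.DEBUG)
 logger= logging.getLogger('ruya')

 logcfg= ruya.Config.LogConfig(log= logger.info,
                               debug= logger.debug,
                               info= logger.info,
                               warning= logger.warning,
                               error= logger.error,
                               critical= logger.critical,
                               exception= logger.exception)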
237 '''
238
239 - def __init__(self,
240 log= lambda msg: sys.stderr.write(msg+ '\n'),
241 debug= lambda msg: sys.stderr.write(msg+ '\n'),
242 info= lambda msg: sys.stderr.write(msg+ '\n'),
243 warning= lambda msg: sys.stderr.write(msg+ '\n'),
244 error= lambda msg: sys.stderr.write(msg+ '\n'),
245 critical= lambda msg: sys.stderr.write(msg+ '\n'),
246 exception= lambda msg: sys.stderr.write(msg+ '\n')):
247 '''
248 Constructor.
249
250 @note: Please refer to B{Instance Variables} section for details on each parameter.
251 @rtype: U{None<http://docs.python.org/lib/bltin-null-object.html>}
252 @returns: U{None<http://docs.python.org/lib/bltin-null-object.html>}
253 '''
254 self.log= log
255 self.debug= debug
256 self.info= info
257 self.warning= warning
258 self.error= error
259 self.critical= critical
260 self.exception= exception
261
263 '''
Ruya's Uri object encapsulates an http url used while crawling.
It provides ready-to-use methods to obtain the robots.txt path and the domains of a url, and to perform L{scope<CrawlScope>} checks on two urls.
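
Example (a short, illustrative sketch of these helpers; the urls are arbitrary)::
 import ruya

 u1= ruya.Uri('http://www.python.org/about/index.html')
 u2= ruya.Uri('http://python.org/doc/')

 print u1.domainurl        # http://www.python.org/
 print u1.robotstxturl     # http://www.python.org/robots.txt
 print u1.issamedomain(u2) # True; www.python.org and python.org map to the same domain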
266 '''
267
269 '''
270 Constructor.
271
272 @type url: U{str<http://docs.python.org/lib/typesseq.html>}
273 @param url: The actual url to be used for representation.
274
275 @rtype: U{None<http://docs.python.org/lib/bltin-null-object.html>}
276 @returns: U{None<http://docs.python.org/lib/bltin-null-object.html>}
277 '''
278 if(None== url): url= ''
279 self.url= urlparse.urldefrag(url)[0]
280 self.hash= sha.new(url).hexdigest()
281
282 - def getDomainUrl(self):
283 '''
284 Returns the domain found after analyzing the url.
285 '''
286 u= urlparse.urlsplit(self.url)
287 ru= u.scheme+ '://'+ u.netloc+ '/'
288 return ru
289
291 '''
292 Returns the robots.txt path for a url.
Usually, I{http://domain.ext/} has robots.txt placed in its root as B{I{http://domain.ext/robots.txt}}.
294 '''
295 rt= self.getDomainUrl()+ 'robots.txt'
296 return rt
297
298 - def getDomains(self):
299 '''
300 Returns valid domains found after analyzing the url.
I{http://www.domain.ext/} and I{http://domain.ext/} both point to the same domain B{I{domain.ext}}, so they must be considered the same.
302 This function assists the crawler when determining if two urls are from L{same domain<issamedomain>}.
303 '''
304 dms= []
305
306 u= urlparse.urlsplit(self.url)
307 if(None!= u.hostname):
308 dms.extend([u.hostname])
309 if(-1!= u.hostname.find('www.')):
310 dms.extend([u.hostname.replace('www.', '')])
311
312 return dms
313
315 '''
316 Returns valid SHA hashes for url string.
317 Two different hashes will be returned if url domain starts with B{www} as I{http://www.domain.ext/} and I{http://domain.ext/} both point to the same domain B{I{domain.ext}}.
318 '''
319 hashes= [sha.new(self.url).hexdigest(), sha.new(self.url.replace('www.', '')).hexdigest()]
320 return hashes
321
323 '''
324 Returns a tuple consisting of various parts of a url.
325
326 @see: U{urlparse<http://docs.python.org/lib/module-urlparse.html>}
327 '''
328 return urlparse.urlparse(self.url)
329
330 parts= property(fget= getParts)
331 domainurl= property(fget= getDomainUrl)
332 robotstxturl= property(fget= getRobotsTxtUrl)
333 domains= property(fget= getDomains)
334 hashes= property(fget= getHashes)
335
336 - def join(self, uri):
337 '''
338 Joins two Uri objects and returns a new Uri object.
339
340 @rtype: L{Uri}
341 @returns: Joined L{Uri} instance.
342 '''
343 return Uri(urlparse.urljoin(self.url, uri.url))
344
345 - def issamedomain(self, uri):
346 '''
Determines if two urls belong to the same domain.
348 - I{http://B{domain.ext}/page1.htm} has same domain as I{http://B{domain.ext}/page2.htm}.
349 - I{http://B{domain.ext}/page1.htm} has same domain as I{http://B{www.domain.ext}/page2.htm} since I{http://www.domain.ext/} and I{http://domain.ext/} both point to the same domain B{I{domain.ext}}.
350
351
352 @type uri: L{Uri}.
353 @param uri: Valid instance of L{Uri} object.
354 @rtype: U{boolean<http://docs.python.org/lib/truth.html>}
355 @returns: True if urls belong to the same domain.
356 '''
357 result= False
358
359 if(None!= uri):
360
361 result= (self.domainurl.lower()== uri.domainurl.lower())
362
363 if(not result):
364
365 domains= [domain for domain in uri.domains if domain in self.domains]
366 result= (1<= len(domains))
367
368 return result
369
371 '''
372 Determines if two urls belong to the same host
373 - I{http://B{domain.ext}/page1.htm} has same host (domain) as I{http://B{domain.ext}/page2.htm}
374 - I{http://B{domain.ext}/page1.htm} does not have same host (domain) as I{http://B{otherdomain.ext}/page2.htm}
@see: L{issamedomain}
376
377 @type uri: L{Uri}.
378 @param uri: Valid instance of L{Uri} object.
379 @rtype: U{boolean<http://docs.python.org/lib/truth.html>}
380 @returns: True if urls belong to the same domain (host).
381 '''
382 return self.issamedomain(uri)
383
384 - def isdomainscope(self, uri):
385 '''
Determines if two urls have the same domain, or if either url comes from a sub-domain of the other url.
387 - I{http://B{domain.ext}/page1.htm} comes from the same domain as I{http://B{domain.ext}/page2.htm}
388 - I{http://B{example.domain.ext}/page1.htm} comes from a sub-domain as I{http://B{domain.ext}/page2.htm}.
389 B{example.domain.ext} is a sub-domain of B{domain.ext}.
390 - I{http://B{domain.ext}/page1.htm} does not come from same domain, or sub-domain as I{http://B{otherdomain.ext}/page2.htm}
391
392 @note: Sub-domain is simply determined if B{example.domain.ext} I{ends} in B{domain.ext}.
393 @rtype: U{boolean<http://docs.python.org/lib/truth.html>}
394 @returns: True if urls belong to the same domain or either of the urls comes from a sub-domain of the other url.
395 '''
396 result= False
397
398 if(None!= uri):
399
400 d1= self.parts.hostname.replace('www', '')
401 d2= d1.split('.')
402
403 basedomain= d1
404
405
406
407 otherdomain= uri.parts.hostname and uri.parts.hostname or ''
408
409
410 result= otherdomain.endswith(basedomain)
411
412 return result
413
415 '''
416 Determines if two urls belong to the same folder.
417 - I{http://B{domain.ext/support}/page1.htm} belongs to the same folder B{support} as I{http://B{domain.ext/support}/page2.htm}
418 - I{http://B{domain.ext}/index.htm} does not belong to same folder B{support} as I{http://B{domain.ext/support}/page2.htm}
419
420 @type uri: L{Uri}.
421 @param uri: Valid instance of L{Uri} object.
422 @rtype: U{boolean<http://docs.python.org/lib/truth.html>}
423 @returns: True if urls belong to the same folder.
424 '''
425 result= False
426
if(None!= uri):
428
429 basepath= self.parts.netloc+ os.path.split(self.parts.path)[0]
430
431 otherpath= uri.parts.netloc+ uri.parts.path
432
433
434 result= otherpath.startswith(basepath)
435
436 return result
437
439 '''
440 String representation of the url.
441
442 @rtype: U{str<http://docs.python.org/lib/typesseq.html>}
443 @returns: String representation of the url.
444 '''
445 return self.url
446
447 __repr__= __str__
448
449
451 '''
452 Determines if two urls are identical by comparing their SHA hashes
453
454 @type uri: L{Uri}.
455 @param uri: Valid instance of L{Uri} object.
456 @rtype: U{boolean<http://docs.python.org/lib/truth.html>}
457 @returns: True if urls are identical, False otherwise.
458 '''
459 if(None== uri):
460 return False
461 else:
462 return (uri.hash== self.hash)
463
465 '''
466 Determines if two urls are not identical by comparing their SHA hashes
467
468 @type uri: L{Uri}.
469 @param uri: Valid instance of L{Uri} object.
470 @rtype: U{boolean<http://docs.python.org/lib/truth.html>}
471 @returns: True if urls are not identical, False otherwise.
472 '''
473 if(None== uri):
474 return True
475 else:
476 return (uri.hash!= self.hash)
477
479 '''
480 Ruya's document object represents an html document.
It provides ready-to-use access to the document's http headers and various other properties such as the title, keywords etc.
It also allows access to the plain html contents, as well as gzipped or bz2 archived versions of them.
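
Example (a brief sketch; the attributes are populated by a L{Crawler} after a successful crawl, and the url is illustrative only)::
 import ruya

 cfg= ruya.Config(ruya.Config.CrawlConfig(levels= 1), ruya.Config.RedirectConfig(), ruya.Config.LogConfig())
 doc= ruya.Document(ruya.Uri('http://www.python.org/'))
 ruya.Crawler(cfg).crawl(doc)

 print doc.title, doc.lastmodified, doc.etag
 print len(doc.plaincontent)  # html converted to UTF-8
 print len(doc.zippedcontent) # the same content, gzip-compressed on demand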
483 '''
484
485 - def __init__(self, uri, lastmodified= '', etag= ''):
486 '''
487 Constructor.
488
489 @type uri: L{Uri}.
490 @param uri: Valid instance of L{Uri} object.
491 @type lastmodified: U{str<http://docs.python.org/lib/typesseq.html>}
492 @param lastmodified: I{Last-modified} header value obtained from last crawl, if any.
493 @type etag: U{str<http://docs.python.org/lib/typesseq.html>}
494 @param etag: I{Etag} header value obtained from last crawl, if any.
495
496 @rtype: U{None<http://docs.python.org/lib/bltin-null-object.html>}
497 @returns: U{None<http://docs.python.org/lib/bltin-null-object.html>}
498 '''
499 self.headers= {}
500 self._uri= uri
501 self.title= ''
502 self.description= ''
503 self.keywords= ''
504 self.lastmodified= lastmodified
505 self.etag= etag
506 self.httpstatus= 200
507 self.httpreason= ''
508 self.contenttype= ''
509 self.contentencoding= ''
510 self._zippedcontent= ''
511 self._isZipped= False
512 self._bzippedcontent= ''
513 self._isBzipped= False
514 self._plaincontent= ''
515 self.links= []
516 self.redirecturi= None
517 self.redirects= 0
518 self.redirecturis= []
519 self.error= None
520
521
522 self._cleandata= re.compile('(?msix)[\r\n\f\v]+')
523
525 '''
526 Returns the url for this document.
527 '''
528 return self._uri
529
530 uri= property(fget= getUri)
531
533 '''
Returns all links from this document, converted to absolute links with reference to the document's L{uri}.
535 '''
536 nlinks= []
537 for link in self.links:
538 nlinks.extend([self.uri.join(link)])
539
540 return nlinks
541
542 normalizedlinks= property(fget= getNormalizedLinks)
543
545 '''
546 Returns gzipped content for this document.
547 @note: The content is gzipped with the maximum compression level of 9.
548 @see: U{gzip<http://docs.python.org/lib/module-gzip.html>}
549 '''
550 zc= ''
551
552 if(self._isZipped):
553
554 zc= self._zippedcontent
555
556 else:
557 if(0!= len(self.plaincontent)):
558 try:
559 pc= StringIO()
560 pcz= gzip.GzipFile(None, 'wb', 9, pc)
561 pcz.write(self.plaincontent)
562 pcz.close()
563 zc= pc.getvalue()
564
565
566 self._zippedcontent= zc
567 self._isZipped= True
568
569 except:
570 zc= ''
571
572 return zc
573
574 - def setZippedContent(self, data):
575 '''
576 Sets the gzipped content for the document.
577 @note: The content is unzipped assuming the compression level of 9.
578 @see: U{gzip<http://docs.python.org/lib/module-gzip.html>}
579 '''
580 if(None== data): data= ''
581
582 self.plaincontent= ''
583 self._zippedcontent= ''
584
585 if(0!= len(data)):
586 try:
587 pcz= StringIO(data)
588 zc= gzip.GzipFile(None, 'rb', 9, pcz)
589 self.plaincontent= zc.read()
590 zc.close()
591
592
593 self._zippedcontent= data
594 self._isZipped= True
595
596 except:
597 pass
598
599 zippedcontent= property(fget= getZippedContent, fset= setZippedContent)
600
602 '''
Returns the bz2 archived contents for this document.
604 @note: The content is bz2 archived with the maximum compression level of 9.
605 @see: U{bz2<http://docs.python.org/lib/module-bz2.html>}
606 '''
607 zc= ''
608
609 if(self._isBzipped):
610
611 zc= self._bzippedcontent
612
613 else:
614 if(0!= len(self.plaincontent)):
615 try:
616 zc= bz2.compress(self.plaincontent, 9)
617
618
619 self._bzippedcontent= zc
620 self._isBzipped= True
621
622 except:
623 zc= ''
624
625 return zc
626
627 - def setBzippedContent(self, data):
628 '''
629 Sets the bz2 archived contents for this document.
630 @see: U{bz2<http://docs.python.org/lib/module-bz2.html>}
631 '''
632 if(None== data): data= ''
633
634 self.plaincontent= ''
635 self._bzippedcontent= ''
636
637 if(0!= len(data)):
638 try:
self.plaincontent= bz2.decompress(data)
640
641
642 self._bzippedcontent= data
643 self._isBzipped= True
644
645 except:
646 pass
647
648 bzippedcontent= property(fget= getBzippedContent, fset= setBzippedContent)
649
650 - def getPlainContent(self):
651 '''
652 Returns the plain html content for this document.
653 '''
654 return self._plaincontent
655
656 - def setPlainContent(self, data):
657 '''
658 Sets the plain html content for this document.
659 @note: Empty lines are removed from the plain contents.
660 '''
661 if(None== data):
662 data= ''
663 else:
664
665 data= self._cleandata.sub('', data)
666
667 self._plaincontent= data
668 self._isZipped, self._isBzipped= False, False
669
670 plaincontent= property(fget= getPlainContent, fset= setPlainContent)
671
672 - def getContentHash(self):
673 '''
674 Returns the SHA hash for plain contents of this document.
675 '''
676 cc= self.plaincontent
677 cchash= ''
678
679 if(0!= len(cc)):
680 cchash= sha.new(cc).hexdigest()
681
682 return cchash
683
684 hash= property(fget= getContentHash)
685
687 '''
Ruya's document error object represents an error that occurred during the crawl of a L{Document}.
689 '''
690 DOCERR_INTERNAL= 100000
691
692 DOCERR_CONVERSION= 100001
693 DOCERR_MAXREDIRECT= 100002
694 DOCERR_NO_REDIRECTALLOW= 100003
695 DOCERR_URLERROR= 100004
696 DOCERR_UNHANDLED_HTTPSTATUS= 100005
697 DOCERR_CRAWL_NOTALLOW= 100006
698 DOCERR_INVALID_CRAWLSCOPE= 100007
699 DOCERR_INVALID_MIME= 100009
700 DOCERR_NOHTML= 100010
701
703 '''
704 Constructor.
705
706 @note: Please refer to B{Instance Variables} section for details on each parameter.
707
708 @rtype: U{None<http://docs.python.org/lib/bltin-null-object.html>}
709 @returns: U{None<http://docs.python.org/lib/bltin-null-object.html>}
710 '''
711 self.code= code
712 self.type= type
713 self.value= value
714
716 '''
717 String representation of this object.
718
719 @rtype: U{str<http://docs.python.org/lib/typesseq.html>}
@returns: String representation of the error.
721 '''
722 return `self.code`+ ' '+ `self.type`+ ': '+ `self.value`
723
724 __repr__= __str__
725
727 '''
728 Ruya's main object is the Crawler object.
729 This object uses L{configuration<Config>} settings, and performs a crawl on given L{url<Uri>}.
730 Developers can extend Ruya's Crawler and create more sophisticated crawlers similar to Ruya's L{SingleDomainDelayCrawler}.
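
Example (a minimal sketch of such an extension; B{MyCrawler} is a hypothetical name)::
 import ruya

 class MyCrawler(ruya.Crawler):
    def crawl(self, document, level= 0):
       # Custom book-keeping could be added here before delegating to the base crawler
       return ruya.Crawler.crawl(self, document, level)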
731 '''
733 '''
734 Constructor.
735
736 @type config: L{Config}.
737 @param config: The L{configuration<Config>} object to be used while crawling.
738
739 @rtype: U{None<http://docs.python.org/lib/bltin-null-object.html>}
740 @returns: U{None<http://docs.python.org/lib/bltin-null-object.html>}
741 '''
742 self.config= config
743
744 self.kc= kconv.Kconv(outcode= kconv.UTF8, incode= kconv.AUTO, hankanaconvert= kconv.HANKAKU, checkmode= kconv.TABLE2, mode= kconv.LINE)
745
746
747
748 self.callbacks= {'beforecrawl': [],
749 'aftercrawl': [],
750 'includelink': []}
751
752 - def bind(self, event, eventhandler, addnleventargs):
753 '''
Binds an eventhandler (a callback, i.e. a pointer to a function) to one of Ruya's events.
755 Example::
756 crawlerobj.bind('beforecrawl', myfunction, None)
757 ...
758
759 def myfunction(caller, eventargs):
760 ...
761
762 @note: The eventhandler should have signature as B{func(caller, eventargs)}
763 @type event: U{str<http://docs.python.org/lib/typesseq.html>}.
764 @param event: Must be one of the following values- I{beforecrawl}, I{aftercrawl}, I{includelink}.
765 @type eventhandler: U{function<http://docs.python.org/lib/typesfunctions.html>}
766 @param eventhandler: User-defined function having function signature as B{function(caller, eventargs)}
767 @type addnleventargs: U{list<http://docs.python.org/lib/typesseq.html>}.
768 @param addnleventargs: Additional L{event arguments<Crawler.EventArgs>} to be passed when calling eventhandler.
769 @see: L{callbacks}
770
771 @rtype: U{None<http://docs.python.org/lib/bltin-null-object.html>}
772 @return: U{None<http://docs.python.org/lib/bltin-null-object.html>}.
773 '''
774 self.callbacks[event].append([eventhandler, addnleventargs])
775
777 '''
Fires the eventhandlers (callbacks, i.e. pointers to functions) registered for one of Ruya's events.
779
780 @type events: U{list<http://docs.python.org/lib/typesseq.html>}
781 @param events: List of callbacks (eventhandlers) to invoke.
782 @type eargs: L{Crawler.EventArgs}.
783 @param eargs: Additional L{event arguments<Crawler.EventArgs>} to pass while invoking event handlers.
784 @see: L{bind}
785 @note: While invoking multiple event-handlers sequentially, if any of the event-handlers sets L{ignore<Crawler.EventArgs.ignore>} to True, it is remembered, and cannot be reset by any event handler in the chain.
786
787 @rtype: U{tuple<http://docs.python.org/lib/typesseq.html>}
@return: (cancel, ignore) values set either internally or explicitly by event handlers.
789 '''
790 cancel, ignore= False, False
791
792 for (event, eventargs) in events:
793 ignore2, cancel= False, False
794 eargs.cancel, eargs.args= False, eventargs
795 event(self, eargs)
796
797
798 ignore2, cancel= eargs.ignore, eargs.cancel
799 if(cancel): break
800
801
802
803 if(ignore2): ignore= True
804
805 return (cancel, ignore)
806
808 '''
809 Performs a U{HEAD<http://www.w3.org/Protocols/rfc2616/rfc2616-sec9.html>} crawl on L{Document}'s url.
810 The L{beforecrawl<callbacks>} events are fired before a url is crawled.
811 It uses headers from L{Document} instance, and uses L{robots.txt<Config.CrawlConfig.obeyrobotstxt>} rules while crawling if allowed.
812
813 @note: As redirects are also handled, the L{beforecrawl<callbacks>} event can be fired multiple times if a url is redirected to another url.
814 @type document: L{Document}.
815 @param document: Valid instance of L{Document} object.
816 @type level: number.
817 @param level: The current level of the document being crawled.
818
819 @rtype: U{tuple<http://docs.python.org/lib/typesseq.html>}
@return: (cancel, ignore) values set either internally or explicitly by event handlers.
821 '''
822 cc, rc, lc= self.config.crawlconfig, self.config.redirectconfig, self.config.logconfig
823 log, debug, info, warning, error, critical, exception= lc.log, lc.debug, lc.info, lc.warning, lc.error, lc.critical, lc.exception
824
825 debug('Crawler.beforecrawl(): Started...' %locals())
826
827 uri, doc, maxcontentbytes, truncate, discard= \
828 (None== document.redirecturi) and document.uri or document.redirecturi, document, cc.maxcontentbytes, cc.maxcontenttruncate, cc.maxcontentdiscard
829
830 httpstatus, httpreason, headers, cancel, ignore= 0, '', {}, False, False
831 debug('Crawler.beforecrawl(): Firing events before crawling of url "%(uri)s" at level %(level)d...' %locals())
832
833 cancel, ignore= self.firevents(self.callbacks['beforecrawl'], Crawler.CrawlEventArgs(level, doc, None))
834
835 if(cancel):
836 critical('Crawler.beforecrawl(): Cancelling crawl at url "%(uri)s" level %(level)d as one of the event-handlers requested to cancel...' %locals())
837
838 elif(ignore):
839 warning('Crawler.beforecrawl(): Ignoring url "%(uri)s" level %(level)d as one of the event-handlers requested to ignore...' %locals())
840
841 else:
842 cancel, ignore= False, False
843 httpcon, response= None, None
844 crawl_allowed= True
845
846 debug('Crawler.beforecrawl(): HEAD Crawling url "%(uri)s"...' %locals())
847 defhdrs= {
848 'User-agent': cc.useragent,
849 'From': cc.crawlfrom,
850 'Accept-encoding': cc.acceptencoding,
851 'Accept-charset': 'utf-8',
852 'Accept': ','.join(cc.allowedmimes),
853 'If-none-match': doc.etag,
854 'If-modified-since': doc.lastmodified,
855 'Range': '0-%(maxcontentbytes)d' %locals()
856 }
857
858 if(cc.obeyrobotstxt):
859 cancel, ignore= False, False
860 robotstxt, useragent= uri.robotstxturl, cc.useragent
861 debug('Crawler.beforecrawl(): Obeying "%(robotstxt)s" for url "%(uri)s" using "User-agent: %(useragent)s"...' %locals())
862
863 rp= RobotFileParser()
864 rp.set_url(robotstxt)
865 rp.read()
866
867 crawl_allowed= rp.can_fetch(useragent, uri.parts.path)
868
869 allow= crawl_allowed and 'allowed' or 'not allowed'
870 cancel= not crawl_allowed
871
872 debug('Crawler.beforecrawl(): Crawling %(allow)s for url "%(uri)s" using "User-agent: %(useragent)s" as per rules in "%(robotstxt)s"...' %locals())
873
874 else:
875 debug('Crawler.beforecrawl(): Ignoring "%(robotstxt)s" for url "%(uri)s"...' %locals())
876
877 if(crawl_allowed):
878 cancel, ignore= False, False
879
880
881 errhttpstatuses= [httplib.REQUESTED_RANGE_NOT_SATISFIABLE, httplib.NOT_ACCEPTABLE]
882
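# Prime httpstatus with one of the "retryable" statuses so that the while-loop below is entered at least once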
883 httpstatus= errhttpstatuses[random.randint(0, -1+ len(errhttpstatuses))]
884
885 while(httpstatus in errhttpstatuses):
886 retry, maxretries, retrydelay= 1, cc.maxretries, cc.retrydelay
887
888 while(1<= retry<= maxretries):
889 try:
890 httpcon= httplib.HTTPConnection(uri.parts.netloc)
891 httpcon.request('HEAD', uri.parts.path, None, defhdrs)
892
893 response= httpcon.getresponse()
894
895 httpstatus, httpreason= response.status, response.reason
896 headers.clear(), headers.update(response.getheaders())
897
898
899 if(httplib.REQUESTED_RANGE_NOT_SATISFIABLE== httpstatus):
900 reqrange= defhdrs['Range']
901 debug('Crawler.beforecrawl(): %(httpstatus)d %(httpreason)s range "%(reqrange)s" bytes for url "%(uri)s". Retrying again without "Range"..' %locals())
902 del defhdrs['Range']
903 response.close(), httpcon.close()
904
905
906 if(httplib.NOT_ACCEPTABLE== httpstatus):
907 reqcharset= defhdrs['Accept-charset']
908 debug('Crawler.beforecrawl(): %(httpstatus)d %(httpreason)s charset "%(reqcharset)s" for url "%(uri)s". Retrying again without "Accept-charset"..' %locals())
909 del defhdrs['Accept-charset']
910 response.close(), httpcon.close()
911
912 except:
913 response.close(), httpcon.close()
914
915 if(retry== maxretries):
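# Note: DOCERR_MAXREDIRECT is reused here to flag that the maximum number of retries was exhausted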
916 doc.error= Document.DocumentError(Document.DocumentError.DOCERR_MAXREDIRECT, sys.exc_info()[0], sys.exc_info()[1])
917
918 else:
919 exception('Crawler.beforecrawl(): Retrying again after %(retrydelay)d seconds till max. %(maxretries)d retries. %(retry)d retry(ies) till now...' %locals())
920 retry+= 1
921 time.sleep(retrydelay)
922
923 else:
924 break
925
926
927 response.close(), httpcon.close()
928
929
930 if(httpstatus in (httplib.MOVED_PERMANENTLY, httplib.TEMPORARY_REDIRECT, httplib.FOUND)):
931 cancel, ignore= False, False
932
933 rediruri= Uri(headers.get('location', ''))
934
935 debug('Crawler.beforecrawl(): %(httpstatus)d %(httpreason)s for url "%(uri)s" to "%(rediruri)s"...' %locals())
936
937
938 redirects= abs(doc.redirects)
939 maxredirects= rc.maxredirects
940
941 if(rc.allowredirect):
942 cancel, ignore= False, False
943 debug('Crawler.beforecrawl(): Auto-redirect url "%(uri)s" to "%(rediruri)s"...' %locals())
944
945 if(0<= redirects <= maxredirects):
946 redirects+= 1
947 doc.redirects= redirects
948 doc.redirecturi= rediruri
949 doc.redirecturis.append((doc.redirects, httpstatus, rediruri))
950
951
952 cancel, ignore= self.beforecrawl(doc, level)
953
954
955 debug('Crawler.beforecrawl(): Completed. Returning (httpstatus= %(httpstatus)d, cancel= %(cancel)s, ignore= %(ignore)s)...' %locals())
956 return (cancel, ignore)
957
958 else:
959 cancel= True
960 doc.error= Document.DocumentError(Document.DocumentError.DOCERR_MAXREDIRECT)
961
962 debug('Crawler.beforecrawl(): Max. auto-redirect url "%(uri)s" to "%(rediruri)s" exceeded %(maxredirects)d max. redirects. Will not crawl further...' %locals())
963
964 else:
965 cancel= True
966 doc.error= Document.DocumentError(Document.DocumentError.DOCERR_NO_REDIRECTALLOW)
967
968 debug('Crawler.beforecrawl(): Will not auto-redirect url "%(uri)s" to "%(rediruri)s" as allowredirect= False...' %locals())
969
970 else:
971
972 pass
973
974 else:
975 doc.error= Document.DocumentError(Document.DocumentError.DOCERR_CRAWL_NOTALLOW)
976 warning('Crawler.beforecrawl(): Not crawling url "%(uri)s"...' %locals())
977
978
979 doc.httpstatus, doc.httpreason= httpstatus, httpreason
980 doc.headers.clear(), doc.headers.update(headers)
981
982 debug('Crawler.beforecrawl(): Completed. Returning (httpstatus= %(httpstatus)d, cancel= %(cancel)s, ignore= %(ignore)s)...' %locals())
983 return (cancel, ignore)
984
986 '''
987 Performs a U{GET<http://www.w3.org/Protocols/rfc2616/rfc2616-sec9.html>} crawl on L{Document}'s url.
988 The L{aftercrawl<callbacks>} events are fired after a url is crawled L{successfully<Document.error>}.
989 The attributes for L{Document} object are extracted in this method.
990 The L{CrawlScope} is considered before including any L{links<Document.links>} for the L{Document}.
991
992 @type document: L{Document}.
993 @param document: Valid instance of L{Document} object.
994 @type level: number.
995 @param level: The current level of the document being crawled.
996
997 @rtype: U{tuple<http://docs.python.org/lib/typesseq.html>}
@return: (cancel, ignore) values set either internally or explicitly by event handlers.
999 '''
1000 cc, rc, lc= self.config.crawlconfig, self.config.redirectconfig, self.config.logconfig
1001 log, debug, info, warning, error, critical, exception= lc.log, lc.debug, lc.info, lc.warning, lc.error, lc.critical, lc.exception
1002
1003 debug('Crawler.aftercrawl(): Started...' %locals())
1004
1005 uri, doc, maxcontentbytes, truncate, discard= \
1006 (None== document.redirecturi) and document.uri or document.redirecturi, document, cc.maxcontentbytes, cc.maxcontenttruncate, cc.maxcontentdiscard
1007
1008
1009 httpstatus, httpreason, headers, cancel, ignore= doc.httpstatus, doc.httpreason, doc.headers, False, False
1010
1011 if(httpstatus in (httplib.OK, httplib.NOT_MODIFIED)):
1012 debug('Crawler.aftercrawl(): %(httpstatus)d %(httpreason)s for url "%(uri)s"...' %locals())
1013
1014 cancel, ignore= False, False
1015 httpcon, response= None, None
1016 html= ''
1017
1018 if(httplib.NOT_MODIFIED== httpstatus):
1019 debug('Crawler.aftercrawl(): Using previously crawled body (if either document.plaincontent or document.zippedcontent were set)...' %locals())
1020 html= doc.plaincontent
1021
1022 elif(httplib.OK== httpstatus):
1023 debug('Crawler.aftercrawl(): GET Crawling url "%(uri)s"...' %locals())
1024 defhdrs= {
1025 'User-agent': cc.useragent,
1026 'From': cc.crawlfrom,
1027 'Accept-encoding': cc.acceptencoding,
1028 'Accept-charset': 'utf-8',
1029 'Accept': ','.join(cc.allowedmimes),
1030 'If-none-match': doc.etag,
1031 'If-modified-since': doc.lastmodified,
1032 'Range': '0-%(maxcontentbytes)d' %locals()
1033 }
1034
1035
1036 errhttpstatuses= [httplib.REQUESTED_RANGE_NOT_SATISFIABLE, httplib.NOT_ACCEPTABLE]
1037
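# Prime httpstatus with one of the "retryable" statuses so that the while-loop below is entered at least once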
1038 httpstatus= errhttpstatuses[random.randint(0, -1+ len(errhttpstatuses))]
1039
1040 while(httpstatus in errhttpstatuses):
1041 retry, maxretries, retrydelay= 1, cc.maxretries, cc.retrydelay
1042
1043 while(1<= retry<= maxretries):
1044 try:
1045 httpcon= httplib.HTTPConnection(uri.parts.netloc)
1046 httpcon.request('GET', uri.parts.path, None, defhdrs)
1047
1048 response= httpcon.getresponse()
1049
1050 httpstatus, httpreason= response.status, response.reason
1051 headers.clear(), headers.update(response.getheaders())
1052
1053
1054 if(httplib.REQUESTED_RANGE_NOT_SATISFIABLE== httpstatus):
1055 reqrange= defhdrs['Range']
1056 debug('Crawler.aftercrawl(): %(httpstatus)d %(httpreason)s range "%(reqrange)s" bytes for url "%(uri)s". Retrying again without "Range"..' %locals())
1057 del defhdrs['Range']
1058 response.close(), httpcon.close()
1059
1060
1061 if(httplib.NOT_ACCEPTABLE== httpstatus):
1062 reqcharset= defhdrs['Accept-charset']
1063 debug('Crawler.aftercrawl(): %(httpstatus)d %(httpreason)s charset "%(reqcharset)s" for url "%(uri)s". Retrying again without "Accept-charset"..' %locals())
1064 del defhdrs['Accept-charset']
1065 response.close(), httpcon.close()
1066
1067 except:
1068 response.close(), httpcon.close()
1069
1070 if(retry== maxretries):
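# Note: DOCERR_MAXREDIRECT is reused here to flag that the maximum number of retries was exhausted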
1071 doc.error= Document.DocumentError(Document.DocumentError.DOCERR_MAXREDIRECT, sys.exc_info()[0], sys.exc_info()[1])
1072
1073 else:
1074 exception('Crawler.aftercrawl(): Retrying again after %(retrydelay)d seconds till max. %(maxretries)d retries. %(retry)d retry(ies) till now...' %locals())
1075 retry+= 1
1076 time.sleep(retrydelay)
1077
1078 else:
1079 break
1080
1081
1082 if(httplib.OK!= httpstatus):
1083
1084 response.close(), httpcon.close()
1085
1086
1087 if(httplib.OK== httpstatus):
1088 cancel, ignore= False, False
1089 debug('Crawler.aftercrawl(): %(httpstatus)d %(httpreason)s for url "%(uri)s"...' %locals())
1090
1091 html= ''
1092
1093 contenttype= headers.get('content-type', '')
1094 doc.contenttype= contenttype.lower()
1095
1096 debug('Crawler.aftercrawl(): Checking content-type for url "%(uri)s"...' %locals())
1097
1098
1099 availablemimes= [v for v in cc.allowedmimes if (-1!= contenttype.find(v))]
1100
1101 if(1== len(availablemimes)):
1102 debug('Crawler.aftercrawl(): Allowed content-type for url "%(uri)s". Content-type is "%(contenttype)s"...' %locals())
1103
1104 totalbytes= int(headers.get('content-length', maxcontentbytes))
1105 bytestodownload= 0
1106
1107 debug('Crawler.aftercrawl(): Total %(totalbytes)d bytes to be downloaded for url "%(uri)s"...' %locals())
1108
1109 if(0>= totalbytes):
1110 bytestodownload= maxcontentbytes
1111
1112 elif(1<= totalbytes<= maxcontentbytes):
1113 bytestodownload= totalbytes
1114
1115 elif((totalbytes> maxcontentbytes) and truncate):
1116 bytestotruncate= abs(totalbytes- maxcontentbytes)
1117 bytestodownload= maxcontentbytes
1118 warning('Crawler.aftercrawl(): Truncating %(bytestotruncate)d bytes from total download size of %(totalbytes)d bytes as it exceeds max. download size limit of %(maxcontentbytes)d bytes allowed to be downloaded for url "%(uri)s" (Check if "maxcontenttruncate" is "True" in crawler configuration)...' %locals())
1119
1120 else:
1121 bytestodownload= 0
1122 warning('Crawler.aftercrawl(): Discarding as total download size of %(totalbytes)d bytes exceeds max. download size limit of %(maxcontentbytes)d bytes allowed to be downloaded for url "%(uri)s". Check if "maxcontenttruncate" is "False" in crawler configuration...' %locals())
1123
1124 if(0== bytestodownload):
1125 response.close(), httpcon.close()
1126
1127 else:
1128 debug('Crawler.aftercrawl(): Downloading content from url "%(uri)s"...' %locals())
1129
1130 debug('Crawler.aftercrawl(): Updating headers for document for url "%(uri)s"...' %locals())
1131 doc.lastmodified= headers.get('last-modified', '')
1132 doc.etag= headers.get('etag', '')
1133
1134 debug('Crawler.aftercrawl(): Reading total %(bytestodownload)d bytes from url "%(uri)s"...' %locals())
1135
1136 html= response.read(bytestodownload)
1137 response.close(), httpcon.close()
1138
1139
1140 contentencoding= headers.get('content-encoding', '')
1141 doc.contentencoding= contentencoding.lower()
1142
1143 if('gzip'== contentencoding.lower()):
1144 debug('Crawler.aftercrawl(): Contents are gzipped/compressed for url "%(uri)s". Unzipping/uncompressing contents...' %locals())
1145 doc.zippedcontent= html
1146
1147 else:
1148
1149 doc.plaincontent= html
1150
1151
1152 html= doc.plaincontent
1153
1154 else:
1155 doc.error= Document.DocumentError(Document.DocumentError.DOCERR_INVALID_MIME)
1156 warning('Crawler.aftercrawl(): Non-allowed content-type for url "%(uri)s". Content-type is "%(contenttype)s"...' %locals())
1157
1158
1159 if(0!= len(html)):
1160 if(kconv.UTF8!= kconv.ChkCoding(html)):
1161 debug('Crawler.aftercrawl(): Converting contents to utf-8 for url "%(uri)s". Content-type is "%(contenttype)s"...' %locals())
1162
1163 try:
1164 doc.plaincontent= self.kc.convert(html)
1165 html= doc.plaincontent
1166
1167 except:
1168 html= ''
1169 doc.error= Document.DocumentError(Document.DocumentError.DOCERR_CONVERSION, sys.exc_info()[0], sys.exc_info()[1])
1170
1171 exception('Crawler.aftercrawl(): Failed converting contents to utf-8 for url "%(uri)s"...' %locals())
1172
1173 else:
1174 debug('Crawler.aftercrawl(): Contents already in utf-8 for url "%(uri)s". Content-type is "%(contenttype)s"...' %locals())
1175
1176 else:
1177 doc.error= Document.DocumentError(Document.DocumentError.DOCERR_UNHANDLED_HTTPSTATUS)
1178 critical('Crawler.aftercrawl(): %(httpstatus)d %(httpreason)s for url "%(uri)s"...' %locals())
1179
1180
1181 if(0!= len(html)):
1182
1183 debug('Crawler.aftercrawl(): Parsing contents for url "%(uri)s"...' %locals())
1184 tags= htmldata.tagextract(html)
1185
1186 extitlestart, extitleend, title, = False, False, ''
1187 description, keywords, metarobots= '', '', ''
1188
1189
1190 for tag in tags:
1191
1192 if(isinstance(tag, tuple) and ('title'== tag[0].lower())):
1193 extitlestart= True
1194 extitleend= False
1195
1196 else:
1197 if(extitlestart and not extitleend and isinstance(tag, str)):
1198 title+= tag
1199
1200 if(isinstance(tag, tuple) and ('/title'== tag[0].lower())):
1201 extitlestart= True
1202 extitleend= True
1203
1204
1205 if(isinstance(tag, tuple) and ('meta/'== tag[0])):
1206 if('description'== tag[1].get('name', '').lower()):
1207 description= tag[1].get('content', '').lower()
1208
1209 if('keywords'== tag[1].get('name', '').lower()):
1210 keywords= tag[1].get('content', '').lower()
1211
1212 if('robots'== tag[1].get('name', '').lower()):
1213 metarobots= tag[1].get('content', '').lower()
1214
1215 noindex, nofollow = False, False
1216
1217 if(cc.obeymetarobots):
1218 debug('Crawler.aftercrawl(): Obeying meta robots for url "%(uri)s"...' %locals())
1219
1220
1221 if(0== len(metarobots)):
1222 debug('Crawler.aftercrawl(): No meta robots found for url "%(uri)s". Crawling normally...' %locals())
1223
1224 else:
1225 debug('Crawler.aftercrawl(): Using meta robots "%(metarobots)s" found for url "%(uri)s"...' %locals())
1226 noindex= (-1!= metarobots.find('noindex'))
1227 nofollow= (-1!= metarobots.find('nofollow'))
1228
1229 else:
1230 debug('Crawler.aftercrawl(): Ignoring meta robots for url "%(uri)s"...' %locals())
1231
1232 debug('Crawler.aftercrawl(): Setting document attributes for url "%(uri)s"...' %locals())
1233
1234 if(noindex):
1235 warning('Crawler.aftercrawl(): According to meta robots "noindex" is specified for url "%(uri)s". No data will be used from this url. However links may be crawled if "follow" is specified...' %locals())
1236
1237 else:
1238 doc.title= title
1239 doc.description= description
1240
1241 if(nofollow):
1242 warning('Crawler.aftercrawl(): According to meta robots "nofollow" is specified for url "%(uri)s". No links will be used from this url. However, data may be used if "index" is specified...' %locals())
1243
1244 else:
1245 cancel, ignore= False, False
1246
1247 debug('Crawler.aftercrawl(): Extracting all links from "%(uri)s"...' %locals())
1248
1249
1250 linkmatches= htmldata.urlextract(html, uri.url)
1251 links= [Uri(linkmatch.url) for linkmatch in linkmatches]
1252
1253 totallinks= len(links)
1254 debug('Crawler.aftercrawl(): Filtering %(totallinks)d links from "%(uri)s" having valid crawl extensions (see "allowedextns" in crawler config)...' %locals())
1255 links= [link for link in links
1256 if os.path.splitext(link.parts.path)[1] in cc.allowedextns]
1257
1258 filterlinks1= len(links)
1259 debug('Crawler.aftercrawl(): Filtered %(filterlinks1)d links from "%(uri)s". Filtering further those satisfying requested crawl scope...' %locals())
1260
1261 crawllinks= []
1262 for link in links:
1263 if(CrawlScope.SCOPE_HOST== cc.crawlscope):
1264 if(uri.ishostscope(link)):
1265 if(link not in crawllinks):
1266 crawllinks.append(link)
1267
1268 elif(CrawlScope.SCOPE_DOMAIN== cc.crawlscope):
1269 if(uri.isdomainscope(link)):
1270 if(link not in crawllinks):
1271 crawllinks.append(link)
1272
1273 elif(CrawlScope.SCOPE_PATH== cc.crawlscope):
1274 if(uri.ispathscope(link)):
1275 if(link not in crawllinks):
1276 crawllinks.append(link)
1277
1278 else:
1279 cancel= True
1280 doc.error= Document.DocumentError(Document.DocumentError.DOCERR_INVALID_CRAWLSCOPE)
1281
critical('Crawler.aftercrawl(): Unknown crawl scope for url "%(uri)s". Please check the crawler\'s crawl scope for valid scopes...' %locals())
1283 break
1284
1285 if(cancel):
1286 pass
1287
1288 else:
1289 doc.links.extend(crawllinks)
1290
1291 filterlinks2= len(crawllinks)
1292 ignorelinks= abs(totallinks- filterlinks2)
1293
1294 debug('Crawler.aftercrawl(): Found %(filterlinks2)d links from "%(uri)s" to crawl...' %locals())
1295 debug('Crawler.aftercrawl(): Total links= %(totallinks)d, Links to crawl= %(filterlinks2)d, Links ignored= %(ignorelinks)d from "%(uri)s"...' %locals())
1296
1297 else:
1298 doc.error= Document.DocumentError(Document.DocumentError.DOCERR_NOHTML)
1299 warning('Crawler.aftercrawl(): No html body available for url "%(uri)s"...' %locals())
1300
1301 else:
1302 doc.error= Document.DocumentError(Document.DocumentError.DOCERR_UNHANDLED_HTTPSTATUS)
1303 critical('Crawler.aftercrawl(): %(httpstatus)d %(httpreason)s for url "%(uri)s"...' %locals())
1304
1305
1306 doc.httpstatus, doc.httpreason= httpstatus, httpreason
1307 doc.headers.clear(), doc.headers.update(headers)
1308
1309 if(cancel):
1310 pass
1311
1312 else:
1313 debug('Crawler.aftercrawl(): Firing events after crawling of url "%(uri)s" at level %(level)d...' %locals())
1314 cancel, ignore= False, False
1315 cancel, ignore= self.firevents(self.callbacks['aftercrawl'], Crawler.CrawlEventArgs(level, doc, None))
1316
1317 debug('Crawler.aftercrawl(): Completed. Returning (httpstatus= %(httpstatus)d, cancel= %(cancel)s, ignore= %(ignore)s)...' %locals())
1318 return (cancel, ignore)
1319
1321 '''
1322 Ruya's L{Crawler} provides event-based callback mechanism during crawl to allow clients to have more control over which urls are crawled.
1323 The events use this object for event communication.
1324
1325 Example::
1326 # Client side event handler
1327 def beforecrawl(caller, eventargs):
1328 # Some process
1329 # ...
1330
1331 # Url is already crawled before (might be determined based on a simple dictionary caching mechanism)
eventargs.ignore= True # Request Ruya to ignore this url during crawl
1333
1334 # ...
1335
1336 def aftercrawl(caller, eventargs):
1337 # Some process
1338 # ...
1339
1340 # Some error occurred during saving crawled data (might be a file or database), abort further crawling
1341 eventargs.cancel= True # Cancel crawling completely
1342
1343 # ...
1344
1345 @see: L{Crawler.bind}
1346 '''
1347
1348 - def __init__(self, level= 0, args= []):
1349 '''
1350 Constructor.
1351
1352 @type level: U{number<http://docs.python.org/lib/typesnumeric.html>}.
1353 @param level: Crawl level on which the event was raised.
1354 @type args: U{list<http://docs.python.org/lib/typesseq.html>}
1355 @param args: Additional arguments to be passed back to event handler.
1356
1357 @rtype: U{None<http://docs.python.org/lib/bltin-null-object.html>}
1358 @returns: U{None<http://docs.python.org/lib/bltin-null-object.html>}
1359 '''
1360 self.level= level
1361 self.args= args
1362 self.cancel= False
1363 self.ignore= False
1364
1366 '''
1367 Ruya's L{Crawler} provides event-based callback mechanism during crawl to allow clients to have more control over which urls are crawled.
1368 The events use this object for event communication for B{beforecrawl}, and B{aftercrawl} events.
1369 '''
1370
1371 - def __init__(self, level= 0, document= None, args= []):
1372 '''
1373 Constructor.
1374
1375 @type document: L{Document}.
1376 @param document: L{Document} object used during crawl
1377 @type args: U{list<http://docs.python.org/lib/typesseq.html>}
1378 @param args: Additional arguments to be passed back to event handler.
1379
1380 @rtype: U{None<http://docs.python.org/lib/bltin-null-object.html>}
1381 @returns: U{None<http://docs.python.org/lib/bltin-null-object.html>}
1382 '''
1383 Crawler.EventArgs.__init__(self, level, args)
1384 self.document= document
1385
1387 '''
1388 Ruya's L{Crawler} provides event-based callback mechanism during crawl to allow clients to have more control over which urls are crawled.
1389 The events use this object for event communication for B{includelink} event.
1390 '''
1391
1392 - def __init__(self, level= 0, uri= None, args= []):
1393 '''
1394 @type level: U{number<http://docs.python.org/lib/typesnumeric.html>}.
1395 @param level: Crawl level on which the event was raised.
1396 @type uri: L{Uri}
1397 @param uri: The L{Uri} to include in crawl.
1398 @type args: U{list<http://docs.python.org/lib/typesseq.html>}
1399 @param args: Additional arguments to be passed back to event handler.
1400
1401 @rtype: U{None<http://docs.python.org/lib/bltin-null-object.html>}
1402 @returns: U{None<http://docs.python.org/lib/bltin-null-object.html>}
1403 '''
1404 Crawler.EventArgs.__init__(self, level, args)
1405 self.uri= uri
1406
1407 - def crawl(self, document, level= 0):
1408 '''
1409 The main method where actual crawling is performed.
1410
1411 @type document: L{Document}.
1412 @param document: The L{Document} to crawl.
1413 @type level: U{number<http://docs.python.org/lib/typesnumeric.html>}.
1414 @param level: The level on which the L{Document} is crawled.
1415
@todo: URL canonicalization: http://www.archive.org/index.html and http://www.archive.org/ are the same
1417 @todo: Tidy?
1418 @todo: Avoiding slow links? Currently handled by timeout from U{httplib<http://docs.python.org/lib/module-httplib.html>}.
1419 @todo: Support Crawl-Delay from robots.txt?.
1420 @todo: Detecting "soft" 404? http://blog.dtiblog.com/community-hobby.html => http://blog.dtiblog.com/404.html
1421
1422 @rtype: U{tuple<http://docs.python.org/lib/typesseq.html>}
@return: (cancel, ignore) values set either internally or explicitly by event handlers.
1424 '''
1425 cc, rc, lc= self.config.crawlconfig, self.config.redirectconfig, self.config.logconfig
1426 log, debug, info, warning, error, critical, exception= lc.log, lc.debug, lc.info, lc.warning, lc.error, lc.critical, lc.exception
1427 uri= document.uri
1428
1429 debug('Crawler.crawl(): Started...' %locals())
1430
1431 try:
1432 cancel, ignore= False, False
1433 cancel, ignore= self.beforecrawl(document, level)
1434
1435 if(cancel or ignore):
1436
1437 pass
1438
1439 else:
1440 cancel, ignore= False, False
1441 cancel, ignore= self.aftercrawl(document, level)
1442
1443 except:
1444 document.error= Document.DocumentError(Document.DocumentError.DOCERR_INTERNAL, sys.exc_info()[0], sys.exc_info()[1])
1445 exception('Crawler.crawl(): Failed crawling url "%(uri)s"...' %locals())
1446
1447 debug('Crawler.crawl(): Completed. Returning (cancel= %(cancel)s, ignore= %(ignore)s)...' %locals())
1448 return (cancel, ignore)
1449
1451 '''
1452 Ruya's single domain delayed crawler is an enhancement to Ruya's base L{crawler<Crawler>}.
1453 B{This is a U{breadth-first<http://en.wikipedia.org/wiki/Breadth-first_search>} crawler with delay between each crawl request}.
1454 '''
1455 - def __init__(self, config):
1456 '''
1457 Constructor.
1458
1459 @type config: L{Config}.
1460 @param config: The L{configuration<Config>} object to be used while crawling.
1461
1462 @rtype: U{None<http://docs.python.org/lib/bltin-null-object.html>}
1463 @returns: U{None<http://docs.python.org/lib/bltin-null-object.html>}
1464 '''
1465 Crawler.__init__(self, config)
1466
1467 - def crawl(self, document, level= 0):
1468 '''
1469 The main method where actual crawling is performed.
1470
1471 @type document: L{Document}.
1472 @param document: The L{Document} to crawl.
1473 @type level: U{number<http://docs.python.org/lib/typesnumeric.html>}.
1474 @param level: The level on which the L{Document} is crawled.
1475
1476 @rtype: U{tuple<http://docs.python.org/lib/typesseq.html>}
@return: (cancel, ignore) values set either internally or explicitly by event handlers.
1478 '''
1479 cc, rc, lc= self.config.crawlconfig, self.config.redirectconfig, self.config.logconfig
1480 log, debug, info, warning, error, critical, exception= lc.log, lc.debug, lc.info, lc.warning, lc.error, lc.critical, lc.exception
1481
1482 debug('SingleDomainDelayCrawler.crawl(): Started...' %locals())
1483
1484 doc, maxlevels= document, cc.levels
1485 starturi, cancel, ignore= doc.uri, False, False
1486
1487 try:
1488 if(0<= level<= maxlevels):
1489 debug('SingleDomainDelayCrawler.crawl(): Starting to crawl url "%(starturi)s" on level %(level)d upto max. %(maxlevels)d level(s)...' %locals())
1490 nextleveldocs, cancel, ignore= self.crawlbreadth(level, maxlevels, starturi, [doc])
1491 totaldocs= len(nextleveldocs)
1492
1493
1494 if(cancel or (0>= totaldocs)):
1495
1496 pass
1497
1498 else:
1499 for level in range(1, maxlevels+1, 1):
1500 debug('SingleDomainDelayCrawler.crawl(): Starting to crawl url "%(starturi)s" on level %(level)d upto max. %(maxlevels)d level(s)...' %locals())
1501 if(cancel):
1502 critical('SingleDomainDelayCrawler.crawl(): Crawling cancelled at level %(level)d as one of the event-handlers requested an abort' %locals())
1503 break
1504
1505 if(ignore):
1506
1507 pass
1508
1509 nextleveldocs, cancel, ignore= self.crawlbreadth(level, maxlevels, starturi, nextleveldocs)
1510
1511 else:
1512 document.error= Document.DocumentError(Document.DocumentError.DOCERR_MAXREDIRECT)
1513 debug('SingleDomainDelayCrawler.crawl(): Cannot crawl url "%(starturi)s" on level %(level)d as it exceeds max. levels %(maxlevels)d...' %locals())
1514
1515 except:
1516 document.error= Document.DocumentError(Document.DocumentError.DOCERR_INTERNAL, sys.exc_info()[0], sys.exc_info()[1])
1517 exception('SingleDomainDelayCrawler.crawl(): Failed crawling url "%(starturi)s"...' %locals())
1518
1519 debug('SingleDomainDelayCrawler.crawl(): Completed crawling url "%(starturi)s" till max. %(maxlevels)d level(s)' %locals())
1520 return (cancel, ignore)
1521
1522 - def crawlbreadth(self, level, maxlevels, domainuri, documents):
1523 '''
1524 The main method where actual breadth-first crawling is performed.
1525
1526 @type level: U{number<http://docs.python.org/lib/typesnumeric.html>}.
1527 @param level: The level on which the L{Document} is crawled.
1528 @type maxlevels: U{number<http://docs.python.org/lib/typesnumeric.html>}.
1529 @param maxlevels: L{Maximum number<Config.CrawlConfig.levels>} of levels to crawl.
1530 @type domainuri: L{Uri}.
1531 @param domainuri: Valid instance of L{Uri} object.
1532 @type documents: U{list<http://docs.python.org/lib/typesseq.html>}
1533 @param documents: Documents list to which newly to-be-crawled urls are appended for later crawling.
1534
1535 @attention: Event L{includelink<Crawler.firevents>} is not fired for first L{Uri} where crawl is started, however L{beforecrawl<Crawler.firevents>} event might be fired if url is redirected.
1536 @rtype: U{tuple<http://docs.python.org/lib/typesseq.html>}
@return: (nextleveldocs, cancel, ignore) values set either internally or explicitly by event handlers.
1538 '''
1539
1540 cc, rc, lc= self.config.crawlconfig, self.config.redirectconfig, self.config.logconfig
1541 log, debug, info, warning, error, critical, exception= lc.log, lc.debug, lc.info, lc.warning, lc.error, lc.critical, lc.exception
1542
1543 debug('SingleDomainDelayCrawler.crawlbreadth(): Started...' %locals())
1544
1545 nextleveldocs, nextlevel, cancel, delay, cancel, ignore= [], level+ 1, False, cc.crawldelay, False, False
1546 totaldocs= len(documents)
1547
1548 for docindex in range(0, totaldocs, 1):
1549 cancel, ignore= False, False
1550 doc= documents[docindex]
1551 uri= doc.uri
1552
1553 try:
1554 debug('SingleDomainDelayCrawler.crawlbreadth(): Crawling url "%(uri)s" at level %(level)d...' %locals())
1555 cancel, ignore= Crawler.crawl(self, doc, level)
1556
1557 if(cancel):
1558 critical('SingleDomainDelayCrawler.crawlbreadth(): Crawling cancelled at url "%(uri)s" level %(level)d as one of the event-handlers requested an abort' %locals())
1559 break
1560
1561 if(ignore):
1562 debug('SingleDomainDelayCrawler.crawlbreadth(): Not using further links from url "%(uri)s" level %(level)d as one of the event-handlers requested to ignore' %locals())
1563 continue
1564
1565 cancel, ignore= False, False
1566
1567
1568 if(1<= nextlevel<= maxlevels):
1569 links, ulinks= doc.links, []
1570
1571 for link in links:
1572 cancel, ignore= False, False
1573
1574 debug('SingleDomainDelayCrawler.crawlbreadth(): Firing events before including url "%(link)s" to be crawled at next level %(nextlevel)d...' %locals())
1575 cancel, ignore= self.firevents(self.callbacks['includelink'], SingleDomainDelayCrawler.UriIncludeEventArgs(nextlevel, link, None))
1576
1577 if(cancel):
1578 critical('SingleDomainDelayCrawler.crawlbreadth(): Include link cancelled at url "%(uri)s" level %(level)d as one of the event-handlers requested an abort' %locals())
1579
1580
1581
1582
1583 break
1584
1585 if(ignore):
1586 debug('SingleDomainDelayCrawler.crawlbreadth(): Not including url "%(link)s" for crawling as one of the event-handlers requests not to include it for crawl at next level %(nextlevel)d...' %locals())
1587
1588
1589 ignore= False
1590
1591 continue
1592
1593 ulinks.extend([link])
1594
1595 if(cancel):
1596 pass
1597
1598 else:
1599 totallinks= len(ulinks)
1600 debug('SingleDomainDelayCrawler.crawlbreadth(): Scheduling %(totallinks)d links from url "%(uri)s" at level %(level)d to be crawled at next level %(nextlevel)d...' %locals())
1601 nextleveldocs.extend([Document(ulink) for ulink in ulinks])
1602
1603 else:
1604 debug('SingleDomainDelayCrawler.crawlbreadth(): Not scheduling links for crawling from url "%(uri)s" as next level %(nextlevel)d from current level %(level)d will be already over max. %(maxlevels)d level(s)...' %locals())
1605
1606
1607 if(cancel): break
1608
1609 if((-1+ totaldocs)!= docindex):
1610 debug('SingleDomainDelayCrawler.crawlbreadth(): Pausing for %(delay)s seconds after crawling url "%(uri)s" at level %(level)d...' %locals())
1611 time.sleep(delay)
1612
1613 except:
1614 doc.error= Document.DocumentError(Document.DocumentError.DOCERR_INTERNAL, sys.exc_info()[0], sys.exc_info()[1])
1615 exception('SingleDomainDelayCrawler.crawlbreadth(): Failed crawling url "%(uri)s"...' %locals())
1616 break
1617
1618 debug('SingleDomainDelayCrawler.crawlbreadth(): Completed. Returning (nextleveldocs, cancel= %(cancel)s, ignore= %(ignore)s)...' %locals())
1619 return (nextleveldocs, cancel, ignore)
1620