'''
U{Ruya<http://ruya.sourceforge.net/>} (I{an Arabic name meaning "sight, vision"}) is a Python-based crawler for crawling English and Japanese websites.
B{It is targeted solely towards developers who want crawling functionality in their code}.
Some important features of this tool are:
 - Extensively uses U{kconv<http://apache.noexistent.com/~mak/kconv/kconv/index_jp.html>} to convert all html content into UTF-8 encoding.
 - Provides level-based and scope-based crawling per website.
 - Supports configuration objects to suit a variety of crawling requirements.
 - Can provide contents both as gzipped and bz2 archives, in UTF-8 encoding.
 - Can extract meta description, keywords, links, last-modified and e-tag headers from a url.
 - Supports event-based callbacks that allow the caller to seamlessly integrate with, and control, which urls are crawled.
 - Provides detailed logging support to understand and evaluate the exact crawling process.

Example::
   #!/usr/bin/env python
   #-*- coding: UTF-8 -*-

   import ruya

   def test():
      url= 'http://www.python.org/'

      # Create a Document instance representing the start url
      doc= ruya.Document(ruya.Uri(url))

      # Create a new crawler configuration object
      cfg= ruya.Config(ruya.Config.CrawlConfig(levels= 1, crawldelay= 5), ruya.Config.RedirectConfig(), ruya.Config.LogConfig())

      # Use a single-domain breadth-first crawler with the crawler configuration
      c= ruya.SingleDomainDelayCrawler(cfg)

      # The crawler raises the following events before and after crawling a url.
      # Set up callbacks pointing to custom methods where we can control whether to crawl or ignore a url, e.g. to skip duplicates.
      c.bind('beforecrawl', beforecrawl, None)
      c.bind('aftercrawl', aftercrawl, None)
      c.bind('includelink', includelink, None)

      # Start crawling
      c.crawl(doc)

      # Check whether any errors occurred during the crawl of the start url
      if(None!= doc.error):
         print `doc.error.type`+ ': '+ `doc.error.value`

   # This callback is invoked by the Ruya crawler before a url is included in the list of urls to be crawled.
   # We can choose to ignore the url based on our custom logic.
   def includelink(caller, eventargs):
      uri= eventargs.uri
      level= eventargs.level
      print 'includelink(): Include "%(uri)s" to crawl on level %(level)d?' %locals()

   # Before a url is actually crawled, Ruya invokes this callback to ask whether to crawl the url or not.
   # We can choose to ignore the url based on our custom logic.
   def beforecrawl(caller, eventargs):
      uri= eventargs.document.uri
      print 'beforecrawl(): "%(uri)s" is about to be crawled...' %locals()

   # After a url is crawled, Ruya invokes this callback where we can check the crawled values of the url.
   def aftercrawl(caller, eventargs):
      doc= eventargs.document
      uri= doc.uri

      print 'Url: '+ uri.url
      print 'Title: '+ doc.title
      print 'Description: '+ doc.description
      print 'Keywords: '+ doc.keywords
      print 'Last-modified: '+ doc.lastmodified
      print 'Etag: '+ doc.etag

      # Check if any errors occurred during the crawl of this url
      if(None!= doc.error):
         print 'Error: '+ `doc.error.type`
         print 'Value: '+ `doc.error.value`

      print 'aftercrawl(): "%(uri)s" has finished crawling...' %locals()

   if('__main__'== __name__):
      # Test the Ruya crawler
      test()

For bugs, suggestions, or feedback, please report to the author.
@todo: epydoc-3.0beta1 doesn't support @rtype, @returns for property() yet?
'''

__author__ = 'NAIK Shantibhushan<qqbb65v59@world.ocn.ne.jp>'
__version__ = '1.0'
__date__ = '2007-May-06 1441H'
__copyright__ = 'Copyright (c) 2005 NAIK Shantibhushan<qqbb65v59@world.ocn.ne.jp>'
__license__ = 'Python'

import sys, re, string, os, time, random
import httplib, urllib2, urlparse, sha, kconv, gzip, bz2, htmldata
from os import path
from robotparser import RobotFileParser
from sgmllib import SGMLParser
from StringIO import StringIO
99
101 '''
102 Ruya's configuration object to determine which scope will be used for a website while crawling
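
A scope is normally selected by passing one of these constants to L{Config.CrawlConfig}; the following is a minimal usage sketch (only names defined in this module are used).

Example::
 import ruya

 # Restrict the crawl to urls under the start url's folder
 cc= ruya.Config.CrawlConfig(crawlscope= ruya.CrawlScope.SCOPE_PATH)

 # CrawlConfig falls back to SCOPE_HOST for unknown values, because it
 # validates the requested value with CrawlScope.isvalidscope()
 print ruya.CrawlScope.isvalidscope(ruya.CrawlScope.SCOPE_PATH) # True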
103 '''
104
105
106 SCOPE_ALL= 100000
107
108 SCOPE_HOST= 100001
109
110 SCOPE_DOMAIN= 100002
111
112 SCOPE_PATH= 100003
113
114 @classmethod
116 '''
117 Checks if the scope is valid - one of allowed scopes in L{CrawlScope}
118
119 @type scope: U{number<http://docs.python.org/lib/typesnumeric.html>}
120 @param scope: A valid crawl scope.
121
122 @rtype: U{boolean<http://docs.python.org/lib/truth.html>}
@returns: B{True} if the crawl scope is valid, B{False} otherwise.
124 '''
125 return (scope in (CrawlScope.SCOPE_HOST, CrawlScope.SCOPE_DOMAIN, CrawlScope.SCOPE_PATH))
126
128 '''
129 Ruya's L{Crawler} uses configuration objects to determine various settings during crawl.
130 It covers
131 - What options to use during crawl - L{Config.CrawlConfig}
132 - How to handle redirects while crawl - L{Config.RedirectConfig}
133 - Where to output descriptive log messages during crawl - L{Config.LogConfig}
134
135 This class simply groups them under a single class for ease of maintenance and usage.
136 It enables a developer to have different configuration profiles, and use them best suited to the requirements.
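
Example (a minimal sketch of two such profiles; all values are illustrative only)::
 import ruya

 # A polite profile: shallow crawl, long delay between requests
 politecfg= ruya.Config(ruya.Config.CrawlConfig(levels= 1, crawldelay= 120),
                        ruya.Config.RedirectConfig(),
                        ruya.Config.LogConfig())

 # A faster profile for a site you own: deeper crawl, shorter delay, fewer redirects followed
 fastcfg= ruya.Config(ruya.Config.CrawlConfig(levels= 2, crawldelay= 5),
                      ruya.Config.RedirectConfig(maxredirects= 5),
                      ruya.Config.LogConfig())

 crawler= ruya.SingleDomainDelayCrawler(politecfg)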
137 '''
138
139 - def __init__(self, crawlconfig= None, redirectconfig= None, logconfig= None):
140 '''
141 Constructor.
142
143 @note: Please refer to B{Instance Variables} section for details on each parameter.
144 @rtype: U{None<http://docs.python.org/lib/bltin-null-object.html>}
145 @returns: U{None<http://docs.python.org/lib/bltin-null-object.html>}
146 '''
147 self.crawlconfig= crawlconfig
148 self.redirectconfig= redirectconfig
149 self.logconfig= logconfig
150
152 '''
153 Ruya's crawler configuration object stores settings that are specific during a crawl.
It supports all the settings needed for a polite, well-behaved U{crawler<http://en.wikipedia.org/wiki/Web_crawler>}.
155 '''
156
157 - def __init__(self,
158 useragent= 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.0.3705; .NET CLR 2.0.50727; .NET CLR 1.1.4322)',
159 crawlfrom= '',
160 obeyrobotstxt= True,
161 obeymetarobots= True,
162 acceptencoding= 'gzip, deflate',
163 crawldelay= 120,
164 crawlscope= CrawlScope.SCOPE_HOST,
165 allowedmimes= ['text/html'],
166 allowedextns= ['', '.htm', '.html', '.cgi', '.php', '.jsp', '.cfm', '.asp', '.aspx', '.live', '.do'],
167 levels= 2,
168 maxcontentbytes= 500000,
169 maxcontenttruncate= True,
170 maxretries= 3,
171 retrydelay= 120):
172 '''
173 Constructor.
174 Provides default values for all settings.
175
176 @note: Please refer to B{Instance Variables} section for details on each parameter.
177 @rtype: U{None<http://docs.python.org/lib/bltin-null-object.html>}
178 @returns: U{None<http://docs.python.org/lib/bltin-null-object.html>}
179 '''
180 self.useragent= useragent
181 self.crawlfrom= crawlfrom
182 self.obeyrobotstxt= obeyrobotstxt
183 self.obeymetarobots= obeymetarobots
184 self.acceptencoding= acceptencoding
185 self.crawldelay= crawldelay
186
187 self.crawlscope= CrawlScope.isvalidscope(crawlscope) and crawlscope or CrawlScope.SCOPE_HOST
188 self.allowedmimes= allowedmimes
189 self.allowedextns= allowedextns
190 self.levels= levels
191 self.maxcontentbytes= maxcontentbytes
192 self.maxretries= maxretries
193 self.retrydelay= retrydelay
194
195 self.maxcontenttruncate= maxcontenttruncate
196 self.maxcontentdiscard= not maxcontenttruncate
197
199 '''
200 Ruya's redirect configuration object stores settings specific to handling redirects during a crawl.
201 '''
202 - def __init__(self, allowredirect= True, maxredirects= 10):
203 '''
204 Constructor.
205 Provides default values for all settings.
206
207 @note: Please refer to B{Instance Variables} section for details on each parameter.
208 @rtype: U{None<http://docs.python.org/lib/bltin-null-object.html>}
209 @returns: U{None<http://docs.python.org/lib/bltin-null-object.html>}
210 '''
211 self.allowredirect= allowredirect
212 self.maxredirects= maxredirects
213
215 '''
216 Ruya's logging configuration object stores pointers to user-defined U{logging functions<http://docs.python.org/lib/module-logging.html>}.
217 For each different level of logging, Ruya invokes the method pointer, and outputs descriptive operation messages during a crawl.
218 The different logging functions are based on U{Python's own logging module<http://docs.python.org/lib/module-logging.html>}.
219
Sample log output (when using U{Python's own logging module<http://docs.python.org/lib/module-logging.html>})::
221 2007-04-15 20:05:21,421 ruya.py 3516 2112 stderrlog 10 DEBUG SingleDomainDelayCrawler.crawl(): Started...
222 2007-04-15 20:05:21,421 ruya.py 3516 2112 stderrlog 10 DEBUG SingleDomainDelayCrawler.crawl(): Starting to crawl url "http://webryblog.biglobe.ne.jp/themeindex.html" on level 0 upto max. 2 level(s)...
223 2007-04-15 20:05:21,421 ruya.py 3516 2112 stderrlog 10 DEBUG SingleDomainDelayCrawler.crawlbreadth(): Started...
224 2007-04-15 20:05:21,421 ruya.py 3516 2112 stderrlog 10 DEBUG SingleDomainDelayCrawler.crawlbreadth(): Crawling url "http://webryblog.biglobe.ne.jp/themeindex.html" at level 0...
225 2007-04-15 20:05:21,421 ruya.py 3516 2112 stderrlog 10 DEBUG Crawler.crawl(): Started...
226 2007-04-15 20:05:21,421 ruya.py 3516 2112 stderrlog 10 DEBUG Crawler.beforecrawl(): Started...
227 2007-04-15 20:05:21,421 ruya.py 3516 2112 stderrlog 10 DEBUG Crawler.beforecrawl(): Firing events before crawling of url "http://webryblog.biglobe.ne.jp/themeindex.html" at level 0...
228 2007-04-15 20:05:21,421 links.py 3516 2112 stderrlog 10 DEBUG SiteLinksExtractor.beforecrawl(): Setting document attributes for url "http://webryblog.biglobe.ne.jp/themeindex.html" on level 0...
229 2007-04-15 20:05:21,437 ruya.py 3516 2112 stderrlog 10 DEBUG Crawler.beforecrawl(): HEAD Crawling url "http://webryblog.biglobe.ne.jp/themeindex.html"...
230 2007-04-15 20:05:21,437 ruya.py 3516 2112 stderrlog 10 DEBUG Crawler.beforecrawl(): Obeying "http://webryblog.biglobe.ne.jp/robots.txt" for url "http://webryblog.biglobe.ne.jp/themeindex.html" using "User-agent: <Your user-agent string here>"...
231 2007-04-15 20:05:21,483 ruya.py 3516 2112 stderrlog 10 DEBUG Crawler.beforecrawl(): Crawling allowed for url "http://webryblog.biglobe.ne.jp/themeindex.html" using "User-agent: <Your user-agent string here>" as per rules in "http://webryblog.biglobe.ne.jp/robots.txt"...
232 2007-04-15 20:05:21,500 ruya.py 3516 2112 stderrlog 10 DEBUG Crawler.beforecrawl(): Completed. Returning (httpstatus= 200, cancel= False, ignore= False)...
233 2007-04-15 20:05:21,500 ruya.py 3516 2112 stderrlog 10 DEBUG Crawler.aftercrawl(): Started...
234 2007-04-15 20:05:21,500 ruya.py 3516 2112 stderrlog 10 DEBUG Crawler.aftercrawl(): 200 OK for url "http://webryblog.biglobe.ne.jp/themeindex.html"...
235 2007-04-15 20:05:21,500 ruya.py 3516 2112 stderrlog 10 DEBUG Crawler.aftercrawl(): GET Crawling url "http://webryblog.biglobe.ne.jp/themeindex.html"...
236 ...
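
The callbacks can also be pointed at U{Python's own logging module<http://docs.python.org/lib/module-logging.html>}; a minimal sketch (the logger name "ruya" below is illustrative only)::
 import logging, ruya

 logging.basicConfig(level= logging.DEBUG)
 logger= logging.getLogger('ruya')

 logcfg= ruya.Config.LogConfig(log= logger.info,
                               debug= logger.debug,
                               info= logger.info,
                               warning= logger.warning,
                               error= logger.error,
                               critical= logger.critical,
                               exception= logger.exception)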
237 '''
238
239 - def __init__(self,
240 log= lambda msg: sys.stderr.write(msg+ '\n'),
241 debug= lambda msg: sys.stderr.write(msg+ '\n'),
242 info= lambda msg: sys.stderr.write(msg+ '\n'),
243 warning= lambda msg: sys.stderr.write(msg+ '\n'),
244 error= lambda msg: sys.stderr.write(msg+ '\n'),
245 critical= lambda msg: sys.stderr.write(msg+ '\n'),
246 exception= lambda msg: sys.stderr.write(msg+ '\n')):
247 '''
248 Constructor.
249
250 @note: Please refer to B{Instance Variables} section for details on each parameter.
251 @rtype: U{None<http://docs.python.org/lib/bltin-null-object.html>}
252 @returns: U{None<http://docs.python.org/lib/bltin-null-object.html>}
253 '''
254 self.log= log
255 self.debug= debug
256 self.info= info
257 self.warning= warning
258 self.error= error
259 self.critical= critical
260 self.exception= exception
261
263 '''
Ruya's Uri object encapsulates an http url used while crawling.
It provides ready-to-use methods to obtain the robots.txt path and the domains of a url, and to perform L{scope<CrawlScope>} checks on two urls.
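
Example (a short, illustrative sketch of these helpers; the urls are arbitrary)::
 import ruya

 u1= ruya.Uri('http://www.python.org/about/index.html')
 u2= ruya.Uri('http://python.org/doc/')

 print u1.domainurl        # http://www.python.org/
 print u1.robotstxturl     # http://www.python.org/robots.txt
 print u1.issamedomain(u2) # True; www.python.org and python.org map to the same domain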
266 '''
267
269 '''
270 Constructor.
271
272 @type url: U{str<http://docs.python.org/lib/typesseq.html>}
273 @param url: The actual url to be used for representation.
274
275 @rtype: U{None<http://docs.python.org/lib/bltin-null-object.html>}
276 @returns: U{None<http://docs.python.org/lib/bltin-null-object.html>}
277 '''
278 if(None== url): url= ''
279 self.url= urlparse.urldefrag(url)[0]
280 self.hash= sha.new(url).hexdigest()
281
282 - def getDomainUrl(self):
283 '''
284 Returns the domain found after analyzing the url.
285 '''
286 u= urlparse.urlsplit(self.url)
287 ru= u.scheme+ '://'+ u.netloc+ '/'
288 return ru
289
291 '''
292 Returns the robots.txt path for a url.
Usually, I{http://domain.ext/} has robots.txt placed in its root as B{I{http://domain.ext/robots.txt}}.
294 '''
295 rt= self.getDomainUrl()+ 'robots.txt'
296 return rt
297
298 - def getDomains(self):
299 '''
300 Returns valid domains found after analyzing the url.
I{http://www.domain.ext/} and I{http://domain.ext/} both point to the same domain B{I{domain.ext}}, so they must be considered the same.
302 This function assists the crawler when determining if two urls are from L{same domain<issamedomain>}.
303 '''
304 dms= []
305
306 u= urlparse.urlsplit(self.url)
307 if(None!= u.hostname):
308 dms.extend([u.hostname])
309 if(-1!= u.hostname.find('www.')):
310 dms.extend([u.hostname.replace('www.', '')])
311
312 return dms
313
315 '''
316 Returns valid SHA hashes for url string.
317 Two different hashes will be returned if url domain starts with B{www} as I{http://www.domain.ext/} and I{http://domain.ext/} both point to the same domain B{I{domain.ext}}.
318 '''
319 hashes= [sha.new(self.url).hexdigest(), sha.new(self.url.replace('www.', '')).hexdigest()]
320 return hashes
321
323 '''
324 Returns a tuple consisting of various parts of a url.
325
326 @see: U{urlparse<http://docs.python.org/lib/module-urlparse.html>}
327 '''
328 return urlparse.urlparse(self.url)
329
330 parts= property(fget= getParts)
331 domainurl= property(fget= getDomainUrl)
332 robotstxturl= property(fget= getRobotsTxtUrl)
333 domains= property(fget= getDomains)
334 hashes= property(fget= getHashes)
335
336 - def join(self, uri):
337 '''
338 Joins two Uri objects and returns a new Uri object.
339
340 @rtype: L{Uri}
341 @returns: Joined L{Uri} instance.
342 '''
343 return Uri(urlparse.urljoin(self.url, uri.url))
344
345 - def issamedomain(self, uri):
346 '''
Determines if two urls belong to the same domain.
348 - I{http://B{domain.ext}/page1.htm} has same domain as I{http://B{domain.ext}/page2.htm}.
349 - I{http://B{domain.ext}/page1.htm} has same domain as I{http://B{www.domain.ext}/page2.htm} since I{http://www.domain.ext/} and I{http://domain.ext/} both point to the same domain B{I{domain.ext}}.
350
351
352 @type uri: L{Uri}.
353 @param uri: Valid instance of L{Uri} object.
354 @rtype: U{boolean<http://docs.python.org/lib/truth.html>}
355 @returns: True if urls belong to the same domain.
356 '''
357 result= False
358
359 if(None!= uri):
360
361 result= (self.domainurl.lower()== uri.domainurl.lower())
362
363 if(not result):
364
365 domains= [domain for domain in uri.domains if domain in self.domains]
366 result= (1<= len(domains))
367
368 return result
369
371 '''
372 Determines if two urls belong to the same host
373 - I{http://B{domain.ext}/page1.htm} has same host (domain) as I{http://B{domain.ext}/page2.htm}
374 - I{http://B{domain.ext}/page1.htm} does not have same host (domain) as I{http://B{otherdomain.ext}/page2.htm}
@see: L{issamedomain}
376
377 @type uri: L{Uri}.
378 @param uri: Valid instance of L{Uri} object.
379 @rtype: U{boolean<http://docs.python.org/lib/truth.html>}
380 @returns: True if urls belong to the same domain (host).
381 '''
382 return self.issamedomain(uri)
383
384 - def isdomainscope(self, uri):
385 '''
Determines if two urls have the same domain, or if either url comes from a sub-domain of the other url.
387 - I{http://B{domain.ext}/page1.htm} comes from the same domain as I{http://B{domain.ext}/page2.htm}
388 - I{http://B{example.domain.ext}/page1.htm} comes from a sub-domain as I{http://B{domain.ext}/page2.htm}.
389 B{example.domain.ext} is a sub-domain of B{domain.ext}.
390 - I{http://B{domain.ext}/page1.htm} does not come from same domain, or sub-domain as I{http://B{otherdomain.ext}/page2.htm}
391
392 @note: Sub-domain is simply determined if B{example.domain.ext} I{ends} in B{domain.ext}.
393 @rtype: U{boolean<http://docs.python.org/lib/truth.html>}
394 @returns: True if urls belong to the same domain or either of the urls comes from a sub-domain of the other url.
395 '''
396 result= False
397
398 if(None!= uri):
399
400 d1= self.parts.hostname.replace('www', '')
401 d2= d1.split('.')
402
403 basedomain= d1
404
405
406
407 otherdomain= uri.parts.hostname and uri.parts.hostname or ''
408
409
410 result= otherdomain.endswith(basedomain)
411
412 return result
413
415 '''
416 Determines if two urls belong to the same folder.
417 - I{http://B{domain.ext/support}/page1.htm} belongs to the same folder B{support} as I{http://B{domain.ext/support}/page2.htm}
418 - I{http://B{domain.ext}/index.htm} does not belong to same folder B{support} as I{http://B{domain.ext/support}/page2.htm}
419
420 @type uri: L{Uri}.
421 @param uri: Valid instance of L{Uri} object.
422 @rtype: U{boolean<http://docs.python.org/lib/truth.html>}
423 @returns: True if urls belong to the same folder.
424 '''
425 result= False
426
if(None!= uri):
428
429 basepath= self.parts.netloc+ os.path.split(self.parts.path)[0]
430
431 otherpath= uri.parts.netloc+ uri.parts.path
432
433
434 result= otherpath.startswith(basepath)
435
436 return result
437
439 '''
440 String representation of the url.
441
442 @rtype: U{str<http://docs.python.org/lib/typesseq.html>}
443 @returns: String representation of the url.
444 '''
445 return self.url
446
447 __repr__= __str__
448
449
451 '''
452 Determines if two urls are identical by comparing their SHA hashes
453
454 @type uri: L{Uri}.
455 @param uri: Valid instance of L{Uri} object.
456 @rtype: U{boolean<http://docs.python.org/lib/truth.html>}
457 @returns: True if urls are identical, False otherwise.
458 '''
459 if(None== uri):
460 return False
461 else:
462 return (uri.hash== self.hash)
463
465 '''
466 Determines if two urls are not identical by comparing their SHA hashes
467
468 @type uri: L{Uri}.
469 @param uri: Valid instance of L{Uri} object.
470 @rtype: U{boolean<http://docs.python.org/lib/truth.html>}
471 @returns: True if urls are not identical, False otherwise.
472 '''
473 if(None== uri):
474 return True
475 else:
476 return (uri.hash!= self.hash)
477
479 '''
480 Ruya's document object represents an html document.
It provides ready-to-use access to the document's http headers and various other properties such as the title, keywords etc.
It also allows access to the plain html contents, as well as gzipped or bz2 archived versions of them.
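
Example (a brief sketch; the attributes are populated by a L{Crawler} after a successful crawl, and the url is illustrative only)::
 import ruya

 cfg= ruya.Config(ruya.Config.CrawlConfig(levels= 1), ruya.Config.RedirectConfig(), ruya.Config.LogConfig())
 doc= ruya.Document(ruya.Uri('http://www.python.org/'))
 ruya.Crawler(cfg).crawl(doc)

 print doc.title, doc.lastmodified, doc.etag
 print len(doc.plaincontent)  # html converted to UTF-8
 print len(doc.zippedcontent) # the same content, gzip-compressed on demand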
483 '''
484
485 - def __init__(self, uri, lastmodified= '', etag= ''):
486 '''
487 Constructor.
488
489 @type uri: L{Uri}.
490 @param uri: Valid instance of L{Uri} object.
491 @type lastmodified: U{str<http://docs.python.org/lib/typesseq.html>}
492 @param lastmodified: I{Last-modified} header value obtained from last crawl, if any.
493 @type etag: U{str<http://docs.python.org/lib/typesseq.html>}
494 @param etag: I{Etag} header value obtained from last crawl, if any.
495
496 @rtype: U{None<http://docs.python.org/lib/bltin-null-object.html>}
497 @returns: U{None<http://docs.python.org/lib/bltin-null-object.html>}
498 '''
499 self.headers= {}
500 self._uri= uri
501 self.title= ''
502 self.description= ''
503 self.keywords= ''
504 self.lastmodified= lastmodified
505 self.etag= etag
506 self.httpstatus= 200
507 self.httpreason= ''
508 self.contenttype= ''
509 self.contentencoding= ''
510 self._zippedcontent= ''
511 self._isZipped= False
512 self._bzippedcontent= ''
513 self._isBzipped= False
514 self._plaincontent= ''
515 self.links= []
516 self.redirecturi= None
517 self.redirects= 0
518 self.redirecturis= []
519 self.error= None
520
521
522 self._cleandata= re.compile('(?msix)[\r\n\f\v]+')
523
525 '''
526 Returns the url for this document.
527 '''
528 return self._uri
529
530 uri= property(fget= getUri)
531
533 '''
Returns all links from this document, converted to absolute links with reference to the document's L{uri}.
535 '''
536 nlinks= []
537 for link in self.links:
538 nlinks.extend([self.uri.join(link)])
539
540 return nlinks
541
542 normalizedlinks= property(fget= getNormalizedLinks)
543
545 '''
546 Returns gzipped content for this document.
547 @note: The content is gzipped with the maximum compression level of 9.
548 @see: U{gzip<http://docs.python.org/lib/module-gzip.html>}
549 '''
550 zc= ''
551
552 if(self._isZipped):
553
554 zc= self._zippedcontent
555
556 else:
557 if(0!= len(self.plaincontent)):
558 try:
559 pc= StringIO()
560 pcz= gzip.GzipFile(None, 'wb', 9, pc)
561 pcz.write(self.plaincontent)
562 pcz.close()
563 zc= pc.getvalue()
564
565
566 self._zippedcontent= zc
567 self._isZipped= True
568
569 except:
570 zc= ''
571
572 return zc
573
574 - def setZippedContent(self, data):
575 '''
576 Sets the gzipped content for the document.
577 @note: The content is unzipped assuming the compression level of 9.
578 @see: U{gzip<http://docs.python.org/lib/module-gzip.html>}
579 '''
580 if(None== data): data= ''
581
582 self.plaincontent= ''
583 self._zippedcontent= ''
584
585 if(0!= len(data)):
586 try:
587 pcz= StringIO(data)
588 zc= gzip.GzipFile(None, 'rb', 9, pcz)
589 self.plaincontent= zc.read()
590 zc.close()
591
592
593 self._zippedcontent= data
594 self._isZipped= True
595
596 except:
597 pass
598
599 zippedcontent= property(fget= getZippedContent, fset= setZippedContent)
600
602 '''
Returns the bz2 archived contents for this document.
604 @note: The content is bz2 archived with the maximum compression level of 9.
605 @see: U{bz2<http://docs.python.org/lib/module-bz2.html>}
606 '''
607 zc= ''
608
609 if(self._isBzipped):
610
611 zc= self._bzippedcontent
612
613 else:
614 if(0!= len(self.plaincontent)):
615 try:
616 zc= bz2.compress(self.plaincontent, 9)
617
618
619 self._bzippedcontent= zc
620 self._isBzipped= True
621
622 except:
623 zc= ''
624
625 return zc
626
627 - def setBzippedContent(self, data):
628 '''
629 Sets the bz2 archived contents for this document.
630 @see: U{bz2<http://docs.python.org/lib/module-bz2.html>}
631 '''
632 if(None== data): data= ''
633
634 self.plaincontent= ''
635 self._bzippedcontent= ''
636
637 if(0!= len(data)):
638 try:
self.plaincontent= bz2.decompress(data)
640
641
642 self._bzippedcontent= data
643 self._isBzipped= True
644
645 except:
646 pass
647
648 bzippedcontent= property(fget= getBzippedContent, fset= setBzippedContent)
649
650 - def getPlainContent(self):
651 '''
652 Returns the plain html content for this document.
653 '''
654 return self._plaincontent
655
656 - def setPlainContent(self, data):
657 '''
658 Sets the plain html content for this document.
659 @note: Empty lines are removed from the plain contents.
660 '''
661 if(None== data):
662 data= ''
663 else:
664
665 data= self._cleandata.sub('', data)
666
667 self._plaincontent= data
668 self._isZipped, self._isBzipped= False, False
669
670 plaincontent= property(fget= getPlainContent, fset= setPlainContent)
671
672 - def getContentHash(self):
673 '''
674 Returns the SHA hash for plain contents of this document.
675 '''
676 cc= self.plaincontent
677 cchash= ''
678
679 if(0!= len(cc)):
680 cchash= sha.new(cc).hexdigest()
681
682 return cchash
683
684 hash= property(fget= getContentHash)
685
687 '''
Ruya's document error object represents an error that occurred during the crawl of a L{Document}.
689 '''
690 DOCERR_INTERNAL= 100000
691
692 DOCERR_CONVERSION= 100001
693 DOCERR_MAXREDIRECT= 100002
694 DOCERR_NO_REDIRECTALLOW= 100003
695 DOCERR_URLERROR= 100004
696 DOCERR_UNHANDLED_HTTPSTATUS= 100005
697 DOCERR_CRAWL_NOTALLOW= 100006
698 DOCERR_INVALID_CRAWLSCOPE= 100007
699 DOCERR_INVALID_MIME= 100009
700 DOCERR_NOHTML= 100010
701
703 '''
704 Constructor.
705
706 @note: Please refer to B{Instance Variables} section for details on each parameter.
707
708 @rtype: U{None<http://docs.python.org/lib/bltin-null-object.html>}
709 @returns: U{None<http://docs.python.org/lib/bltin-null-object.html>}
710 '''
711 self.code= code
712 self.type= type
713 self.value= value
714
716 '''
717 String representation of this object.
718
719 @rtype: U{str<http://docs.python.org/lib/typesseq.html>}
@returns: String representation of the error.
721 '''
722 return `self.code`+ ' '+ `self.type`+ ': '+ `self.value`
723
724 __repr__= __str__
725
727 '''
728 Ruya's main object is the Crawler object.
729 This object uses L{configuration<Config>} settings, and performs a crawl on given L{url<Uri>}.
730 Developers can extend Ruya's Crawler and create more sophisticated crawlers similar to Ruya's L{SingleDomainDelayCrawler}.
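
Example (a minimal sketch of such an extension; B{MyCrawler} is a hypothetical name)::
 import ruya

 class MyCrawler(ruya.Crawler):
    def crawl(self, document, level= 0):
       # Custom book-keeping could be added here before delegating to the base crawler
       return ruya.Crawler.crawl(self, document, level)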
731 '''
733 '''
734 Constructor.
735
736 @type config: L{Config}.
737 @param config: The L{configuration<Config>} object to be used while crawling.
738
739 @rtype: U{None<http://docs.python.org/lib/bltin-null-object.html>}
740 @returns: U{None<http://docs.python.org/lib/bltin-null-object.html>}
741 '''
742 self.config= config
743
744 self.kc= kconv.Kconv(outcode= kconv.UTF8, incode= kconv.AUTO, hankanaconvert= kconv.HANKAKU, checkmode= kconv.TABLE2, mode= kconv.LINE)
745
746
747
748 self.callbacks= {'beforecrawl': [],
749 'aftercrawl': [],
750 'includelink': []}
751
752 - def bind(self, event, eventhandler, addnleventargs):
753 '''
Binds an eventhandler (a callback, i.e. a pointer to a function) to one of Ruya's events.
755 Example::
756 crawlerobj.bind('beforecrawl', myfunction, None)
757 ...
758
759 def myfunction(caller, eventargs):
760 ...
761
762 @note: The eventhandler should have signature as B{func(caller, eventargs)}
763 @type event: U{str<http://docs.python.org/lib/typesseq.html>}.
764 @param event: Must be one of the following values- I{beforecrawl}, I{aftercrawl}, I{includelink}.
765 @type eventhandler: U{function<http://docs.python.org/lib/typesfunctions.html>}
766 @param eventhandler: User-defined function having function signature as B{function(caller, eventargs)}
767 @type addnleventargs: U{list<http://docs.python.org/lib/typesseq.html>}.
768 @param addnleventargs: Additional L{event arguments<Crawler.EventArgs>} to be passed when calling eventhandler.
769 @see: L{callbacks}
770
771 @rtype: U{None<http://docs.python.org/lib/bltin-null-object.html>}
772 @return: U{None<http://docs.python.org/lib/bltin-null-object.html>}.
773 '''
774 self.callbacks[event].append([eventhandler, addnleventargs])
775
777 '''
Fires the eventhandlers (callbacks, i.e. pointers to functions) registered for one of Ruya's events.
779
780 @type events: U{list<http://docs.python.org/lib/typesseq.html>}
781 @param events: List of callbacks (eventhandlers) to invoke.
782 @type eargs: L{Crawler.EventArgs}.
783 @param eargs: Additional L{event arguments<Crawler.EventArgs>} to pass while invoking event handlers.
784 @see: L{bind}
785 @note: While invoking multiple event-handlers sequentially, if any of the event-handlers sets L{ignore<Crawler.EventArgs.ignore>} to True, it is remembered, and cannot be reset by any event handler in the chain.
786
787 @rtype: U{tuple<http://docs.python.org/lib/typesseq.html>}
@return: (cancel, ignore) values set either internally or explicitly by event handlers.
789 '''
790 cancel, ignore= False, False
791
792 for (event, eventargs) in events:
793 ignore2, cancel= False, False
794 eargs.cancel, eargs.args= False, eventargs
795 event(self, eargs)
796
797
798 ignore2, cancel= eargs.ignore, eargs.cancel
799 if(cancel): break
800
801
802
803 if(ignore2): ignore= True
804
805 return (cancel, ignore)
806
808 '''
809 Performs a U{HEAD<http://www.w3.org/Protocols/rfc2616/rfc2616-sec9.html>} crawl on L{Document}'s url.
810 The L{beforecrawl<callbacks>} events are fired before a url is crawled.
811 It uses headers from L{Document} instance, and uses L{robots.txt<Config.CrawlConfig.obeyrobotstxt>} rules while crawling if allowed.
812
813 @note: As redirects are also handled, the L{beforecrawl<callbacks>} event can be fired multiple times if a url is redirected to another url.
814 @type document: L{Document}.
815 @param document: Valid instance of L{Document} object.
816 @type level: number.
817 @param level: The current level of the document being crawled.
818
819 @rtype: U{tuple<http://docs.python.org/lib/typesseq.html>}
@return: (cancel, ignore) values set either internally or explicitly by event handlers.
821 '''
822 cc, rc, lc= self.config.crawlconfig, self.config.redirectconfig, self.config.logconfig
823 log, debug, info, warning, error, critical, exception= lc.log, lc.debug, lc.info, lc.warning, lc.error, lc.critical, lc.exception
824
825 debug('Crawler.beforecrawl(): Started...' %locals())
826
827 uri, doc, maxcontentbytes, truncate, discard= \
828 (None== document.redirecturi) and document.uri or document.redirecturi, document, cc.maxcontentbytes, cc.maxcontenttruncate, cc.maxcontentdiscard
829
830 httpstatus, httpreason, headers, cancel, ignore= 0, '', {}, False, False
831 debug('Crawler.beforecrawl(): Firing events before crawling of url "%(uri)s" at level %(level)d...' %locals())
832
833 cancel, ignore= self.firevents(self.callbacks['beforecrawl'], Crawler.CrawlEventArgs(level, doc, None))
834
835 if(cancel):
836 critical('Crawler.beforecrawl(): Cancelling crawl at url "%(uri)s" level %(level)d as one of the event-handlers requested to cancel...' %locals())
837
838 elif(ignore):
839 warning('Crawler.beforecrawl(): Ignoring url "%(uri)s" level %(level)d as one of the event-handlers requested to ignore...' %locals())
840
841 else:
842 cancel, ignore= False, False
843 httpcon, response= None, None
844 crawl_allowed= True
845
846 debug('Crawler.beforecrawl(): HEAD Crawling url "%(uri)s"...' %locals())
847 defhdrs= {
848 'User-agent': cc.useragent,
849 'From': cc.crawlfrom,
850 'Accept-encoding': cc.acceptencoding,
851 'Accept-charset': 'utf-8',
852 'Accept': ','.join(cc.allowedmimes),
853 'If-none-match': doc.etag,
854 'If-modified-since': doc.lastmodified,
855 'Range': '0-%(maxcontentbytes)d' %locals()
856 }
857
858 if(cc.obeyrobotstxt):
859 cancel, ignore= False, False
860 robotstxt, useragent= uri.robotstxturl, cc.useragent
861 debug('Crawler.beforecrawl(): Obeying "%(robotstxt)s" for url "%(uri)s" using "User-agent: %(useragent)s"...' %locals())
862
863 rp= RobotFileParser()
864 rp.set_url(robotstxt)
865 rp.read()
866
867 crawl_allowed= rp.can_fetch(useragent, uri.parts.path)
868
869 allow= crawl_allowed and 'allowed' or 'not allowed'
870 cancel= not crawl_allowed
871
872 debug('Crawler.beforecrawl(): Crawling %(allow)s for url "%(uri)s" using "User-agent: %(useragent)s" as per rules in "%(robotstxt)s"...' %locals())
873
874 else:
875 debug('Crawler.beforecrawl(): Ignoring "%(robotstxt)s" for url "%(uri)s"...' %locals())
876
877 if(crawl_allowed):
878 cancel, ignore= False, False
879
880
881 errhttpstatuses= [httplib.REQUESTED_RANGE_NOT_SATISFIABLE, httplib.NOT_ACCEPTABLE]
882
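# Prime httpstatus with one of the "retryable" statuses so that the while-loop below is entered at least once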
883 httpstatus= errhttpstatuses[random.randint(0, -1+ len(errhttpstatuses))]
884
885 while(httpstatus in errhttpstatuses):
886 retry, maxretries, retrydelay= 1, cc.maxretries, cc.retrydelay
887
888 while(1<= retry<= maxretries):
889 try:
890 httpcon= httplib.HTTPConnection(uri.parts.netloc)
891 httpcon.request('HEAD', uri.parts.path, None, defhdrs)
892
893 response= httpcon.getresponse()
894
895 httpstatus, httpreason= response.status, response.reason
896 headers.clear(), headers.update(response.getheaders())
897
898
899 if(httplib.REQUESTED_RANGE_NOT_SATISFIABLE== httpstatus):
900 reqrange= defhdrs['Range']
901 debug('Crawler.beforecrawl(): %(httpstatus)d %(httpreason)s range "%(reqrange)s" bytes for url "%(uri)s". Retrying again without "Range"..' %locals())
902 del defhdrs['Range']
903 response.close(), httpcon.close()
904
905
906 if(httplib.NOT_ACCEPTABLE== httpstatus):
907 reqcharset= defhdrs['Accept-charset']
908 debug('Crawler.beforecrawl(): %(httpstatus)d %(httpreason)s charset "%(reqcharset)s" for url "%(uri)s". Retrying again without "Accept-charset"..' %locals())
909 del defhdrs['Accept-charset']
910 response.close(), httpcon.close()
911
912 except:
913 response.close(), httpcon.close()
914
915 if(retry== maxretries):
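# Note: DOCERR_MAXREDIRECT is reused here to flag that the maximum number of retries was exhausted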
916 doc.error= Document.DocumentError(Document.DocumentError.DOCERR_MAXREDIRECT, sys.exc_info()[0], sys.exc_info()[1])
917
918 else:
919 exception('Crawler.beforecrawl(): Retrying again after %(retrydelay)d seconds till max. %(maxretries)d retries. %(retry)d retry(ies) till now...' %locals())
920 retry+= 1
921 time.sleep(retrydelay)
922
923 else:
924 break
925
926
927 response.close(), httpcon.close()
928
929
930 if(httpstatus in (httplib.MOVED_PERMANENTLY, httplib.TEMPORARY_REDIRECT, httplib.FOUND)):
931 cancel, ignore= False, False
932
933 rediruri= Uri(headers.get('location', ''))
934
935 debug('Crawler.beforecrawl(): %(httpstatus)d %(httpreason)s for url "%(uri)s" to "%(rediruri)s"...' %locals())
936
937
938 redirects= abs(doc.redirects)
939 maxredirects= rc.maxredirects
940
941 if(rc.allowredirect):
942 cancel, ignore= False, False
943 debug('Crawler.beforecrawl(): Auto-redirect url "%(uri)s" to "%(rediruri)s"...' %locals())
944
945 if(0<= redirects <= maxredirects):
946 redirects+= 1
947 doc.redirects= redirects
948 doc.redirecturi= rediruri
949 doc.redirecturis.append((doc.redirects, httpstatus, rediruri))
950
951
952 cancel, ignore= self.beforecrawl(doc, level)
953
954
955 debug('Crawler.beforecrawl(): Completed. Returning (httpstatus= %(httpstatus)d, cancel= %(cancel)s, ignore= %(ignore)s)...' %locals())
956 return (cancel, ignore)
957
958 else:
959 cancel= True
960 doc.error= Document.DocumentError(Document.DocumentError.DOCERR_MAXREDIRECT)
961
962 debug('Crawler.beforecrawl(): Max. auto-redirect url "%(uri)s" to "%(rediruri)s" exceeded %(maxredirects)d max. redirects. Will not crawl further...' %locals())
963
964 else:
965 cancel= True
966 doc.error= Document.DocumentError(Document.DocumentError.DOCERR_NO_REDIRECTALLOW)
967
968 debug('Crawler.beforecrawl(): Will not auto-redirect url "%(uri)s" to "%(rediruri)s" as allowredirect= False...' %locals())
969
970 else:
971
972 pass
973
974 else:
975 doc.error= Document.DocumentError(Document.DocumentError.DOCERR_CRAWL_NOTALLOW)
976 warning('Crawler.beforecrawl(): Not crawling url "%(uri)s"...' %locals())
977
978
979 doc.httpstatus, doc.httpreason= httpstatus, httpreason
980 doc.headers.clear(), doc.headers.update(headers)
981
982 debug('Crawler.beforecrawl(): Completed. Returning (httpstatus= %(httpstatus)d, cancel= %(cancel)s, ignore= %(ignore)s)...' %locals())
983 return (cancel, ignore)
984
986 '''
987 Performs a U{GET<http://www.w3.org/Protocols/rfc2616/rfc2616-sec9.html>} crawl on L{Document}'s url.
988 The L{aftercrawl<callbacks>} events are fired after a url is crawled L{successfully<Document.error>}.
989 The attributes for L{Document} object are extracted in this method.
990 The L{CrawlScope} is considered before including any L{links<Document.links>} for the L{Document}.
991
992 @type document: L{Document}.
993 @param document: Valid instance of L{Document} object.
994 @type level: number.
995 @param level: The current level of the document being crawled.
996
997 @rtype: U{tuple<http://docs.python.org/lib/typesseq.html>}
@return: (cancel, ignore) values set either internally or explicitly by event handlers.
999 '''
1000 cc, rc, lc= self.config.crawlconfig, self.config.redirectconfig, self.config.logconfig
1001 log, debug, info, warning, error, critical, exception= lc.log, lc.debug, lc.info, lc.warning, lc.error, lc.critical, lc.exception
1002
1003 debug('Crawler.aftercrawl(): Started...' %locals())
1004
1005 uri, doc, maxcontentbytes, truncate, discard= \
1006 (None== document.redirecturi) and document.uri or document.redirecturi, document, cc.maxcontentbytes, cc.maxcontenttruncate, cc.maxcontentdiscard
1007
1008
1009 httpstatus, httpreason, headers, cancel, ignore= doc.httpstatus, doc.httpreason, doc.headers, False, False
1010
1011 if(httpstatus in (httplib.OK, httplib.NOT_MODIFIED)):
1012 debug('Crawler.aftercrawl(): %(httpstatus)d %(httpreason)s for url "%(uri)s"...' %locals())
1013
1014 cancel, ignore= False, False
1015 httpcon, response= None, None
1016 html= ''
1017
1018 if(httplib.NOT_MODIFIED== httpstatus):
1019 debug('Crawler.aftercrawl(): Using previously crawled body (if either document.plaincontent or document.zippedcontent were set)...' %locals())
1020 html= doc.plaincontent
1021
1022 elif(httplib.OK== httpstatus):
1023 debug('Crawler.aftercrawl(): GET Crawling url "%(uri)s"...' %locals())
1024 defhdrs= {
1025 'User-agent': cc.useragent,
1026 'From': cc.crawlfrom,
1027 'Accept-encoding': cc.acceptencoding,
1028 'Accept-charset': 'utf-8',
1029 'Accept': ','.join(cc.allowedmimes),
1030 'If-none-match': doc.etag,
1031 'If-modified-since': doc.lastmodified,
1032 'Range': '0-%(maxcontentbytes)d' %locals()
1033 }
1034
1035
1036 errhttpstatuses= [httplib.REQUESTED_RANGE_NOT_SATISFIABLE, httplib.NOT_ACCEPTABLE]
1037
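# Prime httpstatus with one of the "retryable" statuses so that the while-loop below is entered at least once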
1038 httpstatus= errhttpstatuses[random.randint(0, -1+ len(errhttpstatuses))]
1039
1040 while(httpstatus in errhttpstatuses):
1041 retry, maxretries, retrydelay= 1, cc.maxretries, cc.retrydelay
1042
1043 while(1<= retry<= maxretries):
1044 try:
1045 httpcon= httplib.HTTPConnection(uri.parts.netloc)
1046 httpcon.request('GET', uri.parts.path, None, defhdrs)
1047
1048 response= httpcon.getresponse()
1049
1050 httpstatus, httpreason= response.status, response.reason
1051 headers.clear(), headers.update(response.getheaders())
1052
1053
1054 if(httplib.REQUESTED_RANGE_NOT_SATISFIABLE== httpstatus):
1055 reqrange= defhdrs['Range']
1056 debug('Crawler.aftercrawl(): %(httpstatus)d %(httpreason)s range "%(reqrange)s" bytes for url "%(uri)s". Retrying again without "Range"..' %locals())
1057 del defhdrs['Range']
1058 response.close(), httpcon.close()
1059
1060
1061 if(httplib.NOT_ACCEPTABLE== httpstatus):
1062 reqcharset= defhdrs['Accept-charset']
1063 debug('Crawler.aftercrawl(): %(httpstatus)d %(httpreason)s charset "%(reqcharset)s" for url "%(uri)s". Retrying again without "Accept-charset"..' %locals())
1064 del defhdrs['Accept-charset']
1065 response.close(), httpcon.close()
1066
1067 except:
1068 response.close(), httpcon.close()
1069
1070 if(retry== maxretries):
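# Note: DOCERR_MAXREDIRECT is reused here to flag that the maximum number of retries was exhausted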
1071 doc.error= Document.DocumentError(Document.DocumentError.DOCERR_MAXREDIRECT, sys.exc_info()[0], sys.exc_info()[1])
1072
1073 else:
1074 exception('Crawler.aftercrawl(): Retrying again after %(retrydelay)d seconds till max. %(maxretries)d retries. %(retry)d retry(ies) till now...' %locals())
1075 retry+= 1
1076 time.sleep(retrydelay)
1077
1078 else:
1079 break
1080
1081
1082 if(httplib.OK!= httpstatus):
1083
1084 response.close(), httpcon.close()
1085
1086
1087 if(httplib.OK== httpstatus):
1088 cancel, ignore= False, False
1089 debug('Crawler.aftercrawl(): %(httpstatus)d %(httpreason)s for url "%(uri)s"...' %locals())
1090
1091 html= ''
1092
1093 contenttype= headers.get('content-type', '')
1094 doc.contenttype= contenttype.lower()
1095
1096 debug('Crawler.aftercrawl(): Checking content-type for url "%(uri)s"...' %locals())
1097
1098
1099 availablemimes= [v for v in cc.allowedmimes if (-1!= contenttype.find(v))]
1100
1101 if(1== len(availablemimes)):
1102 debug('Crawler.aftercrawl(): Allowed content-type for url "%(uri)s". Content-type is "%(contenttype)s"...' %locals())
1103
1104 totalbytes= int(headers.get('content-length', maxcontentbytes))
1105 bytestodownload= 0
1106
1107 debug('Crawler.aftercrawl(): Total %(totalbytes)d bytes to be downloaded for url "%(uri)s"...' %locals())
1108
1109 if(0>= totalbytes):
1110 bytestodownload= maxcontentbytes
1111
1112 elif(1<= totalbytes<= maxcontentbytes):
1113 bytestodownload= totalbytes
1114
1115 elif((totalbytes> maxcontentbytes) and truncate):
1116 bytestotruncate= abs(totalbytes- maxcontentbytes)
1117 bytestodownload= maxcontentbytes
1118 warning('Crawler.aftercrawl(): Truncating %(bytestotruncate)d bytes from total download size of %(totalbytes)d bytes as it exceeds max. download size limit of %(maxcontentbytes)d bytes allowed to be downloaded for url "%(uri)s" (Check if "maxcontenttruncate" is "True" in crawler configuration)...' %locals())
1119
1120 else:
1121 bytestodownload= 0
1122 warning('Crawler.aftercrawl(): Discarding as total download size of %(totalbytes)d bytes exceeds max. download size limit of %(maxcontentbytes)d bytes allowed to be downloaded for url "%(uri)s". Check if "maxcontenttruncate" is "False" in crawler configuration...' %locals())
1123
1124 if(0== bytestodownload):
1125 response.close(), httpcon.close()
1126
1127 else:
1128 debug('Crawler.aftercrawl(): Downloading content from url "%(uri)s"...' %locals())
1129
1130 debug('Crawler.aftercrawl(): Updating headers for document for url "%(uri)s"...' %locals())
1131 doc.lastmodified= headers.get('last-modified', '')
1132 doc.etag= headers.get('etag', '')
1133
1134 debug('Crawler.aftercrawl(): Reading total %(bytestodownload)d bytes from url "%(uri)s"...' %locals())
1135
1136 html= response.read(bytestodownload)
1137 response.close(), httpcon.close()
1138
1139
1140 contentencoding= headers.get('content-encoding', '')
1141 doc.contentencoding= contentencoding.lower()
1142
1143 if('gzip'== contentencoding.lower()):
1144 debug('Crawler.aftercrawl(): Contents are gzipped/compressed for url "%(uri)s". Unzipping/uncompressing contents...' %locals())
1145 doc.zippedcontent= html
1146
1147 else:
1148
1149 doc.plaincontent= html
1150
1151
1152 html= doc.plaincontent
1153
1154 else:
1155 doc.error= Document.DocumentError(Document.DocumentError.DOCERR_INVALID_MIME)
1156 warning('Crawler.aftercrawl(): Non-allowed content-type for url "%(uri)s". Content-type is "%(contenttype)s"...' %locals())
1157
1158
1159 if(0!= len(html)):
1160 if(kconv.UTF8!= kconv.ChkCoding(html)):
1161 debug('Crawler.aftercrawl(): Converting contents to utf-8 for url "%(uri)s". Content-type is "%(contenttype)s"...' %locals())
1162
1163 try:
1164 doc.plaincontent= self.kc.convert(html)
1165 html= doc.plaincontent
1166
1167 except:
1168 html= ''
1169 doc.error= Document.DocumentError(Document.DocumentError.DOCERR_CONVERSION, sys.exc_info()[0], sys.exc_info()[1])
1170
1171 exception('Crawler.aftercrawl(): Failed converting contents to utf-8 for url "%(uri)s"...' %locals())
1172
1173 else:
1174 debug('Crawler.aftercrawl(): Contents already in utf-8 for url "%(uri)s". Content-type is "%(contenttype)s"...' %locals())
1175
1176 else:
1177 doc.error= Document.DocumentError(Document.DocumentError.DOCERR_UNHANDLED_HTTPSTATUS)
1178 critical('Crawler.aftercrawl(): %(httpstatus)d %(httpreason)s for url "%(uri)s"...' %locals())
1179
1180
1181 if(0!= len(html)):
1182
1183 debug('Crawler.aftercrawl(): Parsing contents for url "%(uri)s"...' %locals())
1184 tags= htmldata.tagextract(html)
1185
1186 extitlestart, extitleend, title, = False, False, ''
1187 description, keywords, metarobots= '', '', ''
1188
1189
1190 for tag in tags:
1191
1192 if(isinstance(tag, tuple) and ('title'== tag[0].lower())):
1193 extitlestart= True
1194 extitleend= False
1195
1196 else:
1197 if(extitlestart and not extitleend and isinstance(tag, str)):
1198 title+= tag
1199
1200 if(isinstance(tag, tuple) and ('/title'== tag[0].lower())):
1201 extitlestart= True
1202 extitleend= True
1203
1204
1205 if(isinstance(tag, tuple) and ('meta/'== tag[0])):
1206 if('description'== tag[1].get('name', '').lower()):
1207 description= tag[1].get('content', '').lower()
1208
1209 if('keywords'== tag[1].get('name', '').lower()):
1210 keywords= tag[1].get('content', '').lower()
1211
1212 if('robots'== tag[1].get('name', '').lower()):
1213 metarobots= tag[1].get('content', '').lower()
1214
1215 noindex, nofollow = False, False
1216
1217 if(cc.obeymetarobots):
1218 debug('Crawler.aftercrawl(): Obeying meta robots for url "%(uri)s"...' %locals())
1219
1220
1221 if(0== len(metarobots)):
1222 debug('Crawler.aftercrawl(): No meta robots found for url "%(uri)s". Crawling normally...' %locals())
1223
1224 else:
1225 debug('Crawler.aftercrawl(): Using meta robots "%(metarobots)s" found for url "%(uri)s"...' %locals())
1226 noindex= (-1!= metarobots.find('noindex'))
1227 nofollow= (-1!= metarobots.find('nofollow'))
1228
1229 else:
1230 debug('Crawler.aftercrawl(): Ignoring meta robots for url "%(uri)s"...' %locals())
1231
1232 debug('Crawler.aftercrawl(): Setting document attributes for url "%(uri)s"...' %locals())
1233
1234 if(noindex):
1235 warning('Crawler.aftercrawl(): According to meta robots "noindex" is specified for url "%(uri)s". No data will be used from this url. However links may be crawled if "follow" is specified...' %locals())
1236
1237 else:
1238 doc.title= title
1239 doc.description= description
1240
1241 if(nofollow):
1242 warning('Crawler.aftercrawl(): According to meta robots "nofollow" is specified for url "%(uri)s". No links will be used from this url. However, data may be used if "index" is specified...' %locals())
1243
1244 else:
1245 cancel, ignore= False, False
1246
1247 debug('Crawler.aftercrawl(): Extracting all links from "%(uri)s"...' %locals())
1248
1249
1250 linkmatches= htmldata.urlextract(html, uri.url)
1251 links= [Uri(linkmatch.url) for linkmatch in linkmatches]
1252
1253 totallinks= len(links)
1254 debug('Crawler.aftercrawl(): Filtering %(totallinks)d links from "%(uri)s" having valid crawl extensions (see "allowedextns" in crawler config)...' %locals())
1255 links= [link for link in links
1256 if os.path.splitext(link.parts.path)[1] in cc.allowedextns]
1257
1258 filterlinks1= len(links)
1259 debug('Crawler.aftercrawl(): Filtered %(filterlinks1)d links from "%(uri)s". Filtering further those satisfying requested crawl scope...' %locals())
1260
1261 crawllinks= []
1262 for link in links:
1263 if(CrawlScope.SCOPE_HOST== cc.crawlscope):
1264 if(uri.ishostscope(link)):
1265 if(link not in crawllinks):
1266 crawllinks.append(link)
1267
1268 elif(CrawlScope.SCOPE_DOMAIN== cc.crawlscope):
1269 if(uri.isdomainscope(link)):
1270 if(link not in crawllinks):
1271 crawllinks.append(link)
1272
1273 elif(CrawlScope.SCOPE_PATH== cc.crawlscope):
1274 if(uri.ispathscope(link)):
1275 if(link not in crawllinks):
1276 crawllinks.append(link)
1277
1278 else:
1279 cancel= True
1280 doc.error= Document.DocumentError(Document.DocumentError.DOCERR_INVALID_CRAWLSCOPE)
1281
critical('Crawler.aftercrawl(): Unknown crawl scope for url "%(uri)s". Please check the crawler\'s crawl scope for valid scopes...' %locals())
1283 break
1284
1285 if(cancel):
1286 pass
1287
1288 else:
1289 doc.links.extend(crawllinks)
1290
1291 filterlinks2= len(crawllinks)
1292 ignorelinks= abs(totallinks- filterlinks2)
1293
1294 debug('Crawler.aftercrawl(): Found %(filterlinks2)d links from "%(uri)s" to crawl...' %locals())
1295 debug('Crawler.aftercrawl(): Total links= %(totallinks)d, Links to crawl= %(filterlinks2)d, Links ignored= %(ignorelinks)d from "%(uri)s"...' %locals())
1296
1297 else:
1298 doc.error= Document.DocumentError(Document.DocumentError.DOCERR_NOHTML)
1299 warning('Crawler.aftercrawl(): No html body available for url "%(uri)s"...' %locals())
1300
1301 else:
1302 doc.error= Document.DocumentError(Document.DocumentError.DOCERR_UNHANDLED_HTTPSTATUS)
1303 critical('Crawler.aftercrawl(): %(httpstatus)d %(httpreason)s for url "%(uri)s"...' %locals())
1304
1305
1306 doc.httpstatus, doc.httpreason= httpstatus, httpreason
1307 doc.headers.clear(), doc.headers.update(headers)
1308
1309 if(cancel):
1310 pass
1311
1312 else:
1313 debug('Crawler.aftercrawl(): Firing events after crawling of url "%(uri)s" at level %(level)d...' %locals())
1314 cancel, ignore= False, False
1315 cancel, ignore= self.firevents(self.callbacks['aftercrawl'], Crawler.CrawlEventArgs(level, doc, None))
1316
1317 debug('Crawler.aftercrawl(): Completed. Returning (httpstatus= %(httpstatus)d, cancel= %(cancel)s, ignore= %(ignore)s)...' %locals())
1318 return (cancel, ignore)
1319
1321 '''
1322 Ruya's L{Crawler} provides event-based callback mechanism during crawl to allow clients to have more control over which urls are crawled.
1323 The events use this object for event communication.
1324
1325 Example::
1326 # Client side event handler
1327 def beforecrawl(caller, eventargs):
1328 # Some process
1329 # ...
1330
1331 # Url is already crawled before (might be determined based on a simple dictionary caching mechanism)
eventargs.ignore= True # Request Ruya to ignore this url during crawl
1333
1334 # ...
1335
1336 def aftercrawl(caller, eventargs):
1337 # Some process
1338 # ...
1339
1340 # Some error occurred during saving crawled data (might be a file or database), abort further crawling
1341 eventargs.cancel= True # Cancel crawling completely
1342
1343 # ...
1344
1345 @see: L{Crawler.bind}
1346 '''
1347
1348 - def __init__(self, level= 0, args= []):
1349 '''
1350 Constructor.
1351
1352 @type level: U{number<http://docs.python.org/lib/typesnumeric.html>}.
1353 @param level: Crawl level on which the event was raised.
1354 @type args: U{list<http://docs.python.org/lib/typesseq.html>}
1355 @param args: Additional arguments to be passed back to event handler.
1356
1357 @rtype: U{None<http://docs.python.org/lib/bltin-null-object.html>}
1358 @returns: U{None<http://docs.python.org/lib/bltin-null-object.html>}
1359 '''
1360 self.level= level
1361 self.args= args
1362 self.cancel= False
1363 self.ignore= False
1364
1366 '''
1367 Ruya's L{Crawler} provides event-based callback mechanism during crawl to allow clients to have more control over which urls are crawled.
1368 The events use this object for event communication for B{beforecrawl}, and B{aftercrawl} events.
1369 '''
1370
1371 - def __init__(self, level= 0, document= None, args= []):
1372 '''
1373 Constructor.
1374
1375 @type document: L{Document}.
1376 @param document: L{Document} object used during crawl
1377 @type args: U{list<http://docs.python.org/lib/typesseq.html>}
1378 @param args: Additional arguments to be passed back to event handler.
1379
1380 @rtype: U{None<http://docs.python.org/lib/bltin-null-object.html>}
1381 @returns: U{None<http://docs.python.org/lib/bltin-null-object.html>}
1382 '''
1383 Crawler.EventArgs.__init__(self, level, args)
1384 self.document= document
1385
1387 '''
1388 Ruya's L{Crawler} provides event-based callback mechanism during crawl to allow clients to have more control over which urls are crawled.
1389 The events use this object for event communication for B{includelink} event.
1390 '''
1391
1392 - def __init__(self, level= 0, uri= None, args= []):
1393 '''
1394 @type level: U{number<http://docs.python.org/lib/typesnumeric.html>}.
1395 @param level: Crawl level on which the event was raised.
1396 @type uri: L{Uri}
1397 @param uri: The L{Uri} to include in crawl.
1398 @type args: U{list<http://docs.python.org/lib/typesseq.html>}
1399 @param args: Additional arguments to be passed back to event handler.
1400
1401 @rtype: U{None<http://docs.python.org/lib/bltin-null-object.html>}
1402 @returns: U{None<http://docs.python.org/lib/bltin-null-object.html>}
1403 '''
1404 Crawler.EventArgs.__init__(self, level, args)
1405 self.uri= uri
1406
1407 - def crawl(self, document, level= 0):
1408 '''
1409 The main method where actual crawling is performed.
1410
1411 @type document: L{Document}.
1412 @param document: The L{Document} to crawl.
1413 @type level: U{number<http://docs.python.org/lib/typesnumeric.html>}.
1414 @param level: The level on which the L{Document} is crawled.
1415
@todo: URL canonicalization: http://www.archive.org/index.html and http://www.archive.org/ are the same
1417 @todo: Tidy?
1418 @todo: Avoiding slow links? Currently handled by timeout from U{httplib<http://docs.python.org/lib/module-httplib.html>}.
1419 @todo: Support Crawl-Delay from robots.txt?.
1420 @todo: Detecting "soft" 404? http://blog.dtiblog.com/community-hobby.html => http://blog.dtiblog.com/404.html
1421
1422 @rtype: U{tuple<http://docs.python.org/lib/typesseq.html>}
@return: (cancel, ignore) values set either internally or explicitly by event handlers.
1424 '''
1425 cc, rc, lc= self.config.crawlconfig, self.config.redirectconfig, self.config.logconfig
1426 log, debug, info, warning, error, critical, exception= lc.log, lc.debug, lc.info, lc.warning, lc.error, lc.critical, lc.exception
1427 uri= document.uri
1428
1429 debug('Crawler.crawl(): Started...' %locals())
1430
1431 try:
1432 cancel, ignore= False, False
1433 cancel, ignore= self.beforecrawl(document, level)
1434
1435 if(cancel or ignore):
1436
1437 pass
1438
1439 else:
1440 cancel, ignore= False, False
1441 cancel, ignore= self.aftercrawl(document, level)
1442
1443 except:
1444 document.error= Document.DocumentError(Document.DocumentError.DOCERR_INTERNAL, sys.exc_info()[0], sys.exc_info()[1])
1445 exception('Crawler.crawl(): Failed crawling url "%(uri)s"...' %locals())
1446
1447 debug('Crawler.crawl(): Completed. Returning (cancel= %(cancel)s, ignore= %(ignore)s)...' %locals())
1448 return (cancel, ignore)
1449
1451 '''
1452 Ruya's single domain delayed crawler is an enhancement to Ruya's base L{crawler<Crawler>}.
1453 B{This is a U{breadth-first<http://en.wikipedia.org/wiki/Breadth-first_search>} crawler with delay between each crawl request}.
1454 '''
1455 - def __init__(self, config):
1456 '''
1457 Constructor.
1458
1459 @type config: L{Config}.
1460 @param config: The L{configuration<Config>} object to be used while crawling.
1461
1462 @rtype: U{None<http://docs.python.org/lib/bltin-null-object.html>}
1463 @returns: U{None<http://docs.python.org/lib/bltin-null-object.html>}
1464 '''
1465 Crawler.__init__(self, config)
1466
1467 - def crawl(self, document, level= 0):
1468 '''
1469 The main method where actual crawling is performed.
1470
1471 @type document: L{Document}.
1472 @param document: The L{Document} to crawl.
1473 @type level: U{number<http://docs.python.org/lib/typesnumeric.html>}.
1474 @param level: The level on which the L{Document} is crawled.
1475
1476 @rtype: U{tuple<http://docs.python.org/lib/typesseq.html>}
@return: (cancel, ignore) values set either internally or explicitly by event handlers.
1478 '''
1479 cc, rc, lc= self.config.crawlconfig, self.config.redirectconfig, self.config.logconfig
1480 log, debug, info, warning, error, critical, exception= lc.log, lc.debug, lc.info, lc.warning, lc.error, lc.critical, lc.exception
1481
1482 debug('SingleDomainDelayCrawler.crawl(): Started...' %locals())
1483
1484 doc, maxlevels= document, cc.levels
1485 starturi, cancel, ignore= doc.uri, False, False
1486
1487 try:
1488 if(0<= level<= maxlevels):
1489 debug('SingleDomainDelayCrawler.crawl(): Starting to crawl url "%(starturi)s" on level %(level)d upto max. %(maxlevels)d level(s)...' %locals())
1490 nextleveldocs, cancel, ignore= self.crawlbreadth(level, maxlevels, starturi, [doc])
1491 totaldocs= len(nextleveldocs)
1492
1493
1494 if(cancel or (0>= totaldocs)):
1495
1496 pass
1497
1498 else:
1499 for level in range(1, maxlevels+1, 1):
1500 debug('SingleDomainDelayCrawler.crawl(): Starting to crawl url "%(starturi)s" on level %(level)d upto max. %(maxlevels)d level(s)...' %locals())
1501 if(cancel):
1502 critical('SingleDomainDelayCrawler.crawl(): Crawling cancelled at level %(level)d as one of the event-handlers requested an abort' %locals())
1503 break
1504
1505 if(ignore):
1506
1507 pass
1508
1509 nextleveldocs, cancel, ignore= self.crawlbreadth(level, maxlevels, starturi, nextleveldocs)
1510
1511 else:
1512 document.error= Document.DocumentError(Document.DocumentError.DOCERR_MAXREDIRECT)
1513 debug('SingleDomainDelayCrawler.crawl(): Cannot crawl url "%(starturi)s" on level %(level)d as it exceeds max. levels %(maxlevels)d...' %locals())
1514
1515 except:
1516 document.error= Document.DocumentError(Document.DocumentError.DOCERR_INTERNAL, sys.exc_info()[0], sys.exc_info()[1])
1517 exception('SingleDomainDelayCrawler.crawl(): Failed crawling url "%(starturi)s"...' %locals())
1518
1519 debug('SingleDomainDelayCrawler.crawl(): Completed crawling url "%(starturi)s" till max. %(maxlevels)d level(s)' %locals())
1520 return (cancel, ignore)
1521
1522 - def crawlbreadth(self, level, maxlevels, domainuri, documents):
1523 '''
1524 The main method where actual breadth-first crawling is performed.
1525
1526 @type level: U{number<http://docs.python.org/lib/typesnumeric.html>}.
1527 @param level: The level on which the L{Document} is crawled.
1528 @type maxlevels: U{number<http://docs.python.org/lib/typesnumeric.html>}.
1529 @param maxlevels: L{Maximum number<Config.CrawlConfig.levels>} of levels to crawl.
1530 @type domainuri: L{Uri}.
1531 @param domainuri: Valid instance of L{Uri} object.
1532 @type documents: U{list<http://docs.python.org/lib/typesseq.html>}
1533 @param documents: Documents list to which newly to-be-crawled urls are appended for later crawling.
1534
1535 @attention: Event L{includelink<Crawler.firevents>} is not fired for first L{Uri} where crawl is started, however L{beforecrawl<Crawler.firevents>} event might be fired if url is redirected.
1536 @rtype: U{tuple<http://docs.python.org/lib/typesseq.html>}
@return: (nextleveldocs, cancel, ignore) values set either internally or explicitly by event handlers.
1538 '''
1539
1540 cc, rc, lc= self.config.crawlconfig, self.config.redirectconfig, self.config.logconfig
1541 log, debug, info, warning, error, critical, exception= lc.log, lc.debug, lc.info, lc.warning, lc.error, lc.critical, lc.exception
1542
1543 debug('SingleDomainDelayCrawler.crawlbreadth(): Started...' %locals())
1544
1545 nextleveldocs, nextlevel, cancel, delay, cancel, ignore= [], level+ 1, False, cc.crawldelay, False, False
1546 totaldocs= len(documents)
1547
1548 for docindex in range(0, totaldocs, 1):
1549 cancel, ignore= False, False
1550 doc= documents[docindex]
1551 uri= doc.uri
1552
1553 try:
1554 debug('SingleDomainDelayCrawler.crawlbreadth(): Crawling url "%(uri)s" at level %(level)d...' %locals())
1555 cancel, ignore= Crawler.crawl(self, doc, level)
1556
1557 if(cancel):
1558 critical('SingleDomainDelayCrawler.crawlbreadth(): Crawling cancelled at url "%(uri)s" level %(level)d as one of the event-handlers requested an abort' %locals())
1559 break
1560
1561 if(ignore):
1562 debug('SingleDomainDelayCrawler.crawlbreadth(): Not using further links from url "%(uri)s" level %(level)d as one of the event-handlers requested to ignore' %locals())
1563 continue
1564
1565 cancel, ignore= False, False
1566
1567
1568 if(1<= nextlevel<= maxlevels):
1569 links, ulinks= doc.links, []
1570
1571 for link in links:
1572 cancel, ignore= False, False
1573
1574 debug('SingleDomainDelayCrawler.crawlbreadth(): Firing events before including url "%(link)s" to be crawled at next level %(nextlevel)d...' %locals())
1575 cancel, ignore= self.firevents(self.callbacks['includelink'], SingleDomainDelayCrawler.UriIncludeEventArgs(nextlevel, link, None))
1576
1577 if(cancel):
1578 critical('SingleDomainDelayCrawler.crawlbreadth(): Include link cancelled at url "%(uri)s" level %(level)d as one of the event-handlers requested an abort' %locals())
1579
1580
1581
1582
1583 break
1584
1585 if(ignore):
1586 debug('SingleDomainDelayCrawler.crawlbreadth(): Not including url "%(link)s" for crawling as one of the event-handlers requests not to include it for crawl at next level %(nextlevel)d...' %locals())
1587
1588
1589 ignore= False
1590
1591 continue
1592
1593 ulinks.extend([link])
1594
1595 if(cancel):
1596 pass
1597
1598 else:
1599 totallinks= len(ulinks)
1600 debug('SingleDomainDelayCrawler.crawlbreadth(): Scheduling %(totallinks)d links from url "%(uri)s" at level %(level)d to be crawled at next level %(nextlevel)d...' %locals())
1601 nextleveldocs.extend([Document(ulink) for ulink in ulinks])
1602
1603 else:
1604 debug('SingleDomainDelayCrawler.crawlbreadth(): Not scheduling links for crawling from url "%(uri)s" as next level %(nextlevel)d from current level %(level)d will be already over max. %(maxlevels)d level(s)...' %locals())
1605
1606
1607 if(cancel): break
1608
1609 if((-1+ totaldocs)!= docindex):
1610 debug('SingleDomainDelayCrawler.crawlbreadth(): Pausing for %(delay)s seconds after crawling url "%(uri)s" at level %(level)d...' %locals())
1611 time.sleep(delay)
1612
1613 except:
1614 doc.error= Document.DocumentError(Document.DocumentError.DOCERR_INTERNAL, sys.exc_info()[0], sys.exc_info()[1])
1615 exception('SingleDomainDelayCrawler.crawlbreadth(): Failed crawling url "%(uri)s"...' %locals())
1616 break
1617
1618 debug('SingleDomainDelayCrawler.crawlbreadth(): Completed. Returning (nextleveldocs, cancel= %(cancel)s, ignore= %(ignore)s)...' %locals())
1619 return (nextleveldocs, cancel, ignore)
1620