""" robotparser.py

    Copyright (C) 2000  Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://www.robotstxt.org/norobots-rfc.txt
"""

import collections
import urllib.parse
import urllib.request

__all__ = ["RobotFileParser"]

RequestRate = collections.namedtuple("RequestRate", "requests seconds")


class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []
        self.sitemaps = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        try:
            f = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                self.disallow_all = True
            elif err.code >= 400 and err.code < 500:
                self.allow_all = True
        else:
            raw = f.read()
            self.parse(raw.decode("utf-8").splitlines())

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        self.modified()
        for line in lines:
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.parse.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
                elif line[0] == "crawl-delay":
                    if state != 0:
                        # before trying to convert to int we need to make
                        # sure that robots.txt has valid syntax otherwise
                        # it will crash
                        if line[1].strip().isdigit():
                            entry.delay = int(line[1])
                        state = 2
                elif line[0] == "request-rate":
                    if state != 0:
                        numbers = line[1].split('/')
                        # check if all values are sane
                        if (len(numbers) == 2 and numbers[0].strip().isdigit()
                                and numbers[1].strip().isdigit()):
                            entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1]))
                        state = 2
                elif line[0] == "sitemap":
                    # According to http://www.sitemaps.org/protocol.html
                    # "This directive is independent of the user-agent line,
                    #  so it doesn't matter where you place it in your file."
                    # Therefore we do not change the state of the parser.
                    self.sitemaps.append(line[1])
        if state == 2:
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # Until the robots.txt file has been read or found not
        # to exist, we must assume that no agent can fetch anything.
        if not self.last_checked:
            return False
        # search for given user agent matches
        # the first match counts
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        url = urllib.parse.urlunparse(('', '', parsed_url.path,
            parsed_url.params, parsed_url.query, parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def crawl_delay(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.delay
        if self.default_entry:
            return self.default_entry.delay
        return None

    def request_rate(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.req_rate
        if self.default_entry:
            return self.default_entry.req_rate
        return None

    def site_maps(self):
        if not self.sitemaps:
            return None
        return self.sitemaps

    def __str__(self):
        entries = self.entries
        if self.default_entry is not None:
            entries = entries + [self.default_entry]
        return '\n\n'.join(map(str, entries))


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""

    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return ("Allow" if self.allowance else "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""

    def __init__(self):
        self.useragents = []
        self.rulelines = []
        self.delay = None
        self.req_rate = None

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.append(f"User-agent: {agent}")
        if self.delay is not None:
            ret.append(f"Crawl-delay: {self.delay}")
        if self.req_rate is not None:
            rate = self.req_rate
            ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
        ret.extend(map(str, self.rulelines))
        return '\n'.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True
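
# ----------------------------------------------------------------------
# Minimal usage sketch (not part of the original module).  It shows how
# the parser is typically driven; "www.example.com" and "ExampleBot" are
# illustrative placeholders, and the first half assumes the host actually
# serves a robots.txt file and that network access is available.
if __name__ == "__main__":
    rp = RobotFileParser("https://www.example.com/robots.txt")
    rp.read()  # fetch and parse the remote robots.txt
    agent = "ExampleBot"
    print(rp.can_fetch(agent, "https://www.example.com/some/page.html"))
    print(rp.crawl_delay(agent))    # int or None
    print(rp.request_rate(agent))   # RequestRate(requests=..., seconds=...) or None
    print(rp.site_maps())           # list of Sitemap URLs, or None

    # The parser can also be fed cached or locally stored lines directly.
    offline = RobotFileParser()
    offline.parse([
        "User-agent: *",
        "Disallow: /private/",
        "Crawl-delay: 2",
    ])
    print(offline.can_fetch("*", "/private/index.html"))  # False
    print(offline.can_fetch("*", "/public/index.html"))   # True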