\dBdZddlZddlZddlmZdgZejdZejdZejdZ ejdZ ejd Z ejd Z ejd Z ejd Zejd ZejdejZejd ZejdZGddejZdS)zA parser for HTML and XHTML.N)unescape HTMLParserz[&<]z &[a-zA-Z#]z%&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]z)&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]z <[a-zA-Z]>z--\s*>z+([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*z]((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*aF <[a-zA-Z][^\t\n\r\f />\x00]* # tag name (?:[\s/]* # optional whitespace before attribute name (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name (?:\s*=+\s* # value indicator (?:'[^']*' # LITA-enclosed value |"[^"]*" # LIT-enclosed value |(?!['"])[^>\s]* # bare value ) \s* # possibly followed by a space )?(?:\s|/(?!>))* )* )? \s* # trailing whitespace z#ceZdZdZdZdddZdZdZdZd Z d Z d Z d Z d Z dZddZdZdZdZdZdZdZdZdZdZdZdZdZdZdZd S) raEFind tags and other markup and call handler functions. Usage: p = HTMLParser() p.feed(data) ... p.close() Start tags are handled by calling self.handle_starttag() or self.handle_startendtag(); end tags by self.handle_endtag(). The data between tags is passed from the parser to the derived class by calling self.handle_data() with the data as argument (the data may be split up in arbitrary chunks). If convert_charrefs is True the character references are converted automatically to the corresponding Unicode character (and self.handle_data() is no longer split in chunks), otherwise they are passed by calling self.handle_entityref() or self.handle_charref() with the string containing respectively the named or numeric reference as the argument. )scriptstyleT)convert_charrefsc<||_|dS)zInitialize and reset this instance. If convert_charrefs is True (the default), all character references are automatically converted to the corresponding Unicode characters. N)r reset)selfr s ..\python\lib\html\parser.py__init__zHTMLParser.__init__Vs !1 cd|_d|_t|_d|_t j|dS)z1Reset this instance. Loses all unprocessed data.z???N)rawdatalasttaginteresting_normal interesting cdata_elem _markupbase ParserBaser r s r r zHTMLParser.reset_s<  -$$T*****rcN|j|z|_|ddS)zFeed data to the parser. Call this as often as you want, with as little or as much text as you want (may include '\n'). rN)rgoaheadr datas r feedzHTMLParser.feedgs% |d*  Qrc0|ddS)zHandle any buffered data.N)rrs r closezHTMLParser.closeps QrNc|jS)z)Return full source of start tag: '<...>'.)_HTMLParser__starttag_textrs r get_starttag_textzHTMLParser.get_starttag_textvs ##rc||_tjd|jztj|_dS)Nz )lowerrrecompileIr)r elems r set_cdata_modezHTMLParser.set_cdata_modezs4**,,:nt&FMMrc,t|_d|_dSN)rrrrs r clear_cdata_modezHTMLParser.clear_cdata_mode~s-rc |j}d}t|}||kr#|jr}|jsv|d|}|dkrY|dt ||dz }|dkr*tjd ||sn|}n=|j ||}|r| }n |jrnd|}||krV|jr2|js+| t|||n| ||||||}||krn|j}|d|rt ||r||} n|d|r||} n|d|r||} nj|d|r||} nH|d |r||} n&|d z|kr| d|d z} nn| dkr|sn|d |d z} | dkr%|d|d z} | dkr|d z} n| d z } |jr2|js+| t||| n| ||| ||| }n-|d |rt.||}|rq|d d} || |} |d| d z s| d z } ||| }d||dvr9| |||d z|||d z}nS|d|r5t6||}|rj|d } || |} |d| d z s| d z } ||| }kt:||}|rX|rU|||dkr5|} | |kr|} |||d z}nJ|d z|kr/| d|||d z}n nJd||k#|ry||krs|jsl|jr2|js+| t|||n| ||||||}||d|_dS)Nr<&"z[\s;]>D))$/// A%:c1Q3//"Eq!,,A"((!445u{{}} ;;!IIKK66 !A NN1a!e44!eq[[$$S)))q!a%00AA55555S!eeV  %1q555$ /T_ /  '!A#,!7!78888  1...q!$$Aqrr{ rc|j}|||dzdks Jd|||dzdkr||S|||dzdkr||S|||dzd krF|d |dz}|d krd S|||dz||d zS||S) Nr7r6z+unexpected call to parse_html_declaration()r4zV # #%%a(( ( QqsU^u $ $,,Q// / QqsU^ ! ! # #{ 2 2LLac**E{{r   WQqSY/ 0 0 07N++A.. .rr c|j}|||dzdvs Jd|d|dz}|dkrdS|r |||dz||dzS)Nr7)r6r3z"unexpected call to parse_comment()rr8r )rr;handle_comment)r rQreportrposs r r]zHTMLParser.parse_bogus_comments,q1u~---1B---ll3!$$ "992  2   !C 0 1 1 1Qwrc|j}|||dzdks Jdt||dz}|sdS|}|||dz||}|S)Nr7r5zunexpected call to parse_pi()r8)rpicloser>r? handle_pirM)r rQrrDrSs r rHzHTMLParser.parse_pi s,q1u~%%%'F%%%w!,, 2 KKMM wqsAv''' IIKKrcVd|_||}|dkr|S|j}||||_g}t||dz}|s Jd|}|dx|_}||krt||}|sn|ddd\} } } | sd} nI| dddcxkr| ddks"n| dddcxkr| ddkr nn | dd} | rt| } | | | f|}||k||| } | d vr| ||||S| d r|||n4|||||jvr|||S) Nrr z#unexpected call to parse_starttag()r7rY'r8")r/>ri)r#check_for_whole_start_tagrtagfind_tolerantrDrMrKr&rattrfind_tolerantrappendstripr@endswithhandle_startendtaghandle_starttagCDATA_CONTENT_ELEMENTSr+) r rQendposrattrsrDrUtagmattrnamerest attrvaluerMs r rEzHTMLParser.parse_starttag,sj#//22 A::M,&qx0 &&w!44;;;;;; IIKK"[[^^11333 s&jj!''33A ()1a(8(8 %HdI , 2A2$8888)BCC.88882A2#777723377777%adO  0$Y// LL(..**I6 7 7 7A&jjah%%'' k ! !   WQvX. / / /M <<   )  # #C / / / /  e , , ,d111##C((( rc|j}t||}|r|}|||dz}|dkr|dzS|dkr@|d|r|dzS|d|rdS||kr|S|dzS|dkrdS|dvrdS||kr|S|dzSt d ) Nr r/rir7r8rz6abcdefghijklmnopqrstuvwxyz=/ABCDEFGHIJKLMNOPQRSTUVWXYZzwe should not get here!)rlocatestarttagend_tolerantrDrMrBAssertionError)r rQrrvrSnexts r rjz$HTMLParser.check_for_whole_start_tagXs, & , ,Wa 8 8  A1QqS5>Ds{{1u s{{%%dA..!q5L%%c1--2q55Hq5Lrzzr566r1uu1u 6777rc|j}|||dzdks Jdt||dz}|sdS|}t||}|s|j|||||St||dz}|s+|||dzdkr|dzS| |S| d }| d|}| ||dzS| d }|j*||jkr|||||S| |||S) Nr7r3zunexpected call to parse_endtagr r8rYzr)r endendtagr>rM endtagfindrDrr@rkr]rKr&r; handle_endtagr.)r rQrrDr^ namematchtagnamer*s r rFzHTMLParser.parse_endtagzs,q1u~%%%'H%%%  !A#.. 2   !,, *  5!1222 (..w!<U**Q3J33A666ooa((..00G LLimmoo66E   w ' ' '7N{{1~~##%% ? &t&&  5!1222  4     rc\|||||dSr-)rqrr rurts r rpzHTMLParser.handle_startendtags2 S%((( 3rcdSr-rs r rqzHTMLParser.handle_starttag rcdSr-r)r rus r rzHTMLParser.handle_endtagrrcdSr-rr rVs r rLzHTMLParser.handle_charrefrrcdSr-rrs r rOzHTMLParser.handle_entityrefrrcdSr-rrs r r@zHTMLParser.handle_datarrcdSr-rrs r r`zHTMLParser.handle_commentrrcdSr-r)r decls r r\zHTMLParser.handle_declrrcdSr-rrs r rezHTMLParser.handle_pirrcdSr-rrs r unknown_declzHTMLParser.unknown_declrr)r )__name__ __module__ __qualname____doc__rrrr rr!r#r$r+r.rrIr]rHrErjrFrprqrrLrOr@r`r\rerrrr rr>s*1+/+++O$$$NNNu#u#u#t///*       (((X888D%%%P                                r)rr'rhtmlr__all__r(rrPrNrJrCrd commentcloserkrlVERBOSEr|rrrrrrr rsm""  . RZ'' RZ % % BJ> ? ? "*@ A Arz+&& "*S//rz)$$ 2:LMMBJ=>>(RZ)Z BJsOO RZ> ? ? I I I I I 'I I I I I r