File | /project/perl/lib/HTML/Parser.pm |
Statements Executed | 9236 |
Statement Execution Time | 179ms |
Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
---|---|---|---|---|---|
461 | 1 | 1 | 97.4ms | 142ms | init | HTML::Parser::
2766 | 6 | 2 | 25.5ms | 25.5ms | handler (xsub) | HTML::Parser::
461 | 1 | 1 | 18.0ms | 160ms | new | HTML::Parser::
461 | 1 | 2 | 7.42ms | 7.42ms | _alloc_pstate (xsub) | HTML::Parser::
922 | 1 | 2 | 6.65ms | 6.65ms | CORE:match (opcode) | HTML::Parser::
461 | 1 | 2 | 5.15ms | 5.15ms | ignore_elements (xsub) | HTML::Parser::
7 | 1 | 2 | 283µs | 986µs | parse (xsub) | HTML::Parser::
7 | 1 | 2 | 48µs | 48µs | eof (xsub) | HTML::Parser::
0 | 0 | 0 | 0s | 0s | BEGIN | HTML::Parser::
0 | 0 | 0 | 0s | 0s | __ANON__[:54] | HTML::Parser::
0 | 0 | 0 | 0s | 0s | __ANON__[:60] | HTML::Parser::
0 | 0 | 0 | 0s | 0s | netscape_buggy_comment | HTML::Parser::
0 | 0 | 0 | 0s | 0s | parse_file | HTML::Parser::
0 | 0 | 0 | 0s | 0s | text | HTML::Parser::
Line | Statements |
Time on line |
Calls | Time in subs |
Code |
---|---|---|---|---|---|
1 | package HTML::Parser; | ||||
2 | |||||
3 | # Copyright 1996-2007, Gisle Aas. | ||||
4 | # Copyright 1999-2000, Michael A. Chase. | ||||
5 | # | ||||
6 | # This library is free software; you can redistribute it and/or | ||||
7 | # modify it under the same terms as Perl itself. | ||||
8 | |||||
9 | 3 | 107µs | 1 | 27µs | use strict; # spent 27µs making 1 call to strict::import |
10 | 3 | 1.41ms | 1 | 240µs | use vars qw($VERSION @ISA); # spent 240µs making 1 call to vars::import |
11 | |||||
12 | 1 | 6µs | $VERSION = '3.56'; # $Date: 2007/01/12 09:18:31 $ | ||
13 | |||||
14 | 1 | 6µs | require HTML::Entities; | ||
15 | |||||
16 | 1 | 4µs | require XSLoader; | ||
17 | 1 | 16.9ms | 1 | 16.8ms | XSLoader::load('HTML::Parser', $VERSION); # spent 16.8ms making 1 call to XSLoader::load |
18 | |||||
19 | sub new | ||||
20 | # spent 160ms (18.0+142) within HTML::Parser::new which was called 461 times, avg 347µs/call:
# 461 times (18.0ms+142ms) by HTML::HeadParser::new at line 103 of HTML/HeadParser.pm, avg 347µs/call | ||||
21 | 461 | 2.64ms | my $class = shift; | ||
22 | 461 | 3.64ms | my $self = bless {}, $class; | ||
23 | 461 | 11.2ms | 461 | 142ms | return $self->init(@_); # spent 142ms making 461 calls to HTML::Parser::init, avg 308µs/call |
24 | } | ||||
25 | |||||
26 | |||||
27 | sub init | ||||
28 | # spent 142ms (97.4+44.7) within HTML::Parser::init which was called 461 times, avg 308µs/call:
# 461 times (97.4ms+44.7ms) by HTML::Parser::new at line 23, avg 308µs/call | ||||
29 | 461 | 2.18ms | my $self = shift; | ||
30 | 461 | 14.8ms | 461 | 7.42ms | $self->_alloc_pstate; # spent 7.42ms making 461 calls to HTML::Parser::_alloc_pstate, avg 16µs/call |
31 | |||||
32 | 461 | 4.63ms | my %arg = @_; | ||
33 | 461 | 3.42ms | my $api_version = delete $arg{api_version} || (@_ ? 3 : 2); | ||
34 | 461 | 2.29ms | if ($api_version >= 4) { | ||
35 | require Carp; | ||||
36 | Carp::croak("API version $api_version not supported " . | ||||
37 | "by HTML::Parser $VERSION"); | ||||
38 | } | ||||
39 | |||||
40 | 461 | 4.52ms | if ($api_version < 3) { | ||
41 | # Set up method callbacks compatible with HTML-Parser-2.xx | ||||
42 | 461 | 12.6ms | 461 | 5.89ms | $self->handler(text => "text", "self,text,is_cdata"); # spent 5.89ms making 461 calls to HTML::Parser::handler, avg 13µs/call |
43 | 461 | 10.3ms | 461 | 4.13ms | $self->handler(end => "end", "self,tagname,text"); # spent 4.13ms making 461 calls to HTML::Parser::handler, avg 9µs/call |
44 | 461 | 9.77ms | 461 | 3.98ms | $self->handler(process => "process", "self,token0,text"); # spent 3.98ms making 461 calls to HTML::Parser::handler, avg 9µs/call |
45 | 461 | 10.3ms | 461 | 4.47ms | $self->handler(start => "start", # spent 4.47ms making 461 calls to HTML::Parser::handler, avg 10µs/call |
46 | "self,tagname,attr,attrseq,text"); | ||||
47 | |||||
48 | $self->handler(comment => | ||||
49 | sub { | ||||
50 | my($self, $tokens) = @_; | ||||
51 | for (@$tokens) { | ||||
52 | $self->comment($_); | ||||
53 | } | ||||
54 | 461 | 10.0ms | 461 | 3.52ms | }, "self,tokens"); # spent 3.52ms making 461 calls to HTML::Parser::handler, avg 8µs/call |
55 | |||||
56 | $self->handler(declaration => | ||||
57 | sub { | ||||
58 | my $self = shift; | ||||
59 | $self->declaration(substr($_[0], 2, -1)); | ||||
60 | 461 | 9.92ms | 461 | 3.48ms | }, "self,text"); # spent 3.48ms making 461 calls to HTML::Parser::handler, avg 8µs/call |
61 | } | ||||
62 | |||||
63 | 461 | 2.61ms | if (my $h = delete $arg{handlers}) { | ||
64 | $h = {@$h} if ref($h) eq "ARRAY"; | ||||
65 | while (my($event, $cb) = each %$h) { | ||||
66 | $self->handler($event => @$cb); | ||||
67 | } | ||||
68 | } | ||||
69 | |||||
70 | # In the end we try to assume plain attribute or handler | ||||
71 | 461 | 6.01ms | while (my($option, $val) = each %arg) { | ||
72 | 461 | 20.5ms | 922 | 6.65ms | if ($option =~ /^(\w+)_h$/) { # spent 6.65ms making 922 calls to HTML::Parser::CORE:match, avg 7µs/call |
73 | $self->handler($1 => @$val); | ||||
74 | } | ||||
75 | elsif ($option =~ /^(text|start|end|process|declaration|comment)$/) { | ||||
76 | require Carp; | ||||
77 | Carp::croak("Bad constructor option '$option'"); | ||||
78 | } | ||||
79 | else { | ||||
80 | 461 | 12.2ms | 461 | 5.15ms | $self->$option($val); # spent 5.15ms making 461 calls to HTML::Parser::ignore_elements, avg 11µs/call |
81 | } | ||||
82 | } | ||||
83 | |||||
84 | 461 | 6.59ms | return $self; | ||
85 | } | ||||
86 | |||||
87 | |||||
88 | sub parse_file | ||||
89 | { | ||||
90 | my($self, $file) = @_; | ||||
91 | my $opened; | ||||
92 | if (!ref($file) && ref(\$file) ne "GLOB") { | ||||
93 | # Assume $file is a filename | ||||
94 | local(*F); | ||||
95 | open(F, $file) || return undef; | ||||
96 | binmode(F); # should we? good for byte counts | ||||
97 | $opened++; | ||||
98 | $file = *F; | ||||
99 | } | ||||
100 | my $chunk = ''; | ||||
101 | while (read($file, $chunk, 512)) { | ||||
102 | $self->parse($chunk) || last; | ||||
103 | } | ||||
104 | close($file) if $opened; | ||||
105 | $self->eof; | ||||
106 | } | ||||
107 | |||||
108 | |||||
109 | sub netscape_buggy_comment # legacy | ||||
110 | { | ||||
111 | my $self = shift; | ||||
112 | require Carp; | ||||
113 | Carp::carp("netscape_buggy_comment() is deprecated. " . | ||||
114 | "Please use the strict_comment() method instead"); | ||||
115 | my $old = !$self->strict_comment; | ||||
116 | $self->strict_comment(!shift) if @_; | ||||
117 | return $old; | ||||
118 | } | ||||
119 | |||||
120 | # set up method stubs | ||||
121 | sub text { } | ||||
122 | 1 | 9µs | *start = \&text; | ||
123 | 1 | 5µs | *end = \&text; | ||
124 | 1 | 6µs | *comment = \&text; | ||
125 | 1 | 4µs | *declaration = \&text; | ||
126 | 1 | 5µs | *process = \&text; | ||
127 | |||||
128 | 1 | 26µs | 1; | ||
129 | |||||
130 | __END__ | ||||
131 | |||||
132 | |||||
133 | =head1 NAME | ||||
134 | |||||
135 | HTML::Parser - HTML parser class | ||||
136 | |||||
137 | =head1 SYNOPSIS | ||||
138 | |||||
139 | use HTML::Parser (); | ||||
140 | |||||
141 | # Create parser object | ||||
142 | $p = HTML::Parser->new( api_version => 3, | ||||
143 | start_h => [\&start, "tagname, attr"], | ||||
144 | end_h => [\&end, "tagname"], | ||||
145 | marked_sections => 1, | ||||
146 | ); | ||||
147 | |||||
148 | # Parse document text chunk by chunk | ||||
149 | $p->parse($chunk1); | ||||
150 | $p->parse($chunk2); | ||||
151 | #... | ||||
152 | $p->eof; # signal end of document | ||||
153 | |||||
154 | # Parse directly from file | ||||
155 | $p->parse_file("foo.html"); | ||||
156 | # or | ||||
157 | open(my $fh, "<:utf8", "foo.html") || die; | ||||
158 | $p->parse_file($fh); | ||||
159 | |||||
160 | =head1 DESCRIPTION | ||||
161 | |||||
162 | Objects of the C<HTML::Parser> class will recognize markup and | ||||
163 | separate it from plain text (alias data content) in HTML | ||||
164 | documents. As different kinds of markup and text are recognized, the | ||||
165 | corresponding event handlers are invoked. | ||||
166 | |||||
167 | C<HTML::Parser> is not a generic SGML parser. We have tried to | ||||
168 | make it able to deal with the HTML that is actually "out there", and | ||||
169 | it normally parses as closely as possible to the way the popular web | ||||
170 | browsers do it instead of strictly following one of the many HTML | ||||
171 | specifications from W3C. Where there is disagreement, there is often | ||||
172 | an option that you can enable to get the official behaviour. | ||||
173 | |||||
174 | The document to be parsed may be supplied in arbitrary chunks. This | ||||
175 | makes on-the-fly parsing as documents are received from the network | ||||
176 | possible. | ||||
177 | |||||
178 | If event driven parsing does not feel right for your application, you | ||||
179 | might want to use C<HTML::PullParser>. This is an C<HTML::Parser> | ||||
180 | subclass that allows a more conventional program structure. | ||||
181 | |||||
182 | |||||
183 | =head1 METHODS | ||||
184 | |||||
185 | The following method is used to construct a new C<HTML::Parser> object: | ||||
186 | |||||
187 | =over | ||||
188 | |||||
189 | =item $p = HTML::Parser->new( %options_and_handlers ) | ||||
190 | |||||
191 | This class method creates a new C<HTML::Parser> object and | ||||
192 | returns it. Key/value argument pairs may be provided to assign event | ||||
193 | handlers or initialize parser options. The handlers and parser | ||||
194 | options can also be set or modified later by the method calls described below. | ||||
195 | |||||
196 | If a top level key is in the form "<event>_h" (e.g., "text_h") then it | ||||
197 | assigns a handler to that event, otherwise it initializes a parser | ||||
198 | option. The event handler specification value must be an array | ||||
199 | reference. Multiple handlers may also be assigned with the 'handlers | ||||
200 | => [%handlers]' option. See examples below. | ||||
201 | |||||
202 | If new() is called without any arguments, it will create a parser that | ||||
203 | uses callback methods compatible with version 2 of C<HTML::Parser>. | ||||
204 | See the section on "version 2 compatibility" below for details. | ||||
205 | |||||
206 | The special constructor option 'api_version => 2' can be used to | ||||
207 | initialize version 2 callbacks while still setting other options and | ||||
208 | handlers. The 'api_version => 3' option can be used if you don't want | ||||
209 | to set any options and don't want to fall back to v2 compatible | ||||
210 | mode. | ||||
211 | |||||
212 | Examples: | ||||
213 | |||||
214 | $p = HTML::Parser->new(api_version => 3, | ||||
215 | text_h => [ sub {...}, "dtext" ]); | ||||
216 | |||||
217 | This creates a new parser object with a text event handler subroutine | ||||
218 | that receives the original text with general entities decoded. | ||||
219 | |||||
220 | $p = HTML::Parser->new(api_version => 3, | ||||
221 | start_h => [ 'my_start', "self,tokens" ]); | ||||
222 | |||||
223 | This creates a new parser object with a start event handler method | ||||
224 | that receives the $p and the tokens array. | ||||
225 | |||||
226 | $p = HTML::Parser->new(api_version => 3, | ||||
227 | handlers => { text => [\@array, "event,text"], | ||||
228 | comment => [\@array, "event,text"], | ||||
229 | }); | ||||
230 | |||||
231 | This creates a new parser object that stores the event type and the | ||||
232 | original text in @array for text and comment events. | ||||
233 | |||||
234 | =back | ||||
235 | |||||
236 | The following methods feed the HTML document | ||||
237 | to the C<HTML::Parser> object: | ||||
238 | |||||
239 | =over | ||||
240 | |||||
241 | =item $p->parse( $string ) | ||||
242 | |||||
243 | Parse $string as the next chunk of the HTML document. The return | ||||
244 | value is normally a reference to the parser object (i.e. $p). | ||||
245 | Handlers invoked should not attempt to modify the $string in-place until | ||||
246 | $p->parse returns. | ||||
247 | |||||
248 | If an invoked event handler aborts parsing by calling $p->eof, then | ||||
249 | $p->parse() will return a FALSE value. | ||||
250 | |||||
251 | =item $p->parse( $code_ref ) | ||||
252 | |||||
253 | If a code reference is passed as the argument to be parsed, then the | ||||
254 | chunks to be parsed are obtained by invoking this function repeatedly. | ||||
255 | Parsing continues until the function returns an empty (or undefined) | ||||
256 | result. When this happens $p->eof is automatically signaled. | ||||
257 | |||||
258 | Parsing will also abort if one of the event handlers calls $p->eof. | ||||
259 | |||||
260 | The effect of this is the same as: | ||||
261 | |||||
262 | while (1) { | ||||
263 | my $chunk = &$code_ref(); | ||||
264 | if (!defined($chunk) || !length($chunk)) { | ||||
265 | $p->eof; | ||||
266 | return $p; | ||||
267 | } | ||||
268 | $p->parse($chunk) || return undef; | ||||
269 | } | ||||
270 | |||||
271 | But it is more efficient as this loop runs internally in XS code. | ||||
272 | |||||
273 | =item $p->parse_file( $file ) | ||||
274 | |||||
275 | Parse text directly from a file. The $file argument can be a | ||||
276 | filename, an open file handle, or a reference to an open file | ||||
277 | handle. | ||||
278 | |||||
279 | If $file contains a filename and the file can't be opened, then the | ||||
280 | method returns an undefined value and $! tells why it failed. | ||||
281 | Otherwise the return value is a reference to the parser object. | ||||
282 | |||||
283 | If a file handle is passed as the $file argument, then the file will | ||||
284 | normally be read until EOF, but not closed. | ||||
285 | |||||
286 | If an invoked event handler aborts parsing by calling $p->eof, | ||||
287 | then $p->parse_file() may not have read the entire file. | ||||
288 | |||||
289 | On systems with multi-byte line terminators, the values passed for the | ||||
290 | offset and length argspecs may be too low if parse_file() is called on | ||||
291 | a file handle that is not in binary mode. | ||||
292 | |||||
293 | If a filename is passed in, then parse_file() will open the file in | ||||
294 | binary mode. | ||||
295 | |||||
296 | =item $p->eof | ||||
297 | |||||
298 | Signals the end of the HTML document. Calling the $p->eof method | ||||
299 | outside a handler callback will flush any remaining buffered text | ||||
300 | (which triggers the C<text> event if there is any remaining text). | ||||
301 | |||||
302 | Calling $p->eof inside a handler will terminate parsing at that point | ||||
303 | and cause $p->parse to return a FALSE value. This also terminates | ||||
304 | parsing by $p->parse_file(). | ||||
305 | |||||
306 | After $p->eof has been called, the parse() and parse_file() methods | ||||
307 | can be invoked to feed new documents with the parser object. | ||||
308 | |||||
309 | The return value from eof() is a reference to the parser object. | ||||
310 | |||||
311 | =back | ||||
312 | |||||
313 | |||||
314 | Most parser options are controlled by boolean attributes. | ||||
315 | Each boolean attribute is enabled by calling the corresponding method | ||||
316 | with a TRUE argument and disabled with a FALSE argument. The | ||||
317 | attribute value is left unchanged if no argument is given. The return | ||||
318 | value from each method is the old attribute value. | ||||
319 | |||||
320 | Methods that can be used to get and/or set parser options are: | ||||
321 | |||||
322 | =over | ||||
323 | |||||
324 | =item $p->attr_encoded | ||||
325 | |||||
326 | =item $p->attr_encoded( $bool ) | ||||
327 | |||||
328 | By default, the C<attr> and C<@attr> argspecs will have general | ||||
329 | entities for attribute values decoded. Enabling this attribute leaves | ||||
330 | entities alone. | ||||
331 | |||||
332 | =item $p->boolean_attribute_value( $val ) | ||||
333 | |||||
334 | This method sets the value reported for boolean attributes inside HTML | ||||
335 | start tags. By default, the name of the attribute is also used as its | ||||
336 | value. This affects the values reported for C<tokens> and C<attr> | ||||
337 | argspecs. | ||||
338 | |||||
339 | =item $p->case_sensitive | ||||
340 | |||||
341 | =item $p->case_sensitive( $bool ) | ||||
342 | |||||
343 | By default, tagnames and attribute names are down-cased. Enabling this | ||||
344 | attribute leaves them as found in the HTML source document. | ||||
345 | |||||
346 | =item $p->closing_plaintext | ||||
347 | |||||
348 | =item $p->closing_plaintext( $bool ) | ||||
349 | |||||
350 | By default, the "plaintext" element can never be closed. Everything up to | ||||
351 | the end of the document is parsed in CDATA mode. This historical | ||||
352 | behaviour is what at least MSIE does. Enabling this attribute makes the | ||||
353 | closing "</plaintext>" tag effective and the parsing process will resume | ||||
354 | after seeing this tag. This emulates gecko-based browsers. | ||||
355 | |||||
356 | =item $p->empty_element_tags | ||||
357 | |||||
358 | =item $p->empty_element_tags( $bool ) | ||||
359 | |||||
360 | By default, empty element tags are not recognized as such and the "/" | ||||
361 | before ">" is just treated like a normal name character (unless | ||||
362 | C<strict_names> is enabled). Enabling this attribute makes | ||||
363 | C<HTML::Parser> recognize these tags. | ||||
364 | |||||
365 | Empty element tags look like start tags, but end with the character | ||||
366 | sequence "/>" instead of ">". When recognized by C<HTML::Parser> they | ||||
367 | cause an artificial end event in addition to the start event. The | ||||
368 | C<text> for the artificial end event will be empty and the C<tokenpos> | ||||
369 | array will be undefined even though the token array will have one | ||||
370 | element containing the tag name. | ||||
371 | |||||
372 | =item $p->marked_sections | ||||
373 | |||||
374 | =item $p->marked_sections( $bool ) | ||||
375 | |||||
376 | By default, section markings like <![CDATA[...]]> are treated like | ||||
377 | ordinary text. When this attribute is enabled section markings are | ||||
378 | honoured. | ||||
379 | |||||
380 | There are currently no events associated with the marked section | ||||
381 | markup, but the text can be returned as C<skipped_text>. | ||||
382 | |||||
383 | =item $p->strict_comment | ||||
384 | |||||
385 | =item $p->strict_comment( $bool ) | ||||
386 | |||||
387 | By default, comments are terminated by the first occurrence of "-->". | ||||
388 | This is the behaviour of most popular browsers (like Mozilla, Opera and | ||||
389 | MSIE), but it is not correct according to the official HTML | ||||
390 | standard. Officially, you need an even number of "--" tokens before | ||||
391 | the closing ">" is recognized and there may not be anything but | ||||
392 | whitespace between an even and an odd "--". | ||||
393 | |||||
394 | The official behaviour is enabled by enabling this attribute. | ||||
395 | |||||
396 | Enabling of 'strict_comment' also disables recognizing these forms as | ||||
397 | comments: | ||||
398 | |||||
399 | </ comment> | ||||
400 | <! comment> | ||||
401 | |||||
402 | |||||
403 | =item $p->strict_end | ||||
404 | |||||
405 | =item $p->strict_end( $bool ) | ||||
406 | |||||
407 | By default, attributes and other junk are allowed to be present on end tags in a | ||||
408 | manner that emulates MSIE's behaviour. | ||||
409 | |||||
410 | The official behaviour is enabled with this attribute. If enabled, | ||||
411 | only whitespace is allowed between the tagname and the final ">". | ||||
412 | |||||
413 | =item $p->strict_names | ||||
414 | |||||
415 | =item $p->strict_names( $bool ) | ||||
416 | |||||
417 | By default, almost anything is allowed in tag and attribute names. | ||||
418 | This is the behaviour of most popular browsers and allows us to parse | ||||
419 | some broken tags with invalid attribute values like: | ||||
420 | |||||
421 | <IMG SRC=newprevlstGr.gif ALT=[PREV LIST] BORDER=0> | ||||
422 | |||||
423 | By default, "LIST]" is parsed as a boolean attribute, not as | ||||
424 | part of the ALT value as was clearly intended. This is also what | ||||
425 | Mozilla sees. | ||||
426 | |||||
427 | The official behaviour is enabled by enabling this attribute. If | ||||
428 | enabled, it will cause the tag above to be reported as text | ||||
429 | since "LIST]" is not a legal attribute name. | ||||
430 | |||||
431 | =item $p->unbroken_text | ||||
432 | |||||
433 | =item $p->unbroken_text( $bool ) | ||||
434 | |||||
435 | By default, blocks of text are given to the text handler as soon as | ||||
436 | possible (but the parser takes care always to break text at a | ||||
437 | boundary between whitespace and non-whitespace so single words and | ||||
438 | entities can always be decoded safely). This might create breaks that | ||||
439 | make it hard to do transformations on the text. When this attribute is | ||||
440 | enabled, blocks of text are always reported in one piece. This will | ||||
441 | delay the text event until the following (non-text) event has been | ||||
442 | recognized by the parser. | ||||
443 | |||||
444 | Note that the C<offset> argspec will give you the offset of the first | ||||
445 | segment of text and C<length> is the combined length of the segments. | ||||
446 | Since there might be ignored tags in between, these numbers can't be | ||||
447 | used to directly index in the original document file. | ||||
448 | |||||
449 | =item $p->utf8_mode | ||||
450 | |||||
451 | =item $p->utf8_mode( $bool ) | ||||
452 | |||||
453 | Enable this option when parsing raw undecoded UTF-8. This tells the | ||||
454 | parser that the entities expanded for strings reported by C<attr>, | ||||
455 | C<@attr> and C<dtext> should be expanded as decoded UTF-8 so they end | ||||
456 | up compatible with the surrounding text. | ||||
457 | |||||
458 | If C<utf8_mode> is enabled then it is an error to pass strings | ||||
459 | containing characters with code above 255 to the parse() method, and | ||||
460 | the parse() method will croak if you try. | ||||
461 | |||||
462 | Example: The Unicode character "\x{2665}" is "\xE2\x99\xA5" when UTF-8 | ||||
463 | encoded. The character can also be represented by the entity | ||||
464 | "&hearts;" or "&#x2665;". If we feed the parser: | ||||
465 | |||||
466 | $p->parse("\xE2\x99\xA5&hearts;"); | ||||
467 | |||||
468 | then C<dtext> will be reported as "\xE2\x99\xA5\x{2665}" without | ||||
469 | C<utf8_mode> enabled, but as "\xE2\x99\xA5\xE2\x99\xA5" when enabled. | ||||
470 | The latter string is what you want. | ||||
471 | |||||
472 | This option is only available with perl-5.8 or better. | ||||
473 | |||||
474 | =item $p->xml_mode | ||||
475 | |||||
476 | =item $p->xml_mode( $bool ) | ||||
477 | |||||
478 | Enabling this attribute changes the parser to allow some XML | ||||
479 | constructs. This enables the behaviour controlled individually by | ||||
480 | the C<case_sensitive>, C<empty_element_tags>, C<strict_names> and | ||||
481 | C<xml_pic> attributes and also suppresses special treatment of | ||||
482 | elements that are parsed as CDATA for HTML. | ||||
483 | |||||
484 | =item $p->xml_pic | ||||
485 | |||||
486 | =item $p->xml_pic( $bool ) | ||||
487 | |||||
488 | By default, I<processing instructions> are terminated by ">". When | ||||
489 | this attribute is enabled, processing instructions are terminated by | ||||
490 | "?>" instead. | ||||
491 | |||||
492 | =back | ||||
493 | |||||
494 | As markup and text are recognized, handlers are invoked. The following | ||||
495 | method is used to set up handlers for different events: | ||||
496 | |||||
497 | =over | ||||
498 | |||||
499 | =item $p->handler( event => \&subroutine, $argspec ) | ||||
500 | |||||
501 | =item $p->handler( event => $method_name, $argspec ) | ||||
502 | |||||
503 | =item $p->handler( event => \@accum, $argspec ) | ||||
504 | |||||
505 | =item $p->handler( event => "" ); | ||||
506 | |||||
507 | =item $p->handler( event => undef ); | ||||
508 | |||||
509 | =item $p->handler( event ); | ||||
510 | |||||
511 | This method assigns a subroutine, method, or array to handle an event. | ||||
512 | |||||
513 | Event is one of C<text>, C<start>, C<end>, C<declaration>, C<comment>, | ||||
514 | C<process>, C<start_document>, C<end_document> or C<default>. | ||||
515 | |||||
516 | The C<\&subroutine> is a reference to a subroutine which is called to handle | ||||
517 | the event. | ||||
518 | |||||
519 | The C<$method_name> is the name of a method of $p which is called to handle | ||||
520 | the event. | ||||
521 | |||||
522 | The C<@accum> is an array that will hold the event information as | ||||
523 | sub-arrays. | ||||
524 | |||||
525 | If the second argument is "", the event is ignored. | ||||
526 | If it is undef, the default handler is invoked for the event. | ||||
527 | |||||
528 | The C<$argspec> is a string that describes the information to be reported | ||||
529 | for the event. Any requested information that does not apply to a | ||||
530 | specific event is passed as C<undef>. If argspec is omitted, then it | ||||
531 | is left unchanged. | ||||
532 | |||||
533 | The return value from $p->handler is the old callback routine or a | ||||
534 | reference to the accumulator array. | ||||
535 | |||||
536 | Any return values from handler callback routines/methods are always | ||||
537 | ignored. A handler callback can request parsing to be aborted by | ||||
538 | invoking the $p->eof method. A handler callback is not allowed to | ||||
539 | invoke the $p->parse() or $p->parse_file() method. An exception will | ||||
540 | be raised if it tries. | ||||
541 | |||||
542 | Examples: | ||||
543 | |||||
544 | $p->handler(start => "start", 'self, attr, attrseq, text' ); | ||||
545 | |||||
546 | This causes the "start" method of object $p to be called for 'start' events. | ||||
547 | The callback signature is $p->start(\%attr, \@attr_seq, $text). | ||||
548 | |||||
549 | $p->handler(start => \&start, 'attr, attrseq, text' ); | ||||
550 | |||||
551 | This causes subroutine start() to be called for 'start' events. | ||||
552 | The callback signature is start(\%attr, \@attr_seq, $text). | ||||
553 | |||||
554 | $p->handler(start => \@accum, '"S", attr, attrseq, text' ); | ||||
555 | |||||
556 | This causes 'start' event information to be saved in @accum. | ||||
557 | The array elements will be ['S', \%attr, \@attr_seq, $text]. | ||||
558 | |||||
559 | $p->handler(start => ""); | ||||
560 | |||||
561 | This causes 'start' events to be ignored. It also suppresses | ||||
562 | invocations of any default handler for start events. It is in most | ||||
563 | cases equivalent to $p->handler(start => sub {}), but is more | ||||
564 | efficient. It is different from the empty-sub-handler in that | ||||
565 | C<skipped_text> is not reset by it. | ||||
566 | |||||
567 | $p->handler(start => undef); | ||||
568 | |||||
569 | This causes no handler to be associated with start events. | ||||
570 | If there is a default handler it will be invoked. | ||||
571 | |||||
572 | =back | ||||
573 | |||||
574 | Filters based on tags can be set up to limit the number of events | ||||
575 | reported. The main bottleneck during parsing is often the huge number | ||||
576 | of callbacks made from the parser. Applying filters can improve | ||||
577 | performance significantly. | ||||
578 | |||||
579 | The following methods control filters: | ||||
580 | |||||
581 | =over | ||||
582 | |||||
583 | =item $p->ignore_elements( @tags ) | ||||
584 | |||||
585 | Both the C<start> event and the C<end> event as well as any events that | ||||
586 | would be reported in between are suppressed. The ignored elements can | ||||
587 | contain nested occurrences of themselves. Example: | ||||
588 | |||||
589 | $p->ignore_elements(qw(script style)); | ||||
590 | |||||
591 | The C<script> and C<style> tags will always nest properly since their | ||||
592 | content is parsed in CDATA mode. For most other tags | ||||
593 | C<ignore_elements> must be used with caution since HTML is often not | ||||
594 | I<well formed>. | ||||
595 | |||||
596 | =item $p->ignore_tags( @tags ) | ||||
597 | |||||
598 | Any C<start> and C<end> events involving any of the tags given are | ||||
599 | suppressed. To reset the filter (i.e. don't suppress any C<start> and | ||||
600 | C<end> events), call C<ignore_tags> without an argument. | ||||
601 | |||||
602 | =item $p->report_tags( @tags ) | ||||
603 | |||||
604 | Any C<start> and C<end> events involving any of the tags I<not> given | ||||
605 | are suppressed. To reset the filter (i.e. report all C<start> and | ||||
606 | C<end> events), call C<report_tags> without an argument. | ||||
607 | |||||
608 | =back | ||||
609 | |||||
610 | Internally, the system has two filter lists, one for C<report_tags> | ||||
611 | and one for C<ignore_tags>, and both filters are applied. This | ||||
612 | effectively gives C<ignore_tags> precedence over C<report_tags>. | ||||
613 | |||||
614 | Examples: | ||||
615 | |||||
616 | $p->ignore_tags(qw(style)); | ||||
617 | $p->report_tags(qw(script style)); | ||||
618 | |||||
619 | results in only C<script> events being reported. | ||||
620 | |||||
621 | =head2 Argspec | ||||
622 | |||||
623 | Argspec is a string containing a comma-separated list that describes | ||||
624 | the information reported by the event. The following argspec | ||||
625 | identifier names can be used: | ||||
626 | |||||
627 | =over | ||||
628 | |||||
629 | =item C<attr> | ||||
630 | |||||
631 | Attr causes a reference to a hash of attribute name/value pairs to be | ||||
632 | passed. | ||||
633 | |||||
634 | Boolean attributes' values are either the value set by | ||||
635 | $p->boolean_attribute_value, or the attribute name if no value has been | ||||
636 | set by $p->boolean_attribute_value. | ||||
637 | |||||
638 | This passes undef except for C<start> events. | ||||
639 | |||||
640 | Unless C<xml_mode> or C<case_sensitive> is enabled, the attribute | ||||
641 | names are forced to lower case. | ||||
642 | |||||
643 | General entities are decoded in the attribute values and | ||||
644 | one layer of matching quotes enclosing the attribute values is removed. | ||||
645 | |||||
646 | The Unicode character set is assumed for entity decoding. With Perl | ||||
647 | version 5.6 or earlier only the Latin-1 range is supported, and | ||||
648 | entities for characters outside the range 0..255 are left unchanged. | ||||
649 | |||||
650 | =item C<@attr> | ||||
651 | |||||
652 | Basically the same as C<attr>, but keys and values are passed as | ||||
653 | individual arguments and the original sequence of the attributes is | ||||
654 | kept. The parameters passed will be the same as the @attr calculated | ||||
655 | here: | ||||
656 | |||||
657 | @attr = map { $_ => $attr->{$_} } @$attrseq; | ||||
658 | |||||
659 | assuming $attr and $attrseq here are the hash and array passed as the | ||||
660 | result of C<attr> and C<attrseq> argspecs. | ||||
661 | |||||
662 | This passes no values for events besides C<start>. | ||||
663 | |||||
664 | =item C<attrseq> | ||||
665 | |||||
666 | Attrseq causes a reference to an array of attribute names to be | ||||
667 | passed. This can be useful if you want to walk the C<attr> hash in | ||||
668 | the original sequence. | ||||
669 | |||||
670 | This passes undef except for C<start> events. | ||||
671 | |||||
672 | Unless C<xml_mode> or C<case_sensitive> is enabled, the attribute | ||||
673 | names are forced to lower case. | ||||
674 | |||||
675 | =item C<column> | ||||
676 | |||||
677 | Column causes the column number of the start of the event to be passed. | ||||
678 | The first column on a line is 0. | ||||
679 | |||||
680 | =item C<dtext> | ||||
681 | |||||
682 | Dtext causes the decoded text to be passed. General entities are | ||||
683 | automatically decoded unless the event was inside a CDATA section or | ||||
684 | was between literal start and end tags (C<script>, C<style>, | ||||
685 | C<xmp>, and C<plaintext>). | ||||
686 | |||||
687 | The Unicode character set is assumed for entity decoding. With Perl | ||||
688 | version 5.6 or earlier only the Latin-1 range is supported, and | ||||
689 | entities for characters outside the range 0..255 are left unchanged. | ||||
690 | |||||
691 | This passes undef except for C<text> events. | ||||
692 | |||||
693 | =item C<event> | ||||
694 | |||||
695 | Event causes the event name to be passed. | ||||
696 | |||||
697 | The event name is one of C<text>, C<start>, C<end>, C<declaration>, | ||||
698 | C<comment>, C<process>, C<start_document> or C<end_document>. | ||||
699 | |||||
700 | =item C<is_cdata> | ||||
701 | |||||
702 | Is_cdata causes a TRUE value to be passed if the event is inside a CDATA | ||||
703 | section or between literal start and end tags (C<script>, | ||||
704 | C<style>, C<xmp>, and C<plaintext>). | ||||
705 | |||||
706 | If the flag is FALSE for a text event, then you should normally | ||||
707 | either use C<dtext> or decode the entities yourself before the text is | ||||
708 | processed further. | ||||
709 | |||||
710 | =item C<length> | ||||
711 | |||||
712 | Length causes the number of bytes of the source text of the event to | ||||
713 | be passed. | ||||
714 | |||||
715 | =item C<line> | ||||
716 | |||||
717 | Line causes the line number of the start of the event to be passed. | ||||
718 | The first line in the document is 1. Line counting doesn't start | ||||
719 | until at least one handler requests this value to be reported. | ||||
720 | |||||
721 | =item C<offset> | ||||
722 | |||||
723 | Offset causes the byte position in the HTML document of the start of | ||||
724 | the event to be passed. The first byte in the document has offset 0. | ||||
725 | |||||
726 | =item C<offset_end> | ||||
727 | |||||
728 | Offset_end causes the byte position in the HTML document of the end of | ||||
729 | the event to be passed. This is the same as C<offset> + C<length>. | ||||
730 | |||||
731 | =item C<self> | ||||
732 | |||||
733 | Self causes the current object to be passed to the handler. If the | ||||
734 | handler is a method, this must be the first element in the argspec. | ||||
735 | |||||
736 | An alternative to passing self as an argspec is to register closures | ||||
737 | that capture $self by themselves as handlers. Unfortunately this | ||||
738 | creates circular references which prevent the HTML::Parser object | ||||
739 | from being garbage collected. Using the C<self> argspec avoids this | ||||
740 | problem. | ||||
741 | |||||
742 | =item C<skipped_text> | ||||
743 | |||||
744 | Skipped_text returns the concatenated text of all the events that have | ||||
745 | been skipped since the last time an event was reported. Events might | ||||
746 | be skipped because no handler is registered for them or because some | ||||
747 | filter applies. Skipped text also includes marked section markup, | ||||
748 | since there are no events that can catch it. | ||||
749 | |||||
750 | If an C<"">-handler is registered for an event, then the text for this | ||||
751 | event is not included in C<skipped_text>. Skipped text both before | ||||
752 | and after the C<"">-event is included in the next reported | ||||
753 | C<skipped_text>. | ||||
754 | |||||
755 | =item C<tag> | ||||
756 | |||||
757 | Same as C<tagname>, but prefixed with "/" if it belongs to an C<end> | ||||
758 | event and "!" for a declaration. The C<tag> does not have any prefix | ||||
759 | for C<start> events, and is in this case identical to C<tagname>. | ||||
760 | |||||
761 | =item C<tagname> | ||||
762 | |||||
763 | This is the element name (or I<generic identifier> in SGML jargon) for | ||||
764 | start and end tags. Since HTML is case insensitive, this name is | ||||
765 | forced to lower case to ease string matching. | ||||
766 | |||||
767 | Since XML is case sensitive, the tagname case is not changed when | ||||
768 | C<xml_mode> is enabled. The same happens if the C<case_sensitive> attribute | ||||
769 | is set. | ||||
770 | |||||
771 | The declaration type of declaration elements is also passed as a tagname, | ||||
772 | even if that is a bit strange. | ||||
773 | In fact, in the current implementation tagname is | ||||
774 | identical to C<token0> except that the name may be forced to lower case. | ||||
775 | |||||
776 | =item C<token0> | ||||
777 | |||||
778 | Token0 causes the original text of the first token string to be | ||||
779 | passed. This should always be the same as $tokens->[0]. | ||||
780 | |||||
781 | For C<declaration> events, this is the declaration type. | ||||
782 | |||||
783 | For C<start> and C<end> events, this is the tag name. | ||||
784 | |||||
785 | For C<process> and non-strict C<comment> events, this is everything | ||||
786 | inside the tag. | ||||
787 | |||||
788 | This passes undef if there are no tokens in the event. | ||||
789 | |||||
790 | =item C<tokenpos> | ||||
791 | |||||
792 | Tokenpos causes a reference to an array of token positions to be | ||||
793 | passed. For each string that appears in C<tokens>, this array | ||||
794 | contains two numbers. The first number is the offset of the start of | ||||
795 | the token in the original C<text> and the second number is the length | ||||
796 | of the token. | ||||
797 | |||||
798 | Boolean attributes in a C<start> event will have (0,0) for the | ||||
799 | attribute value offset and length. | ||||
800 | |||||
801 | This passes undef if there are no tokens in the event (e.g., C<text>) | ||||
802 | and for artificial C<end> events triggered by empty element tags. | ||||
803 | |||||
804 | If you are using these offsets and lengths to modify C<text>, you | ||||
805 | should either work from right to left, or be very careful to calculate | ||||
806 | the changes to the offsets. | ||||
807 | |||||
808 | =item C<tokens> | ||||
809 | |||||
810 | Tokens causes a reference to an array of token strings to be passed. | ||||
811 | The strings are exactly as they were found in the original text, | ||||
812 | no decoding or case changes are applied. | ||||
813 | |||||
814 | For C<declaration> events, the array contains each word, comment, and | ||||
815 | delimited string starting with the declaration type. | ||||
816 | |||||
817 | For C<comment> events, this contains each sub-comment. If | ||||
818 | $p->strict_comment is disabled, there will be only one sub-comment. | ||||
819 | |||||
820 | For C<start> events, this contains the original tag name followed by | ||||
821 | the attribute name/value pairs. The values of boolean attributes will | ||||
822 | be either the value set by $p->boolean_attribute_value, or the | ||||
823 | attribute name if no value has been set by | ||||
824 | $p->boolean_attribute_value. | ||||
825 | |||||
826 | For C<end> events, this contains the original tag name (always one token). | ||||
827 | |||||
828 | For C<process> events, this contains the process instructions (always one | ||||
829 | token). | ||||
830 | |||||
831 | This passes C<undef> for C<text> events. | ||||
832 | |||||
833 | =item C<text> | ||||
834 | |||||
835 | Text causes the source text (including markup element delimiters) to be | ||||
836 | passed. | ||||
837 | |||||
838 | =item C<undef> | ||||
839 | |||||
840 | Pass an undefined value. Useful as padding where the same handler | ||||
841 | routine is registered for multiple events. | ||||
842 | |||||
843 | =item C<'...'> | ||||
844 | |||||
845 | A literal string of 0 to 255 characters enclosed | ||||
846 | in single (') or double (") quotes is passed as entered. | ||||
847 | |||||
848 | =back | ||||
849 | |||||
850 | The whole argspec string can be wrapped up in C<'@{...}'> to signal | ||||
851 | that the resulting event array should be flattened. This only makes a | ||||
852 | difference if an array reference is used as the handler target. | ||||
853 | Consider this example: | ||||
854 | |||||
855 | $p->handler(text => [], 'text'); | ||||
856 | $p->handler(text => [], '@{text}'); | ||||
857 | |||||
858 | With two text events; C<"foo">, C<"bar">; then the first example will end | ||||
859 | up with [["foo"], ["bar"]] and the second with ["foo", "bar"] in | ||||
860 | the handler target array. | ||||
861 | |||||
862 | |||||
863 | =head2 Events | ||||
864 | |||||
865 | Handlers for the following events can be registered: | ||||
866 | |||||
867 | =over | ||||
868 | |||||
869 | =item C<comment> | ||||
870 | |||||
871 | This event is triggered when a markup comment is recognized. | ||||
872 | |||||
873 | Example: | ||||
874 | |||||
875 | <!-- This is a comment -- -- So is this --> | ||||
876 | |||||
877 | =item C<declaration> | ||||
878 | |||||
879 | This event is triggered when a I<markup declaration> is recognized. | ||||
880 | |||||
881 | For typical HTML documents, the only declaration you are | ||||
882 | likely to find is <!DOCTYPE ...>. | ||||
883 | |||||
884 | Example: | ||||
885 | |||||
886 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" | ||||
887 | "http://www.w3.org/TR/html40/strict.dtd"> | ||||
888 | |||||
889 | DTDs inside <!DOCTYPE ...> will confuse HTML::Parser. | ||||
890 | |||||
891 | =item C<default> | ||||
892 | |||||
893 | This event is triggered for events that do not have a specific | ||||
894 | handler. You can set up a handler for this event to catch stuff you | ||||
895 | did not want to catch explicitly. | ||||
896 | |||||
897 | =item C<end> | ||||
898 | |||||
899 | This event is triggered when an end tag is recognized. | ||||
900 | |||||
901 | Example: | ||||
902 | |||||
903 | </A> | ||||
904 | |||||
905 | =item C<end_document> | ||||
906 | |||||
907 | This event is triggered when $p->eof is called and after any remaining | ||||
908 | text is flushed. There is no document text associated with this event. | ||||
909 | |||||
910 | =item C<process> | ||||
911 | |||||
912 | This event is triggered when processing instruction markup is | ||||
913 | recognized. | ||||
914 | |||||
915 | The format and content of processing instructions are system and | ||||
916 | application dependent. | ||||
917 | |||||
918 | Examples: | ||||
919 | |||||
920 | <? HTML processing instructions > | ||||
921 | <? XML processing instructions ?> | ||||
922 | |||||
923 | =item C<start> | ||||
924 | |||||
925 | This event is triggered when a start tag is recognized. | ||||
926 | |||||
927 | Example: | ||||
928 | |||||
929 | <A HREF="http://www.perl.com/"> | ||||
930 | |||||
931 | =item C<start_document> | ||||
932 | |||||
933 | This event is triggered before any other events for a new document. A | ||||
934 | handler for it can be used to initialize stuff. There is no document | ||||
935 | text associated with this event. | ||||
936 | |||||
937 | =item C<text> | ||||
938 | |||||
939 | This event is triggered when plain text (characters) is recognized. | ||||
940 | The text may contain multiple lines. A sequence of text may be broken | ||||
941 | between several text events unless $p->unbroken_text is enabled. | ||||
942 | |||||
943 | The parser will make sure that it does not break a word or a sequence | ||||
944 | of whitespace between two text events. | ||||
945 | |||||
946 | =back | ||||
947 | |||||
948 | =head2 Unicode | ||||
949 | |||||
950 | The C<HTML::Parser> can parse Unicode strings when running under | ||||
951 | perl-5.8 or better. If Unicode is passed to $p->parse() then chunks | ||||
952 | of Unicode will be reported to the handlers. The offset and length | ||||
953 | argspecs will also report their position in terms of characters. | ||||
954 | |||||
955 | It is safe to parse raw undecoded UTF-8 if you either avoid decoding | ||||
956 | entities and make sure to not use I<argspecs> that do, or enable the | ||||
957 | C<utf8_mode> for the parser. Parsing of undecoded UTF-8 might be | ||||
958 | useful when parsing from a file where you need the reported offsets | ||||
959 | and lengths to match the byte offsets in the file. | ||||
960 | |||||
961 | If a filename is passed to $p->parse_file() then the file will be read | ||||
962 | in binary mode. This will be fine if the file contains only ASCII or | ||||
963 | Latin-1 characters. If the file contains UTF-8 encoded text then care | ||||
964 | must be taken when decoding entities as described in the previous | ||||
965 | paragraph, but better is to open the file with the UTF-8 layer so that | ||||
966 | it is decoded properly: | ||||
967 | |||||
968 | open(my $fh, "<:utf8", "index.html") || die "...: $!"; | ||||
969 | $p->parse_file($fh); | ||||
970 | |||||
971 | If the file contains text encoded in a charset besides ASCII, Latin-1 | ||||
972 | or UTF-8 then decoding will always be needed. | ||||
973 | |||||
974 | =head1 VERSION 2 COMPATIBILITY | ||||
975 | |||||
976 | When an C<HTML::Parser> object is constructed with no arguments, a set | ||||
977 | of handlers is automatically provided that is compatible with the old | ||||
978 | HTML::Parser version 2 callback methods. | ||||
979 | |||||
980 | This is equivalent to the following method calls: | ||||
981 | |||||
982 | $p->handler(start => "start", "self, tagname, attr, attrseq, text"); | ||||
983 | $p->handler(end => "end", "self, tagname, text"); | ||||
984 | $p->handler(text => "text", "self, text, is_cdata"); | ||||
985 | $p->handler(process => "process", "self, token0, text"); | ||||
986 | $p->handler(comment => | ||||
987 | sub { | ||||
988 | my($self, $tokens) = @_; | ||||
989 | for (@$tokens) {$self->comment($_);}}, | ||||
990 | "self, tokens"); | ||||
991 | $p->handler(declaration => | ||||
992 | sub { | ||||
993 | my $self = shift; | ||||
994 | $self->declaration(substr($_[0], 2, -1));}, | ||||
995 | "self, text"); | ||||
996 | |||||
997 | Setting up these handlers can also be requested with the "api_version => | ||||
998 | 2" constructor option. | ||||
999 | |||||
1000 | =head1 SUBCLASSING | ||||
1001 | |||||
1002 | The C<HTML::Parser> class is subclassable. Parser objects are plain | ||||
1003 | hashes and C<HTML::Parser> reserves only hash keys that start with | ||||
1004 | "_hparser". The parser state can be set up by invoking the init() | ||||
1005 | method, which takes the same arguments as new(). | ||||
1006 | |||||
1007 | =head1 EXAMPLES | ||||
1008 | |||||
1009 | The first simple example shows how you might strip out comments from | ||||
1010 | an HTML document. We achieve this by setting up a comment handler that | ||||
1011 | does nothing and a default handler that will print out anything else: | ||||
1012 | |||||
1013 | use HTML::Parser; | ||||
1014 | HTML::Parser->new(default_h => [sub { print shift }, 'text'], | ||||
1015 | comment_h => [""], | ||||
1016 | )->parse_file(shift || die) || die $!; | ||||
1017 | |||||
1018 | An alternative implementation is: | ||||
1019 | |||||
1020 | use HTML::Parser; | ||||
1021 | HTML::Parser->new(end_document_h => [sub { print shift }, | ||||
1022 | 'skipped_text'], | ||||
1023 | comment_h => [""], | ||||
1024 | )->parse_file(shift || die) || die $!; | ||||
1025 | |||||
1026 | This will in most cases be much more efficient since only a single | ||||
1027 | callback will be made. | ||||
1028 | |||||
1029 | The next example prints out the text that is inside the <title> | ||||
1030 | element of an HTML document. Here we start by setting up a start | ||||
1031 | handler. When it sees the title start tag it enables a text handler | ||||
1032 | that prints any text found and an end handler that will terminate | ||||
1033 | parsing as soon as the title end tag is seen: | ||||
1034 | |||||
1035 | use HTML::Parser (); | ||||
1036 | |||||
1037 | sub start_handler | ||||
1038 | { | ||||
1039 | return if shift ne "title"; | ||||
1040 | my $self = shift; | ||||
1041 | $self->handler(text => sub { print shift }, "dtext"); | ||||
1042 | $self->handler(end => sub { shift->eof if shift eq "title"; }, | ||||
1043 | "tagname,self"); | ||||
1044 | } | ||||
1045 | |||||
1046 | my $p = HTML::Parser->new(api_version => 3); | ||||
1047 | $p->handler( start => \&start_handler, "tagname,self"); | ||||
1048 | $p->parse_file(shift || die) || die $!; | ||||
1049 | print "\n"; | ||||
1050 | |||||
1051 | More examples are found in the F<eg/> directory of the C<HTML-Parser> | ||||
1052 | distribution: the program C<hrefsub> shows how you can edit all links | ||||
1053 | found in a document; the program C<htextsub> shows how to edit the text only; the | ||||
1054 | program C<hstrip> shows how you can strip out certain tags/elements | ||||
1055 | and/or attributes; and the program C<htext> shows how to obtain the | ||||
1056 | plain text, but not any script/style content. | ||||
1057 | |||||
1058 | You can browse the F<eg/> directory online from the I<[Browse]> link on | ||||
1059 | the http://search.cpan.org/~gaas/HTML-Parser/ page. | ||||
1060 | |||||
1061 | =head1 BUGS | ||||
1062 | |||||
1063 | The <style> and <script> sections do not end with the first "</", but | ||||
1064 | need the complete corresponding end tag. The standard behaviour is | ||||
1065 | not really practical. | ||||
1066 | |||||
1067 | When the I<strict_comment> option is enabled, we still recognize | ||||
1068 | comments where there is something other than whitespace between even | ||||
1069 | and odd "--" markers. | ||||
1070 | |||||
1071 | Once $p->boolean_attribute_value has been set, there is no way to | ||||
1072 | restore the default behaviour. | ||||
1073 | |||||
1074 | There is currently no way to get both quote characters | ||||
1075 | into the same literal argspec. | ||||
1076 | |||||
1077 | Empty tags, e.g. "<>" and "</>", are not recognized. SGML allows them | ||||
1078 | to repeat the previous start tag or close the previous start tag | ||||
1079 | respectively. | ||||
1080 | |||||
1081 | NET tags, e.g. "<code/.../" are not recognized. This is SGML | ||||
1082 | shorthand for "<code>...</code>". | ||||
1083 | |||||
1084 | Unclosed start or end tags, e.g. "<tt<b>...</b</tt>" are not | ||||
1085 | recognized. | ||||
1086 | |||||
1087 | =head1 DIAGNOSTICS | ||||
1088 | |||||
1089 | The following messages may be produced by HTML::Parser. The notation | ||||
1090 | in this listing is the same as used in L<perldiag>: | ||||
1091 | |||||
1092 | =over | ||||
1093 | |||||
1094 | =item Not a reference to a hash | ||||
1095 | |||||
1096 | (F) The object blessed into or subclassed from HTML::Parser is not a | ||||
1097 | hash as required by the HTML::Parser methods. | ||||
1098 | |||||
1099 | =item Bad signature in parser state object at %p | ||||
1100 | |||||
1101 | (F) The _hparser_xs_state element does not refer to a valid state structure. | ||||
1102 | Something must have changed the internal value | ||||
1103 | stored in this hash element, or the memory has been overwritten. | ||||
1104 | |||||
1105 | =item _hparser_xs_state element is not a reference | ||||
1106 | |||||
1107 | (F) The _hparser_xs_state element has been destroyed. | ||||
1108 | |||||
1109 | =item Can't find '_hparser_xs_state' element in HTML::Parser hash | ||||
1110 | |||||
1111 | (F) The _hparser_xs_state element is missing from the parser hash. | ||||
1112 | It was either deleted, or not created when the object was created. | ||||
1113 | |||||
1114 | =item API version %s not supported by HTML::Parser %s | ||||
1115 | |||||
1116 | (F) The constructor option 'api_version' with an argument greater than | ||||
1117 | or equal to 4 is reserved for future extensions. | ||||
1118 | |||||
1119 | =item Bad constructor option '%s' | ||||
1120 | |||||
1121 | (F) An unknown constructor option key was passed to the new() or | ||||
1122 | init() methods. | ||||
1123 | |||||
1124 | =item Parse loop not allowed | ||||
1125 | |||||
1126 | (F) A handler invoked the parse() or parse_file() method. | ||||
1127 | This is not permitted. | ||||
1128 | |||||
1129 | =item marked sections not supported | ||||
1130 | |||||
1131 | (F) The $p->marked_sections() method was invoked in an HTML::Parser | ||||
1132 | module that was compiled without support for marked sections. | ||||
1133 | |||||
1134 | =item Unknown boolean attribute (%d) | ||||
1135 | |||||
1136 | (F) Something is wrong with the internal logic that set up aliases for | ||||
1137 | boolean attributes. | ||||
1138 | |||||
1139 | =item Only code or array references allowed as handler | ||||
1140 | |||||
1141 | (F) The second argument for $p->handler must be either a subroutine | ||||
1142 | reference, the name of a subroutine or method, or a reference to an | ||||
1143 | array. | ||||
1144 | |||||
1145 | =item No handler for %s events | ||||
1146 | |||||
1147 | (F) The first argument to $p->handler must be a valid event name; i.e. one | ||||
1148 | of "start", "end", "text", "process", "declaration", "comment", "default", "start_document" or "end_document". | ||||
1149 | |||||
1150 | =item Unrecognized identifier %s in argspec | ||||
1151 | |||||
1152 | (F) The identifier is not a known argspec name. | ||||
1153 | Use one of the names mentioned in the argspec section above. | ||||
1154 | |||||
1155 | =item Literal string is longer than 255 chars in argspec | ||||
1156 | |||||
1157 | (F) The current implementation limits the length of literals in | ||||
1158 | an argspec to 255 characters. Make the literal shorter. | ||||
1159 | |||||
1160 | =item Backslash reserved for literal string in argspec | ||||
1161 | |||||
1162 | (F) The backslash character "\" is not allowed in argspec literals. | ||||
1163 | It is reserved to permit quoting inside a literal in a later version. | ||||
1164 | |||||
1165 | =item Unterminated literal string in argspec | ||||
1166 | |||||
1167 | (F) The terminating quote character for a literal was not found. | ||||
1168 | |||||
1169 | =item Bad argspec (%s) | ||||
1170 | |||||
1171 | (F) Only identifier names, literals, spaces and commas | ||||
1172 | are allowed in argspecs. | ||||
1173 | |||||
1174 | =item Missing comma separator in argspec | ||||
1175 | |||||
1176 | (F) Identifiers in an argspec must be separated with ",". | ||||
1177 | |||||
1178 | =item Parsing of undecoded UTF-8 will give garbage when decoding entities | ||||
1179 | |||||
1180 | (W) The first chunk parsed appears to contain undecoded UTF-8 and one | ||||
1181 | or more argspecs that decode entities are used for the callback | ||||
1182 | handlers. | ||||
1183 | |||||
1184 | The result of decoding will be a mix of encoded and decoded characters | ||||
1185 | for any entities that expand to characters with code above 127. This | ||||
1186 | is not a good thing. | ||||
1187 | |||||
1188 | The solution is to use Encode::decode_utf8() on the data before | ||||
1189 | feeding it to $p->parse(). For $p->parse_file() pass a file that | ||||
1190 | has been opened in ":utf8" mode. | ||||
1191 | |||||
1192 | The parser can process raw undecoded UTF-8 sanely if the C<utf8_mode> | ||||
1193 | is enabled or if the "attr", "@attr" or "dtext" argspecs are avoided. | ||||
1194 | |||||
1195 | =item Parsing string decoded with wrong endianess | ||||
1196 | |||||
1197 | (W) The first character in the document is U+FFFE. This is not a | ||||
1198 | legal Unicode character but a byte swapped BOM. The result of parsing | ||||
1199 | will likely be garbage. | ||||
1200 | |||||
1201 | =item Parsing of undecoded UTF-32 | ||||
1202 | |||||
1203 | (W) The parser found the Unicode UTF-32 BOM signature at the start | ||||
1204 | of the document. The result of parsing will likely be garbage. | ||||
1205 | |||||
1206 | =item Parsing of undecoded UTF-16 | ||||
1207 | |||||
1208 | (W) The parser found the Unicode UTF-16 BOM signature at the start of | ||||
1209 | the document. The result of parsing will likely be garbage. | ||||
1210 | |||||
1211 | =back | ||||
1212 | |||||
1213 | =head1 SEE ALSO | ||||
1214 | |||||
1215 | L<HTML::Entities>, L<HTML::PullParser>, L<HTML::TokeParser>, L<HTML::HeadParser>, | ||||
1216 | L<HTML::LinkExtor>, L<HTML::Form> | ||||
1217 | |||||
1218 | L<HTML::TreeBuilder> (part of the I<HTML-Tree> distribution) | ||||
1219 | |||||
1220 | http://www.w3.org/TR/html4 | ||||
1221 | |||||
1222 | More information about marked sections and processing instructions may | ||||
1223 | be found at C<http://www.sgml.u-net.com/book/sgml-8.htm>. | ||||
1224 | |||||
1225 | =head1 COPYRIGHT | ||||
1226 | |||||
1227 | Copyright 1996-2007 Gisle Aas. All rights reserved. | ||||
1228 | Copyright 1999-2000 Michael A. Chase. All rights reserved. | ||||
1229 | |||||
1230 | This library is free software; you can redistribute it and/or | ||||
1231 | modify it under the same terms as Perl itself. | ||||
1232 | |||||
1233 | =cut | ||||
# spent 6.65ms within HTML::Parser::CORE:match which was called 922 times, avg 7µs/call:
# 922 times (6.65ms+0s) by HTML::Parser::init at line 72 of HTML/Parser.pm, avg 7µs/call | |||||
# spent 7.42ms within HTML::Parser::_alloc_pstate which was called 461 times, avg 16µs/call:
# 461 times (7.42ms+0s) by HTML::Parser::init at line 30 of HTML/Parser.pm, avg 16µs/call | |||||
# spent 48µs within HTML::Parser::eof which was called 7 times, avg 7µs/call:
# 7 times (48µs+0s) by HTML::HeadParser::text at line 216 of HTML/HeadParser.pm, avg 7µs/call | |||||
# spent 25.5ms within HTML::Parser::handler which was called 2766 times, avg 9µs/call:
# 461 times (5.89ms+0s) by HTML::Parser::init at line 42 of HTML/Parser.pm, avg 13µs/call
# 461 times (4.47ms+0s) by HTML::Parser::init at line 45 of HTML/Parser.pm, avg 10µs/call
# 461 times (4.13ms+0s) by HTML::Parser::init at line 43 of HTML/Parser.pm, avg 9µs/call
# 461 times (3.98ms+0s) by HTML::Parser::init at line 44 of HTML/Parser.pm, avg 9µs/call
# 461 times (3.52ms+0s) by HTML::Parser::init at line 54 of HTML/Parser.pm, avg 8µs/call
# 461 times (3.48ms+0s) by HTML::Parser::init at line 60 of HTML/Parser.pm, avg 8µs/call | |||||
# spent 5.15ms within HTML::Parser::ignore_elements which was called 461 times, avg 11µs/call:
# 461 times (5.15ms+0s) by HTML::Parser::init at line 80 of HTML/Parser.pm, avg 11µs/call | |||||
# spent 986µs (283+703) within HTML::Parser::parse which was called 7 times, avg 141µs/call:
# 7 times (283µs+703µs) by LWP::Protocol::collect at line 114 of LWP/Protocol.pm, avg 141µs/call |