| File | /project/perl/lib/HTML/Parser.pm |
| Statements Executed | 9236 |
| Statement Execution Time | 179ms |
| Calls | P | F | Exclusive Time | Inclusive Time | Subroutine |
|---|---|---|---|---|---|
| 461 | 1 | 1 | 97.4ms | 142ms | HTML::Parser::init |
| 2766 | 6 | 2 | 25.5ms | 25.5ms | HTML::Parser::handler (xsub) |
| 461 | 1 | 1 | 18.0ms | 160ms | HTML::Parser::new |
| 461 | 1 | 2 | 7.42ms | 7.42ms | HTML::Parser::_alloc_pstate (xsub) |
| 922 | 1 | 2 | 6.65ms | 6.65ms | HTML::Parser::CORE:match (opcode) |
| 461 | 1 | 2 | 5.15ms | 5.15ms | HTML::Parser::ignore_elements (xsub) |
| 7 | 1 | 2 | 283µs | 986µs | HTML::Parser::parse (xsub) |
| 7 | 1 | 2 | 48µs | 48µs | HTML::Parser::eof (xsub) |
| 0 | 0 | 0 | 0s | 0s | HTML::Parser::BEGIN |
| 0 | 0 | 0 | 0s | 0s | HTML::Parser::__ANON__[:54] |
| 0 | 0 | 0 | 0s | 0s | HTML::Parser::__ANON__[:60] |
| 0 | 0 | 0 | 0s | 0s | HTML::Parser::netscape_buggy_comment |
| 0 | 0 | 0 | 0s | 0s | HTML::Parser::parse_file |
| 0 | 0 | 0 | 0s | 0s | HTML::Parser::text |
| Line | Statements | Time on line | Calls | Time in subs | Code |
|---|---|---|---|---|---|
| 1 | package HTML::Parser; | ||||
| 2 | |||||
| 3 | # Copyright 1996-2007, Gisle Aas. | ||||
| 4 | # Copyright 1999-2000, Michael A. Chase. | ||||
| 5 | # | ||||
| 6 | # This library is free software; you can redistribute it and/or | ||||
| 7 | # modify it under the same terms as Perl itself. | ||||
| 8 | |||||
| 9 | 3 | 107µs | 1 | 27µs | use strict; # spent 27µs making 1 call to strict::import |
| 10 | 3 | 1.41ms | 1 | 240µs | use vars qw($VERSION @ISA); # spent 240µs making 1 call to vars::import |
| 11 | |||||
| 12 | 1 | 6µs | $VERSION = '3.56'; # $Date: 2007/01/12 09:18:31 $ | ||
| 13 | |||||
| 14 | 1 | 6µs | require HTML::Entities; | ||
| 15 | |||||
| 16 | 1 | 4µs | require XSLoader; | ||
| 17 | 1 | 16.9ms | 1 | 16.8ms | XSLoader::load('HTML::Parser', $VERSION); # spent 16.8ms making 1 call to XSLoader::load |
| 18 | |||||
| 19 | sub new | ||||
| 20 | # spent 160ms (18.0+142) within HTML::Parser::new which was called 461 times, avg 347µs/call:
# 461 times (18.0ms+142ms) by HTML::HeadParser::new at line 103 of HTML/HeadParser.pm, avg 347µs/call | ||||
| 21 | 1383 | 17.5ms | my $class = shift; | ||
| 22 | my $self = bless {}, $class; | ||||
| 23 | return $self->init(@_); # spent 142ms making 461 calls to HTML::Parser::init, avg 308µs/call | ||||
| 24 | } | ||||
| 25 | |||||
| 26 | |||||
| 27 | sub init | ||||
| 28 | # spent 142ms (97.4+44.7) within HTML::Parser::init which was called 461 times, avg 308µs/call:
# 461 times (97.4ms+44.7ms) by HTML::Parser::new at line 23, avg 308µs/call | ||||
| 29 | 7837 | 143ms | my $self = shift; | ||
| 30 | $self->_alloc_pstate; # spent 7.42ms making 461 calls to HTML::Parser::_alloc_pstate, avg 16µs/call | ||||
| 31 | |||||
| 32 | my %arg = @_; | ||||
| 33 | my $api_version = delete $arg{api_version} || (@_ ? 3 : 2); | ||||
| 34 | if ($api_version >= 4) { | ||||
| 35 | require Carp; | ||||
| 36 | Carp::croak("API version $api_version not supported " . | ||||
| 37 | "by HTML::Parser $VERSION"); | ||||
| 38 | } | ||||
| 39 | |||||
| 40 | if ($api_version < 3) { | ||||
| 41 | # Set up method callbacks compatible with HTML-Parser-2.xx | ||||
| 42 | $self->handler(text => "text", "self,text,is_cdata"); # spent 5.89ms making 461 calls to HTML::Parser::handler, avg 13µs/call | ||||
| 43 | $self->handler(end => "end", "self,tagname,text"); # spent 4.13ms making 461 calls to HTML::Parser::handler, avg 9µs/call | ||||
| 44 | $self->handler(process => "process", "self,token0,text"); # spent 3.98ms making 461 calls to HTML::Parser::handler, avg 9µs/call | ||||
| 45 | $self->handler(start => "start", # spent 4.47ms making 461 calls to HTML::Parser::handler, avg 10µs/call | ||||
| 46 | "self,tagname,attr,attrseq,text"); | ||||
| 47 | |||||
| 48 | $self->handler(comment => | ||||
| 49 | sub { | ||||
| 50 | my($self, $tokens) = @_; | ||||
| 51 | for (@$tokens) { | ||||
| 52 | $self->comment($_); | ||||
| 53 | } | ||||
| 54 | }, "self,tokens"); # spent 3.52ms making 461 calls to HTML::Parser::handler, avg 8µs/call | ||||
| 55 | |||||
| 56 | $self->handler(declaration => | ||||
| 57 | sub { | ||||
| 58 | my $self = shift; | ||||
| 59 | $self->declaration(substr($_[0], 2, -1)); | ||||
| 60 | }, "self,text"); # spent 3.48ms making 461 calls to HTML::Parser::handler, avg 8µs/call | ||||
| 61 | } | ||||
| 62 | |||||
| 63 | if (my $h = delete $arg{handlers}) { | ||||
| 64 | $h = {@$h} if ref($h) eq "ARRAY"; | ||||
| 65 | while (my($event, $cb) = each %$h) { | ||||
| 66 | $self->handler($event => @$cb); | ||||
| 67 | } | ||||
| 68 | } | ||||
| 69 | |||||
| 70 | # In the end we try to assume plain attribute or handler | ||||
| 71 | while (my($option, $val) = each %arg) { | ||||
| 72 | if ($option =~ /^(\w+)_h$/) { # spent 6.65ms making 922 calls to HTML::Parser::CORE:match, avg 7µs/call | ||||
| 73 | $self->handler($1 => @$val); | ||||
| 74 | } | ||||
| 75 | elsif ($option =~ /^(text|start|end|process|declaration|comment)$/) { | ||||
| 76 | require Carp; | ||||
| 77 | Carp::croak("Bad constructor option '$option'"); | ||||
| 78 | } | ||||
| 79 | else { | ||||
| 80 | $self->$option($val); # spent 5.15ms making 461 calls to HTML::Parser::ignore_elements, avg 11µs/call | ||||
| 81 | } | ||||
| 82 | } | ||||
| 83 | |||||
| 84 | return $self; | ||||
| 85 | } | ||||
| 86 | |||||
| 87 | |||||
| 88 | sub parse_file | ||||
| 89 | { | ||||
| 90 | my($self, $file) = @_; | ||||
| 91 | my $opened; | ||||
| 92 | if (!ref($file) && ref(\$file) ne "GLOB") { | ||||
| 93 | # Assume $file is a filename | ||||
| 94 | local(*F); | ||||
| 95 | open(F, $file) || return undef; | ||||
| 96 | binmode(F); # should we? good for byte counts | ||||
| 97 | $opened++; | ||||
| 98 | $file = *F; | ||||
| 99 | } | ||||
| 100 | my $chunk = ''; | ||||
| 101 | while (read($file, $chunk, 512)) { | ||||
| 102 | $self->parse($chunk) || last; | ||||
| 103 | } | ||||
| 104 | close($file) if $opened; | ||||
| 105 | $self->eof; | ||||
| 106 | } | ||||
| 107 | |||||
| 108 | |||||
| 109 | sub netscape_buggy_comment # legacy | ||||
| 110 | { | ||||
| 111 | my $self = shift; | ||||
| 112 | require Carp; | ||||
| 113 | Carp::carp("netscape_buggy_comment() is deprecated. " . | ||||
| 114 | "Please use the strict_comment() method instead"); | ||||
| 115 | my $old = !$self->strict_comment; | ||||
| 116 | $self->strict_comment(!shift) if @_; | ||||
| 117 | return $old; | ||||
| 118 | } | ||||
| 119 | |||||
| 120 | # set up method stubs | ||||
| 121 | sub text { } | ||||
| 122 | 1 | 9µs | *start = \&text; | ||
| 123 | 1 | 5µs | *end = \&text; | ||
| 124 | 1 | 6µs | *comment = \&text; | ||
| 125 | 1 | 4µs | *declaration = \&text; | ||
| 126 | 1 | 5µs | *process = \&text; | ||
| 127 | |||||
| 128 | 1 | 26µs | 1; | ||
| 129 | |||||
| 130 | __END__ | ||||
| 131 | |||||
| 132 | |||||
| 133 | =head1 NAME | ||||
| 134 | |||||
| 135 | HTML::Parser - HTML parser class | ||||
| 136 | |||||
| 137 | =head1 SYNOPSIS | ||||
| 138 | |||||
| 139 | use HTML::Parser (); | ||||
| 140 | |||||
| 141 | # Create parser object | ||||
| 142 | $p = HTML::Parser->new( api_version => 3, | ||||
| 143 | start_h => [\&start, "tagname, attr"], | ||||
| 144 | end_h => [\&end, "tagname"], | ||||
| 145 | marked_sections => 1, | ||||
| 146 | ); | ||||
| 147 | |||||
| 148 | # Parse document text chunk by chunk | ||||
| 149 | $p->parse($chunk1); | ||||
| 150 | $p->parse($chunk2); | ||||
| 151 | #... | ||||
| 152 | $p->eof; # signal end of document | ||||
| 153 | |||||
| 154 | # Parse directly from file | ||||
| 155 | $p->parse_file("foo.html"); | ||||
| 156 | # or | ||||
| 157 | open(my $fh, "<:utf8", "foo.html") || die; | ||||
| 158 | $p->parse_file($fh); | ||||
| 159 | |||||
| 160 | =head1 DESCRIPTION | ||||
| 161 | |||||
| 162 | Objects of the C<HTML::Parser> class will recognize markup and | ||||
| 163 | separate it from plain text (alias data content) in HTML | ||||
| 164 | documents. As different kinds of markup and text are recognized, the | ||||
| 165 | corresponding event handlers are invoked. | ||||
| 166 | |||||
| 167 | C<HTML::Parser> is not a generic SGML parser. We have tried to | ||||
| 168 | make it able to deal with the HTML that is actually "out there", and | ||||
| 169 | it normally parses as closely as possible to the way the popular web | ||||
| 170 | browsers do it instead of strictly following one of the many HTML | ||||
| 171 | specifications from W3C. Where there is disagreement, there is often | ||||
| 172 | an option that you can enable to get the official behaviour. | ||||
| 173 | |||||
| 174 | The document to be parsed may be supplied in arbitrary chunks. This | ||||
| 175 | makes on-the-fly parsing as documents are received from the network | ||||
| 176 | possible. | ||||
| 177 | |||||
| 178 | If event driven parsing does not feel right for your application, you | ||||
| 179 | might want to use C<HTML::PullParser>. This is an C<HTML::Parser> | ||||
| 180 | subclass that allows a more conventional program structure. | ||||
| 181 | |||||
| 182 | |||||
| 183 | =head1 METHODS | ||||
| 184 | |||||
| 185 | The following method is used to construct a new C<HTML::Parser> object: | ||||
| 186 | |||||
| 187 | =over | ||||
| 188 | |||||
| 189 | =item $p = HTML::Parser->new( %options_and_handlers ) | ||||
| 190 | |||||
| 191 | This class method creates a new C<HTML::Parser> object and | ||||
| 192 | returns it. Key/value argument pairs may be provided to assign event | ||||
| 193 | handlers or initialize parser options. The handlers and parser | ||||
| 194 | options can also be set or modified later by the method calls described below. | ||||
| 195 | |||||
| 196 | If a top level key is in the form "<event>_h" (e.g., "text_h") then it | ||||
| 197 | assigns a handler to that event, otherwise it initializes a parser | ||||
| 198 | option. The event handler specification value must be an array | ||||
| 199 | reference. Multiple handlers may also be assigned with the 'handlers | ||||
| 200 | => [%handlers]' option. See examples below. | ||||
| 201 | |||||
| 202 | If new() is called without any arguments, it will create a parser that | ||||
| 203 | uses callback methods compatible with version 2 of C<HTML::Parser>. | ||||
| 204 | See the section on "version 2 compatibility" below for details. | ||||
| 205 | |||||
| 206 | The special constructor option 'api_version => 2' can be used to | ||||
| 207 | initialize version 2 callbacks while still setting other options and | ||||
| 208 | handlers. The 'api_version => 3' option can be used if you don't want | ||||
| 209 | to set any options and don't want to fall back to v2 compatible | ||||
| 210 | mode. | ||||
| 211 | |||||
| 212 | Examples: | ||||
| 213 | |||||
| 214 | $p = HTML::Parser->new(api_version => 3, | ||||
| 215 | text_h => [ sub {...}, "dtext" ]); | ||||
| 216 | |||||
| 217 | This creates a new parser object with a text event handler subroutine | ||||
| 218 | that receives the original text with general entities decoded. | ||||
| 219 | |||||
| 220 | $p = HTML::Parser->new(api_version => 3, | ||||
| 221 | start_h => [ 'my_start', "self,tokens" ]); | ||||
| 222 | |||||
| 223 | This creates a new parser object with a start event handler method | ||||
| 224 | that receives the $p and the tokens array. | ||||
| 225 | |||||
| 226 | $p = HTML::Parser->new(api_version => 3, | ||||
| 227 | handlers => { text => [\@array, "event,text"], | ||||
| 228 | comment => [\@array, "event,text"], | ||||
| 229 | }); | ||||
| 230 | |||||
| 231 | This creates a new parser object that stores the event type and the | ||||
| 232 | original text in @array for text and comment events. | ||||
| 233 | |||||
| 234 | =back | ||||
| 235 | |||||
| 236 | The following methods feed the HTML document | ||||
| 237 | to the C<HTML::Parser> object: | ||||
| 238 | |||||
| 239 | =over | ||||
| 240 | |||||
| 241 | =item $p->parse( $string ) | ||||
| 242 | |||||
| 243 | Parse $string as the next chunk of the HTML document. The return | ||||
| 244 | value is normally a reference to the parser object (i.e. $p). | ||||
| 245 | Handlers invoked should not attempt to modify the $string in-place until | ||||
| 246 | $p->parse returns. | ||||
| 247 | |||||
| 248 | If an invoked event handler aborts parsing by calling $p->eof, then | ||||
| 249 | $p->parse() will return a FALSE value. | ||||
| 250 | |||||
| 251 | =item $p->parse( $code_ref ) | ||||
| 252 | |||||
| 253 | If a code reference is passed as the argument to be parsed, then the | ||||
| 254 | chunks to be parsed are obtained by invoking this function repeatedly. | ||||
| 255 | Parsing continues until the function returns an empty (or undefined) | ||||
| 256 | result. When this happens $p->eof is automatically signaled. | ||||
| 257 | |||||
| 258 | Parsing will also abort if one of the event handlers calls $p->eof. | ||||
| 259 | |||||
| 260 | The effect of this is the same as: | ||||
| 261 | |||||
| 262 | while (1) { | ||||
| 263 | my $chunk = &$code_ref(); | ||||
| 264 | if (!defined($chunk) || !length($chunk)) { | ||||
| 265 | $p->eof; | ||||
| 266 | return $p; | ||||
| 267 | } | ||||
| 268 | $p->parse($chunk) || return undef; | ||||
| 269 | } | ||||
| 270 | |||||
| 271 | But it is more efficient as this loop runs internally in XS code. | ||||
| 272 | |||||
| 273 | =item $p->parse_file( $file ) | ||||
| 274 | |||||
| 275 | Parse text directly from a file. The $file argument can be a | ||||
| 276 | filename, an open file handle, or a reference to an open file | ||||
| 277 | handle. | ||||
| 278 | |||||
| 279 | If $file contains a filename and the file can't be opened, then the | ||||
| 280 | method returns an undefined value and $! tells why it failed. | ||||
| 281 | Otherwise the return value is a reference to the parser object. | ||||
| 282 | |||||
| 283 | If a file handle is passed as the $file argument, then the file will | ||||
| 284 | normally be read until EOF, but not closed. | ||||
| 285 | |||||
| 286 | If an invoked event handler aborts parsing by calling $p->eof, | ||||
| 287 | then $p->parse_file() may not have read the entire file. | ||||
| 288 | |||||
| 289 | On systems with multi-byte line terminators, the values passed for the | ||||
| 290 | offset and length argspecs may be too low if parse_file() is called on | ||||
| 291 | a file handle that is not in binary mode. | ||||
| 292 | |||||
| 293 | If a filename is passed in, then parse_file() will open the file in | ||||
| 294 | binary mode. | ||||
| 295 | |||||
| 296 | =item $p->eof | ||||
| 297 | |||||
| 298 | Signals the end of the HTML document. Calling the $p->eof method | ||||
| 299 | outside a handler callback will flush any remaining buffered text | ||||
| 300 | (which triggers the C<text> event if there is any remaining text). | ||||
| 301 | |||||
| 302 | Calling $p->eof inside a handler will terminate parsing at that point | ||||
| 303 | and cause $p->parse to return a FALSE value. This also terminates | ||||
| 304 | parsing by $p->parse_file(). | ||||
| 305 | |||||
| 306 | After $p->eof has been called, the parse() and parse_file() methods | ||||
| 307 | can be invoked to feed new documents to the parser object. | ||||
| 308 | |||||
| 309 | The return value from eof() is a reference to the parser object. | ||||
| 310 | |||||
| 311 | =back | ||||
| 312 | |||||
| 313 | |||||
| 314 | Most parser options are controlled by boolean attributes. | ||||
| 315 | Each boolean attribute is enabled by calling the corresponding method | ||||
| 316 | with a TRUE argument and disabled with a FALSE argument. The | ||||
| 317 | attribute value is left unchanged if no argument is given. The return | ||||
| 318 | value from each method is the old attribute value. | ||||
| 319 | |||||
| 320 | Methods that can be used to get and/or set parser options are: | ||||
| 321 | |||||
| 322 | =over | ||||
| 323 | |||||
| 324 | =item $p->attr_encoded | ||||
| 325 | |||||
| 326 | =item $p->attr_encoded( $bool ) | ||||
| 327 | |||||
| 328 | By default, the C<attr> and C<@attr> argspecs will have general | ||||
| 329 | entities for attribute values decoded. Enabling this attribute leaves | ||||
| 330 | entities alone. | ||||
| 331 | |||||
| 332 | =item $p->boolean_attribute_value( $val ) | ||||
| 333 | |||||
| 334 | This method sets the value reported for boolean attributes inside HTML | ||||
| 335 | start tags. By default, the name of the attribute is also used as its | ||||
| 336 | value. This affects the values reported for C<tokens> and C<attr> | ||||
| 337 | argspecs. | ||||
| 338 | |||||
| 339 | =item $p->case_sensitive | ||||
| 340 | |||||
| 341 | =item $p->case_sensitive( $bool ) | ||||
| 342 | |||||
| 343 | By default, tagnames and attribute names are down-cased. Enabling this | ||||
| 344 | attribute leaves them as found in the HTML source document. | ||||
| 345 | |||||
| 346 | =item $p->closing_plaintext | ||||
| 347 | |||||
| 348 | =item $p->closing_plaintext( $bool ) | ||||
| 349 | |||||
| 350 | By default, a "plaintext" element can never be closed. Everything up to | ||||
| 351 | the end of the document is parsed in CDATA mode. This historical | ||||
| 352 | behaviour is what at least MSIE does. Enabling this attribute makes the | ||||
| 353 | closing "</plaintext>" tag effective and the parsing process will resume | ||||
| 354 | after seeing this tag. This emulates gecko-based browsers. | ||||
| 355 | |||||
| 356 | =item $p->empty_element_tags | ||||
| 357 | |||||
| 358 | =item $p->empty_element_tags( $bool ) | ||||
| 359 | |||||
| 360 | By default, empty element tags are not recognized as such and the "/" | ||||
| 361 | before ">" is just treated like a normal name character (unless | ||||
| 362 | C<strict_names> is enabled). Enabling this attribute makes | ||||
| 363 | C<HTML::Parser> recognize these tags. | ||||
| 364 | |||||
| 365 | Empty element tags look like start tags, but end with the character | ||||
| 366 | sequence "/>" instead of ">". When recognized by C<HTML::Parser> they | ||||
| 367 | cause an artificial end event in addition to the start event. The | ||||
| 368 | C<text> for the artificial end event will be empty and the C<tokenpos> | ||||
| 369 | array will be undefined even though the token array will have one | ||||
| 370 | element containing the tag name. | ||||
| 371 | |||||
| 372 | =item $p->marked_sections | ||||
| 373 | |||||
| 374 | =item $p->marked_sections( $bool ) | ||||
| 375 | |||||
| 376 | By default, section markings like <![CDATA[...]]> are treated like | ||||
| 377 | ordinary text. When this attribute is enabled section markings are | ||||
| 378 | honoured. | ||||
| 379 | |||||
| 380 | There are currently no events associated with the marked section | ||||
| 381 | markup, but the text can be returned as C<skipped_text>. | ||||
| 382 | |||||
| 383 | =item $p->strict_comment | ||||
| 384 | |||||
| 385 | =item $p->strict_comment( $bool ) | ||||
| 386 | |||||
| 387 | By default, comments are terminated by the first occurrence of "-->". | ||||
| 388 | This is the behaviour of most popular browsers (like Mozilla, Opera and | ||||
| 389 | MSIE), but it is not correct according to the official HTML | ||||
| 390 | standard. Officially, you need an even number of "--" tokens before | ||||
| 391 | the closing ">" is recognized and there may not be anything but | ||||
| 392 | whitespace between an even and an odd "--". | ||||
| 393 | |||||
| 394 | The official behaviour is enabled by enabling this attribute. | ||||
| 395 | |||||
| 396 | Enabling of 'strict_comment' also disables recognizing these forms as | ||||
| 397 | comments: | ||||
| 398 | |||||
| 399 | </ comment> | ||||
| 400 | <! comment> | ||||
| 401 | |||||
| 402 | |||||
| 403 | =item $p->strict_end | ||||
| 404 | |||||
| 405 | =item $p->strict_end( $bool ) | ||||
| 406 | |||||
| 407 | By default, attributes and other junk are allowed to be present on end tags in a | ||||
| 408 | manner that emulates MSIE's behaviour. | ||||
| 409 | |||||
| 410 | The official behaviour is enabled with this attribute. If enabled, | ||||
| 411 | only whitespace is allowed between the tagname and the final ">". | ||||
| 412 | |||||
| 413 | =item $p->strict_names | ||||
| 414 | |||||
| 415 | =item $p->strict_names( $bool ) | ||||
| 416 | |||||
| 417 | By default, almost anything is allowed in tag and attribute names. | ||||
| 418 | This is the behaviour of most popular browsers and allows us to parse | ||||
| 419 | some broken tags with invalid attribute values like: | ||||
| 420 | |||||
| 421 | <IMG SRC=newprevlstGr.gif ALT=[PREV LIST] BORDER=0> | ||||
| 422 | |||||
| 423 | By default, "LIST]" is parsed as a boolean attribute, not as | ||||
| 424 | part of the ALT value as was clearly intended. This is also what | ||||
| 425 | Mozilla sees. | ||||
| 426 | |||||
| 427 | The official behaviour is enabled by enabling this attribute. If | ||||
| 428 | enabled, it will cause the tag above to be reported as text | ||||
| 429 | since "LIST]" is not a legal attribute name. | ||||
| 430 | |||||
| 431 | =item $p->unbroken_text | ||||
| 432 | |||||
| 433 | =item $p->unbroken_text( $bool ) | ||||
| 434 | |||||
| 435 | By default, blocks of text are given to the text handler as soon as | ||||
| 436 | possible (but the parser takes care always to break text at a | ||||
| 437 | boundary between whitespace and non-whitespace so single words and | ||||
| 438 | entities can always be decoded safely). This might create breaks that | ||||
| 439 | make it hard to do transformations on the text. When this attribute is | ||||
| 440 | enabled, blocks of text are always reported in one piece. This will | ||||
| 441 | delay the text event until the following (non-text) event has been | ||||
| 442 | recognized by the parser. | ||||
| 443 | |||||
| 444 | Note that the C<offset> argspec will give you the offset of the first | ||||
| 445 | segment of text and C<length> is the combined length of the segments. | ||||
| 446 | Since there might be ignored tags in between, these numbers can't be | ||||
| 447 | used to directly index in the original document file. | ||||
| 448 | |||||
| 449 | =item $p->utf8_mode | ||||
| 450 | |||||
| 451 | =item $p->utf8_mode( $bool ) | ||||
| 452 | |||||
| 453 | Enable this option when parsing raw undecoded UTF-8. This tells the | ||||
| 454 | parser that the entities expanded for strings reported by C<attr>, | ||||
| 455 | C<@attr> and C<dtext> should be expanded as decoded UTF-8 so they end | ||||
| 456 | up compatible with the surrounding text. | ||||
| 457 | |||||
| 458 | If C<utf8_mode> is enabled then it is an error to pass strings | ||||
| 459 | containing characters with code above 255 to the parse() method, and | ||||
| 460 | the parse() method will croak if you try. | ||||
| 461 | |||||
| 462 | Example: The Unicode character "\x{2665}" is "\xE2\x99\xA5" when UTF-8 | ||||
| 463 | encoded. The character can also be represented by the entity | ||||
| 464 | "&hearts;" or "&#x2665;". If we feed the parser: | ||||
| 465 | |||||
| 466 | $p->parse("\xE2\x99\xA5&hearts;"); | ||||
| 467 | |||||
| 468 | then C<dtext> will be reported as "\xE2\x99\xA5\x{2665}" without | ||||
| 469 | C<utf8_mode> enabled, but as "\xE2\x99\xA5\xE2\x99\xA5" when enabled. | ||||
| 470 | The latter string is what you want. | ||||
| 471 | |||||
| 472 | This option is only available with perl-5.8 or better. | ||||
| 473 | |||||
| 474 | =item $p->xml_mode | ||||
| 475 | |||||
| 476 | =item $p->xml_mode( $bool ) | ||||
| 477 | |||||
| 478 | Enabling this attribute changes the parser to allow some XML | ||||
| 479 | constructs. This enables the behaviour controlled individually by | ||||
| 480 | the C<case_sensitive>, C<empty_element_tags>, C<strict_names> and | ||||
| 481 | C<xml_pic> attributes and also suppresses special treatment of | ||||
| 482 | elements that are parsed as CDATA for HTML. | ||||
| 483 | |||||
| 484 | =item $p->xml_pic | ||||
| 485 | |||||
| 486 | =item $p->xml_pic( $bool ) | ||||
| 487 | |||||
| 488 | By default, I<processing instructions> are terminated by ">". When | ||||
| 489 | this attribute is enabled, processing instructions are terminated by | ||||
| 490 | "?>" instead. | ||||
| 491 | |||||
| 492 | =back | ||||
| 493 | |||||
| 494 | As markup and text are recognized, handlers are invoked. The following | ||||
| 495 | method is used to set up handlers for different events: | ||||
| 496 | |||||
| 497 | =over | ||||
| 498 | |||||
| 499 | =item $p->handler( event => \&subroutine, $argspec ) | ||||
| 500 | |||||
| 501 | =item $p->handler( event => $method_name, $argspec ) | ||||
| 502 | |||||
| 503 | =item $p->handler( event => \@accum, $argspec ) | ||||
| 504 | |||||
| 505 | =item $p->handler( event => "" ); | ||||
| 506 | |||||
| 507 | =item $p->handler( event => undef ); | ||||
| 508 | |||||
| 509 | =item $p->handler( event ); | ||||
| 510 | |||||
| 511 | This method assigns a subroutine, method, or array to handle an event. | ||||
| 512 | |||||
| 513 | Event is one of C<text>, C<start>, C<end>, C<declaration>, C<comment>, | ||||
| 514 | C<process>, C<start_document>, C<end_document> or C<default>. | ||||
| 515 | |||||
| 516 | The C<\&subroutine> is a reference to a subroutine which is called to handle | ||||
| 517 | the event. | ||||
| 518 | |||||
| 519 | The C<$method_name> is the name of a method of $p which is called to handle | ||||
| 520 | the event. | ||||
| 521 | |||||
| 522 | The C<@accum> is an array that will hold the event information as | ||||
| 523 | sub-arrays. | ||||
| 524 | |||||
| 525 | If the second argument is "", the event is ignored. | ||||
| 526 | If it is undef, the default handler is invoked for the event. | ||||
| 527 | |||||
| 528 | The C<$argspec> is a string that describes the information to be reported | ||||
| 529 | for the event. Any requested information that does not apply to a | ||||
| 530 | specific event is passed as C<undef>. If argspec is omitted, then it | ||||
| 531 | is left unchanged. | ||||
| 532 | |||||
| 533 | The return value from $p->handler is the old callback routine or a | ||||
| 534 | reference to the accumulator array. | ||||
| 535 | |||||
| 536 | Any return values from handler callback routines/methods are always | ||||
| 537 | ignored. A handler callback can request parsing to be aborted by | ||||
| 538 | invoking the $p->eof method. A handler callback is not allowed to | ||||
| 539 | invoke the $p->parse() or $p->parse_file() method. An exception will | ||||
| 540 | be raised if it tries. | ||||
| 541 | |||||
| 542 | Examples: | ||||
| 543 | |||||
| 544 | $p->handler(start => "start", 'self, attr, attrseq, text' ); | ||||
| 545 | |||||
| 546 | This causes the "start" method of object $p to be called for 'start' events. | ||||
| 547 | The callback signature is $p->start(\%attr, \@attr_seq, $text). | ||||
| 548 | |||||
| 549 | $p->handler(start => \&start, 'attr, attrseq, text' ); | ||||
| 550 | |||||
| 551 | This causes subroutine start() to be called for 'start' events. | ||||
| 552 | The callback signature is start(\%attr, \@attr_seq, $text). | ||||
| 553 | |||||
| 554 | $p->handler(start => \@accum, '"S", attr, attrseq, text' ); | ||||
| 555 | |||||
| 556 | This causes 'start' event information to be saved in @accum. | ||||
| 557 | The array elements will be ['S', \%attr, \@attr_seq, $text]. | ||||
| 558 | |||||
| 559 | $p->handler(start => ""); | ||||
| 560 | |||||
| 561 | This causes 'start' events to be ignored. It also suppresses | ||||
| 562 | invocations of any default handler for start events. It is in most | ||||
| 563 | cases equivalent to $p->handler(start => sub {}), but is more | ||||
| 564 | efficient. It is different from the empty-sub-handler in that | ||||
| 565 | C<skipped_text> is not reset by it. | ||||
| 566 | |||||
| 567 | $p->handler(start => undef); | ||||
| 568 | |||||
| 569 | This causes no handler to be associated with start events. | ||||
| 570 | If there is a default handler it will be invoked. | ||||
| 571 | |||||
| 572 | =back | ||||
| 573 | |||||
| 574 | Filters based on tags can be set up to limit the number of events | ||||
| 575 | reported. The main bottleneck during parsing is often the huge number | ||||
| 576 | of callbacks made from the parser. Applying filters can improve | ||||
| 577 | performance significantly. | ||||
| 578 | |||||
| 579 | The following methods control filters: | ||||
| 580 | |||||
| 581 | =over | ||||
| 582 | |||||
| 583 | =item $p->ignore_elements( @tags ) | ||||
| 584 | |||||
| 585 | Both the C<start> event and the C<end> event as well as any events that | ||||
| 586 | would be reported in between are suppressed. The ignored elements can | ||||
| 587 | contain nested occurrences of themselves. Example: | ||||
| 588 | |||||
| 589 | $p->ignore_elements(qw(script style)); | ||||
| 590 | |||||
| 591 | The C<script> and C<style> tags will always nest properly since their | ||||
| 592 | content is parsed in CDATA mode. For most other tags | ||||
| 593 | C<ignore_elements> must be used with caution since HTML is often not | ||||
| 594 | I<well formed>. | ||||
| 595 | |||||
| 596 | =item $p->ignore_tags( @tags ) | ||||
| 597 | |||||
| 598 | Any C<start> and C<end> events involving any of the tags given are | ||||
| 599 | suppressed. To reset the filter (i.e. don't suppress any C<start> and | ||||
| 600 | C<end> events), call C<ignore_tags> without an argument. | ||||
| 601 | |||||
| 602 | =item $p->report_tags( @tags ) | ||||
| 603 | |||||
| 604 | Any C<start> and C<end> events involving any of the tags I<not> given | ||||
| 605 | are suppressed. To reset the filter (i.e. report all C<start> and | ||||
| 606 | C<end> events), call C<report_tags> without an argument. | ||||
| 607 | |||||
| 608 | =back | ||||
| 609 | |||||
| 610 | Internally, the system has two filter lists, one for C<report_tags> | ||||
| 611 | and one for C<ignore_tags>, and both filters are applied. This | ||||
| 612 | effectively gives C<ignore_tags> precedence over C<report_tags>. | ||||
| 613 | |||||
| 614 | Examples: | ||||
| 615 | |||||
| 616 | $p->ignore_tags(qw(style)); | ||||
| 617 | $p->report_tags(qw(script style)); | ||||
| 618 | |||||
| 619 | results in only C<script> events being reported. | ||||
| 620 | |||||
| 621 | =head2 Argspec | ||||
| 622 | |||||
| 623 | Argspec is a string containing a comma-separated list that describes | ||||
| 624 | the information reported by the event. The following argspec | ||||
| 625 | identifier names can be used: | ||||
| 626 | |||||
| 627 | =over | ||||
| 628 | |||||
| 629 | =item C<attr> | ||||
| 630 | |||||
| 631 | Attr causes a reference to a hash of attribute name/value pairs to be | ||||
| 632 | passed. | ||||
| 633 | |||||
| 634 | Boolean attributes' values are either the value set by | ||||
| 635 | $p->boolean_attribute_value, or the attribute name if no value has been | ||||
| 636 | set by $p->boolean_attribute_value. | ||||
| 637 | |||||
| 638 | This passes undef except for C<start> events. | ||||
| 639 | |||||
| 640 | Unless C<xml_mode> or C<case_sensitive> is enabled, the attribute | ||||
| 641 | names are forced to lower case. | ||||
| 642 | |||||
| 643 | General entities are decoded in the attribute values and | ||||
| 644 | one layer of matching quotes enclosing the attribute values is removed. | ||||
| 645 | |||||
| 646 | The Unicode character set is assumed for entity decoding. With Perl | ||||
| 647 | version 5.6 or earlier only the Latin-1 range is supported, and | ||||
| 648 | entities for characters outside the range 0..255 are left unchanged. | ||||
| 649 | |||||
| 650 | =item C<@attr> | ||||
| 651 | |||||
| 652 | Basically the same as C<attr>, but keys and values are passed as | ||||
| 653 | individual arguments and the original sequence of the attributes is | ||||
| 654 | kept. The parameters passed will be the same as the @attr calculated | ||||
| 655 | here: | ||||
| 656 | |||||
| 657 | @attr = map { $_ => $attr->{$_} } @$attrseq; | ||||
| 658 | |||||
| 659 | assuming $attr and $attrseq here are the hash and array passed as the | ||||
| 660 | result of C<attr> and C<attrseq> argspecs. | ||||
| 661 | |||||
| 662 | This passes no values for events besides C<start>. | ||||
| 663 | |||||
| 664 | =item C<attrseq> | ||||
| 665 | |||||
| 666 | Attrseq causes a reference to an array of attribute names to be | ||||
| 667 | passed. This can be useful if you want to walk the C<attr> hash in | ||||
| 668 | the original sequence. | ||||
| 669 | |||||
| 670 | This passes undef except for C<start> events. | ||||
| 671 | |||||
| 672 | Unless C<xml_mode> or C<case_sensitive> is enabled, the attribute | ||||
| 673 | names are forced to lower case. | ||||
| 674 | |||||
| 675 | =item C<column> | ||||
| 676 | |||||
| 677 | Column causes the column number of the start of the event to be passed. | ||||
| 678 | The first column on a line is 0. | ||||
| 679 | |||||
| 680 | =item C<dtext> | ||||
| 681 | |||||
| 682 | Dtext causes the decoded text to be passed. General entities are | ||||
| 683 | automatically decoded unless the event was inside a CDATA section or | ||||
| 684 | was between literal start and end tags (C<script>, C<style>, | ||||
| 685 | C<xmp>, and C<plaintext>). | ||||
| 686 | |||||
| 687 | The Unicode character set is assumed for entity decoding. With Perl | ||||
| 688 | version 5.6 or earlier only the Latin-1 range is supported, and | ||||
| 689 | entities for characters outside the range 0..255 are left unchanged. | ||||
| 690 | |||||
| 691 | This passes undef except for C<text> events. | ||||
| 692 | |||||
| 693 | =item C<event> | ||||
| 694 | |||||
| 695 | Event causes the event name to be passed. | ||||
| 696 | |||||
| 697 | The event name is one of C<text>, C<start>, C<end>, C<declaration>, | ||||
| 698 | C<comment>, C<process>, C<start_document> or C<end_document>. | ||||
| 699 | |||||
| 700 | =item C<is_cdata> | ||||
| 701 | |||||
| 702 | Is_cdata causes a TRUE value to be passed if the event is inside a CDATA | ||||
| 703 | section or between literal start and end tags (C<script>, | ||||
| 704 | C<style>, C<xmp>, and C<plaintext>). | ||||
| 705 | |||||
| 706 | If the flag is FALSE for a text event, then you should normally | ||||
| 707 | either use C<dtext> or decode the entities yourself before the text is | ||||
| 708 | processed further. | ||||
| 709 | |||||
| 710 | =item C<length> | ||||
| 711 | |||||
| 712 | Length causes the number of bytes of the source text of the event to | ||||
| 713 | be passed. | ||||
| 714 | |||||
| 715 | =item C<line> | ||||
| 716 | |||||
| 717 | Line causes the line number of the start of the event to be passed. | ||||
| 718 | The first line in the document is 1. Line counting doesn't start | ||||
| 719 | until at least one handler requests this value to be reported. | ||||
| 720 | |||||
| 721 | =item C<offset> | ||||
| 722 | |||||
| 723 | Offset causes the byte position in the HTML document of the start of | ||||
| 724 | the event to be passed. The first byte in the document has offset 0. | ||||
| 725 | |||||
| 726 | =item C<offset_end> | ||||
| 727 | |||||
| 728 | Offset_end causes the byte position in the HTML document of the end of | ||||
| 729 | the event to be passed. This is the same as C<offset> + C<length>. | ||||
| 730 | |||||
| 731 | =item C<self> | ||||
| 732 | |||||
| 733 | Self causes the current object to be passed to the handler. If the | ||||
| 734 | handler is a method, this must be the first element in the argspec. | ||||
| 735 | |||||
| 736 | An alternative to passing self as an argspec is to register closures | ||||
| 737 | that capture $self by themselves as handlers. Unfortunately this | ||||
| 738 | creates circular references which prevent the HTML::Parser object | ||||
| 739 | from being garbage collected. Using the C<self> argspec avoids this | ||||
| 740 | problem. | ||||
| 741 | |||||
| 742 | =item C<skipped_text> | ||||
| 743 | |||||
| 744 | Skipped_text returns the concatenated text of all the events that have | ||||
| 745 | been skipped since the last time an event was reported. Events might | ||||
| 746 | be skipped because no handler is registered for them or because some | ||||
| 747 | filter applies. Skipped text also includes marked section markup, | ||||
| 748 | since there are no events that can catch it. | ||||
| 749 | |||||
| 750 | If an C<"">-handler is registered for an event, then the text for this | ||||
| 751 | event is not included in C<skipped_text>. Skipped text both before | ||||
| 752 | and after the C<"">-event is included in the next reported | ||||
| 753 | C<skipped_text>. | ||||
| 754 | |||||
| 755 | =item C<tag> | ||||
| 756 | |||||
| 757 | Same as C<tagname>, but prefixed with "/" if it belongs to an C<end> | ||||
| 758 | event and "!" for a declaration. The C<tag> does not have any prefix | ||||
| 759 | for C<start> events, and is in this case identical to C<tagname>. | ||||
| 760 | |||||
| 761 | =item C<tagname> | ||||
| 762 | |||||
| 763 | This is the element name (or I<generic identifier> in SGML jargon) for | ||||
| 764 | start and end tags. Since HTML is case insensitive, this name is | ||||
| 765 | forced to lower case to ease string matching. | ||||
| 766 | |||||
| 767 | Since XML is case sensitive, the tagname case is not changed when | ||||
| 768 | C<xml_mode> is enabled. The same happens if the C<case_sensitive> attribute | ||||
| 769 | is set. | ||||
| 770 | |||||
| 771 | The declaration type of declaration elements is also passed as a tagname, | ||||
| 772 | even if that is a bit strange. | ||||
| 773 | In fact, in the current implementation tagname is | ||||
| 774 | identical to C<token0> except that the name may be forced to lower case. | ||||
| 775 | |||||
| 776 | =item C<token0> | ||||
| 777 | |||||
| 778 | Token0 causes the original text of the first token string to be | ||||
| 779 | passed. This should always be the same as $tokens->[0]. | ||||
| 780 | |||||
| 781 | For C<declaration> events, this is the declaration type. | ||||
| 782 | |||||
| 783 | For C<start> and C<end> events, this is the tag name. | ||||
| 784 | |||||
| 785 | For C<process> and non-strict C<comment> events, this is everything | ||||
| 786 | inside the tag. | ||||
| 787 | |||||
| 788 | This passes undef if there are no tokens in the event. | ||||
| 789 | |||||
| 790 | =item C<tokenpos> | ||||
| 791 | |||||
| 792 | Tokenpos causes a reference to an array of token positions to be | ||||
| 793 | passed. For each string that appears in C<tokens>, this array | ||||
| 794 | contains two numbers. The first number is the offset of the start of | ||||
| 795 | the token in the original C<text> and the second number is the length | ||||
| 796 | of the token. | ||||
| 797 | |||||
| 798 | Boolean attributes in a C<start> event will have (0,0) for the | ||||
| 799 | attribute value offset and length. | ||||
| 800 | |||||
| 801 | This passes undef if there are no tokens in the event (e.g., C<text>) | ||||
| 802 | and for artificial C<end> events triggered by empty element tags. | ||||
| 803 | |||||
| 804 | If you are using these offsets and lengths to modify C<text>, you | ||||
| 805 | should either work from right to left, or be very careful to calculate | ||||
| 806 | the changes to the offsets. | ||||
| 807 | |||||
| 808 | =item C<tokens> | ||||
| 809 | |||||
| 810 | Tokens causes a reference to an array of token strings to be passed. | ||||
| 811 | The strings are exactly as they were found in the original text, | ||||
| 812 | no decoding or case changes are applied. | ||||
| 813 | |||||
| 814 | For C<declaration> events, the array contains each word, comment, and | ||||
| 815 | delimited string starting with the declaration type. | ||||
| 816 | |||||
| 817 | For C<comment> events, this contains each sub-comment. If | ||||
| 818 | $p->strict_comment is disabled, there will be only one sub-comment. | ||||
| 819 | |||||
| 820 | For C<start> events, this contains the original tag name followed by | ||||
| 821 | the attribute name/value pairs. The values of boolean attributes will | ||||
| 822 | be either the value set by $p->boolean_attribute_value, or the | ||||
| 823 | attribute name if no value has been set by | ||||
| 824 | $p->boolean_attribute_value. | ||||
| 825 | |||||
| 826 | For C<end> events, this contains the original tag name (always one token). | ||||
| 827 | |||||
| 828 | For C<process> events, this contains the process instructions (always one | ||||
| 829 | token). | ||||
| 830 | |||||
| 831 | This passes C<undef> for C<text> events. | ||||
| 832 | |||||
| 833 | =item C<text> | ||||
| 834 | |||||
| 835 | Text causes the source text (including markup element delimiters) to be | ||||
| 836 | passed. | ||||
| 837 | |||||
| 838 | =item C<undef> | ||||
| 839 | |||||
| 840 | Pass an undefined value. Useful as padding where the same handler | ||||
| 841 | routine is registered for multiple events. | ||||
| 842 | |||||
| 843 | =item C<'...'> | ||||
| 844 | |||||
| 845 | A literal string of 0 to 255 characters enclosed | ||||
| 846 | in single (') or double (") quotes is passed as entered. | ||||
| 847 | |||||
| 848 | =back | ||||
| 849 | |||||
| 850 | The whole argspec string can be wrapped up in C<'@{...}'> to signal | ||||
| 851 | that the resulting event array should be flattened. This only makes a | ||||
| 852 | difference if an array reference is used as the handler target. | ||||
| 853 | Consider this example: | ||||
| 854 | |||||
| 855 | $p->handler(text => [], 'text'); | ||||
| 856 | $p->handler(text => [], '@{text}'); | ||||
| 857 | |||||
| 858 | With two text events; C<"foo">, C<"bar">; then the first example will end | ||||
| 859 | up with [["foo"], ["bar"]] and the second with ["foo", "bar"] in | ||||
| 860 | the handler target array. | ||||
| 861 | |||||
| 862 | |||||
| 863 | =head2 Events | ||||
| 864 | |||||
| 865 | Handlers for the following events can be registered: | ||||
| 866 | |||||
| 867 | =over | ||||
| 868 | |||||
| 869 | =item C<comment> | ||||
| 870 | |||||
| 871 | This event is triggered when a markup comment is recognized. | ||||
| 872 | |||||
| 873 | Example: | ||||
| 874 | |||||
| 875 | <!-- This is a comment -- -- So is this --> | ||||
| 876 | |||||
| 877 | =item C<declaration> | ||||
| 878 | |||||
| 879 | This event is triggered when a I<markup declaration> is recognized. | ||||
| 880 | |||||
| 881 | For typical HTML documents, the only declaration you are | ||||
| 882 | likely to find is <!DOCTYPE ...>. | ||||
| 883 | |||||
| 884 | Example: | ||||
| 885 | |||||
| 886 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" | ||||
| 887 | "http://www.w3.org/TR/html40/strict.dtd"> | ||||
| 888 | |||||
| 889 | DTDs inside <!DOCTYPE ...> will confuse HTML::Parser. | ||||
| 890 | |||||
| 891 | =item C<default> | ||||
| 892 | |||||
| 893 | This event is triggered for events that do not have a specific | ||||
| 894 | handler. You can set up a handler for this event to catch stuff you | ||||
| 895 | did not want to catch explicitly. | ||||
| 896 | |||||
| 897 | =item C<end> | ||||
| 898 | |||||
| 899 | This event is triggered when an end tag is recognized. | ||||
| 900 | |||||
| 901 | Example: | ||||
| 902 | |||||
| 903 | </A> | ||||
| 904 | |||||
| 905 | =item C<end_document> | ||||
| 906 | |||||
| 907 | This event is triggered when $p->eof is called and after any remaining | ||||
| 908 | text is flushed. There is no document text associated with this event. | ||||
| 909 | |||||
| 910 | =item C<process> | ||||
| 911 | |||||
| 912 | This event is triggered when processing instruction markup is | ||||
| 913 | recognized. | ||||
| 914 | |||||
| 915 | The format and content of processing instructions are system and | ||||
| 916 | application dependent. | ||||
| 917 | |||||
| 918 | Examples: | ||||
| 919 | |||||
| 920 | <? HTML processing instructions > | ||||
| 921 | <? XML processing instructions ?> | ||||
| 922 | |||||
| 923 | =item C<start> | ||||
| 924 | |||||
| 925 | This event is triggered when a start tag is recognized. | ||||
| 926 | |||||
| 927 | Example: | ||||
| 928 | |||||
| 929 | <A HREF="http://www.perl.com/"> | ||||
| 930 | |||||
| 931 | =item C<start_document> | ||||
| 932 | |||||
| 933 | This event is triggered before any other events for a new document. A | ||||
| 934 | handler for it can be used to initialize stuff. There is no document | ||||
| 935 | text associated with this event. | ||||
| 936 | |||||
| 937 | =item C<text> | ||||
| 938 | |||||
| 939 | This event is triggered when plain text (characters) is recognized. | ||||
| 940 | The text may contain multiple lines. A sequence of text may be broken | ||||
| 941 | between several text events unless $p->unbroken_text is enabled. | ||||
| 942 | |||||
| 943 | The parser will make sure that it does not break a word or a sequence | ||||
| 944 | of whitespace between two text events. | ||||
| 945 | |||||
| 946 | =back | ||||
| 947 | |||||
| 948 | =head2 Unicode | ||||
| 949 | |||||
| 950 | The C<HTML::Parser> can parse Unicode strings when running under | ||||
| 951 | perl-5.8 or better. If Unicode is passed to $p->parse() then chunks | ||||
| 952 | of Unicode will be reported to the handlers. The offset and length | ||||
| 953 | argspecs will also report their position in terms of characters. | ||||
| 954 | |||||
| 955 | It is safe to parse raw undecoded UTF-8 if you either avoid decoding | ||||
| 956 | entities and make sure to not use I<argspecs> that do, or enable the | ||||
| 957 | C<utf8_mode> for the parser. Parsing of undecoded UTF-8 might be | ||||
| 958 | useful when parsing from a file where you need the reported offsets | ||||
| 959 | and lengths to match the byte offsets in the file. | ||||
| 960 | |||||
| 961 | If a filename is passed to $p->parse_file() then the file will be read | ||||
| 962 | in binary mode. This will be fine if the file contains only ASCII or | ||||
| 963 | Latin-1 characters. If the file contains UTF-8 encoded text then care | ||||
| 964 | must be taken when decoding entities as described in the previous | ||||
| 965 | paragraph, but better is to open the file with the UTF-8 layer so that | ||||
| 966 | it is decoded properly: | ||||
| 967 | |||||
| 968 | open(my $fh, "<:utf8", "index.html") || die "...: $!"; | ||||
| 969 | $p->parse_file($fh); | ||||
| 970 | |||||
| 971 | If the file contains text encoded in a charset besides ASCII, Latin-1 | ||||
| 972 | or UTF-8 then decoding will always be needed. | ||||
| 973 | |||||
| 974 | =head1 VERSION 2 COMPATIBILITY | ||||
| 975 | |||||
| 976 | When an C<HTML::Parser> object is constructed with no arguments, a set | ||||
| 977 | of handlers is automatically provided that is compatible with the old | ||||
| 978 | HTML::Parser version 2 callback methods. | ||||
| 979 | |||||
| 980 | This is equivalent to the following method calls: | ||||
| 981 | |||||
| 982 | $p->handler(start => "start", "self, tagname, attr, attrseq, text"); | ||||
| 983 | $p->handler(end => "end", "self, tagname, text"); | ||||
| 984 | $p->handler(text => "text", "self, text, is_cdata"); | ||||
| 985 | $p->handler(process => "process", "self, token0, text"); | ||||
| 986 | $p->handler(comment => | ||||
| 987 | sub { | ||||
| 988 | my($self, $tokens) = @_; | ||||
| 989 | for (@$tokens) {$self->comment($_);}}, | ||||
| 990 | "self, tokens"); | ||||
| 991 | $p->handler(declaration => | ||||
| 992 | sub { | ||||
| 993 | my $self = shift; | ||||
| 994 | $self->declaration(substr($_[0], 2, -1));}, | ||||
| 995 | "self, text"); | ||||
| 996 | |||||
| 997 | Setting up these handlers can also be requested with the "api_version => | ||||
| 998 | 2" constructor option. | ||||
| 999 | |||||
| 1000 | =head1 SUBCLASSING | ||||
| 1001 | |||||
| 1002 | The C<HTML::Parser> class is subclassable. Parser objects are plain | ||||
| 1003 | hashes and C<HTML::Parser> reserves only hash keys that start with | ||||
| 1004 | "_hparser". The parser state can be set up by invoking the init() | ||||
| 1005 | method, which takes the same arguments as new(). | ||||
| 1006 | |||||
| 1007 | =head1 EXAMPLES | ||||
| 1008 | |||||
| 1009 | The first simple example shows how you might strip out comments from | ||||
| 1010 | an HTML document. We achieve this by setting up a comment handler that | ||||
| 1011 | does nothing and a default handler that will print out anything else: | ||||
| 1012 | |||||
| 1013 | use HTML::Parser; | ||||
| 1014 | HTML::Parser->new(default_h => [sub { print shift }, 'text'], | ||||
| 1015 | comment_h => [""], | ||||
| 1016 | )->parse_file(shift || die) || die $!; | ||||
| 1017 | |||||
| 1018 | An alternative implementation is: | ||||
| 1019 | |||||
| 1020 | use HTML::Parser; | ||||
| 1021 | HTML::Parser->new(end_document_h => [sub { print shift }, | ||||
| 1022 | 'skipped_text'], | ||||
| 1023 | comment_h => [""], | ||||
| 1024 | )->parse_file(shift || die) || die $!; | ||||
| 1025 | |||||
| 1026 | This will in most cases be much more efficient since only a single | ||||
| 1027 | callback will be made. | ||||
| 1028 | |||||
| 1029 | The next example prints out the text that is inside the <title> | ||||
| 1030 | element of an HTML document. Here we start by setting up a start | ||||
| 1031 | handler. When it sees the title start tag it enables a text handler | ||||
| 1032 | that prints any text found and an end handler that will terminate | ||||
| 1033 | parsing as soon as the title end tag is seen: | ||||
| 1034 | |||||
| 1035 | use HTML::Parser (); | ||||
| 1036 | |||||
| 1037 | sub start_handler | ||||
| 1038 | { | ||||
| 1039 | return if shift ne "title"; | ||||
| 1040 | my $self = shift; | ||||
| 1041 | $self->handler(text => sub { print shift }, "dtext"); | ||||
| 1042 | $self->handler(end => sub { shift->eof if shift eq "title"; }, | ||||
| 1043 | "tagname,self"); | ||||
| 1044 | } | ||||
| 1045 | |||||
| 1046 | my $p = HTML::Parser->new(api_version => 3); | ||||
| 1047 | $p->handler( start => \&start_handler, "tagname,self"); | ||||
| 1048 | $p->parse_file(shift || die) || die $!; | ||||
| 1049 | print "\n"; | ||||
| 1050 | |||||
| 1051 | More examples are found in the F<eg/> directory of the C<HTML-Parser> | ||||
| 1052 | distribution: the program C<hrefsub> shows how you can edit all links | ||||
| 1053 | found in a document; the program C<htextsub> shows how to edit the text only; the | ||||
| 1054 | program C<hstrip> shows how you can strip out certain tags/elements | ||||
| 1055 | and/or attributes; and the program C<htext> shows how to obtain the | ||||
| 1056 | plain text, but not any script/style content. | ||||
| 1057 | |||||
| 1058 | You can browse the F<eg/> directory online from the I<[Browse]> link on | ||||
| 1059 | the http://search.cpan.org/~gaas/HTML-Parser/ page. | ||||
| 1060 | |||||
| 1061 | =head1 BUGS | ||||
| 1062 | |||||
| 1063 | The <style> and <script> sections do not end with the first "</", but | ||||
| 1064 | need the complete corresponding end tag. The standard behaviour is | ||||
| 1065 | not really practical. | ||||
| 1066 | |||||
| 1067 | When the I<strict_comment> option is enabled, we still recognize | ||||
| 1068 | comments where there is something other than whitespace between even | ||||
| 1069 | and odd "--" markers. | ||||
| 1070 | |||||
| 1071 | Once $p->boolean_attribute_value has been set, there is no way to | ||||
| 1072 | restore the default behaviour. | ||||
| 1073 | |||||
| 1074 | There is currently no way to get both quote characters | ||||
| 1075 | into the same literal argspec. | ||||
| 1076 | |||||
| 1077 | Empty tags, e.g. "<>" and "</>", are not recognized. SGML allows them | ||||
| 1078 | to repeat the previous start tag or close the previous start tag | ||||
| 1079 | respectively. | ||||
| 1080 | |||||
| 1081 | NET tags, e.g. "<code/.../" are not recognized. This is SGML | ||||
| 1082 | shorthand for "<code>...</code>". | ||||
| 1083 | |||||
| 1084 | Unclosed start or end tags, e.g. "<tt<b>...</b</tt>" are not | ||||
| 1085 | recognized. | ||||
| 1086 | |||||
| 1087 | =head1 DIAGNOSTICS | ||||
| 1088 | |||||
| 1089 | The following messages may be produced by HTML::Parser. The notation | ||||
| 1090 | in this listing is the same as used in L<perldiag>: | ||||
| 1091 | |||||
| 1092 | =over | ||||
| 1093 | |||||
| 1094 | =item Not a reference to a hash | ||||
| 1095 | |||||
| 1096 | (F) The object blessed into or subclassed from HTML::Parser is not a | ||||
| 1097 | hash as required by the HTML::Parser methods. | ||||
| 1098 | |||||
| 1099 | =item Bad signature in parser state object at %p | ||||
| 1100 | |||||
| 1101 | (F) The _hparser_xs_state element does not refer to a valid state structure. | ||||
| 1102 | Something must have changed the internal value | ||||
| 1103 | stored in this hash element, or the memory has been overwritten. | ||||
| 1104 | |||||
| 1105 | =item _hparser_xs_state element is not a reference | ||||
| 1106 | |||||
| 1107 | (F) The _hparser_xs_state element has been destroyed. | ||||
| 1108 | |||||
| 1109 | =item Can't find '_hparser_xs_state' element in HTML::Parser hash | ||||
| 1110 | |||||
| 1111 | (F) The _hparser_xs_state element is missing from the parser hash. | ||||
| 1112 | It was either deleted, or not created when the object was created. | ||||
| 1113 | |||||
| 1114 | =item API version %s not supported by HTML::Parser %s | ||||
| 1115 | |||||
| 1116 | (F) The constructor option 'api_version' with an argument greater than | ||||
| 1117 | or equal to 4 is reserved for future extensions. | ||||
| 1118 | |||||
| 1119 | =item Bad constructor option '%s' | ||||
| 1120 | |||||
| 1121 | (F) An unknown constructor option key was passed to the new() or | ||||
| 1122 | init() methods. | ||||
| 1123 | |||||
| 1124 | =item Parse loop not allowed | ||||
| 1125 | |||||
| 1126 | (F) A handler invoked the parse() or parse_file() method. | ||||
| 1127 | This is not permitted. | ||||
| 1128 | |||||
| 1129 | =item marked sections not supported | ||||
| 1130 | |||||
| 1131 | (F) The $p->marked_sections() method was invoked in an HTML::Parser | ||||
| 1132 | module that was compiled without support for marked sections. | ||||
| 1133 | |||||
| 1134 | =item Unknown boolean attribute (%d) | ||||
| 1135 | |||||
| 1136 | (F) Something is wrong with the internal logic that set up aliases for | ||||
| 1137 | boolean attributes. | ||||
| 1138 | |||||
| 1139 | =item Only code or array references allowed as handler | ||||
| 1140 | |||||
| 1141 | (F) The second argument for $p->handler must be either a subroutine | ||||
| 1142 | reference, the name of a subroutine or method, or a reference to an | ||||
| 1143 | array. | ||||
| 1144 | |||||
| 1145 | =item No handler for %s events | ||||
| 1146 | |||||
| 1147 | (F) The first argument to $p->handler must be a valid event name; i.e. one | ||||
| 1148 | of "start", "end", "text", "process", "declaration" or "comment". | ||||
| 1149 | |||||
| 1150 | =item Unrecognized identifier %s in argspec | ||||
| 1151 | |||||
| 1152 | (F) The identifier is not a known argspec name. | ||||
| 1153 | Use one of the names mentioned in the argspec section above. | ||||
| 1154 | |||||
| 1155 | =item Literal string is longer than 255 chars in argspec | ||||
| 1156 | |||||
| 1157 | (F) The current implementation limits the length of literals in | ||||
| 1158 | an argspec to 255 characters. Make the literal shorter. | ||||
| 1159 | |||||
| 1160 | =item Backslash reserved for literal string in argspec | ||||
| 1161 | |||||
| 1162 | (F) The backslash character "\" is not allowed in argspec literals. | ||||
| 1163 | It is reserved to permit quoting inside a literal in a later version. | ||||
| 1164 | |||||
| 1165 | =item Unterminated literal string in argspec | ||||
| 1166 | |||||
| 1167 | (F) The terminating quote character for a literal was not found. | ||||
| 1168 | |||||
| 1169 | =item Bad argspec (%s) | ||||
| 1170 | |||||
| 1171 | (F) Only identifier names, literals, spaces and commas | ||||
| 1172 | are allowed in argspecs. | ||||
| 1173 | |||||
| 1174 | =item Missing comma separator in argspec | ||||
| 1175 | |||||
| 1176 | (F) Identifiers in an argspec must be separated with ",". | ||||
| 1177 | |||||
| 1178 | =item Parsing of undecoded UTF-8 will give garbage when decoding entities | ||||
| 1179 | |||||
| 1180 | (W) The first chunk parsed appears to contain undecoded UTF-8 and one | ||||
| 1181 | or more argspecs that decode entities are used for the callback | ||||
| 1182 | handlers. | ||||
| 1183 | |||||
| 1184 | The result of decoding will be a mix of encoded and decoded characters | ||||
| 1185 | for any entities that expand to characters with code above 127. This | ||||
| 1186 | is not a good thing. | ||||
| 1187 | |||||
| 1188 | The solution is to use Encode::decode_utf8() on the data before | ||||
| 1189 | feeding it to $p->parse(). For $p->parse_file() pass a file that | ||||
| 1190 | has been opened in ":utf8" mode. | ||||
| 1191 | |||||
| 1192 | The parser can process raw undecoded UTF-8 sanely if the C<utf8_mode> | ||||
| 1193 | is enabled or if the "attr", "@attr" or "dtext" argspecs are avoided. | ||||
| 1194 | |||||
| 1195 | =item Parsing string decoded with wrong endianess | ||||
| 1196 | |||||
| 1197 | (W) The first character in the document is U+FFFE. This is not a | ||||
| 1198 | legal Unicode character but a byte swapped BOM. The result of parsing | ||||
| 1199 | will likely be garbage. | ||||
| 1200 | |||||
| 1201 | =item Parsing of undecoded UTF-32 | ||||
| 1202 | |||||
| 1203 | (W) The parser found the Unicode UTF-32 BOM signature at the start | ||||
| 1204 | of the document. The result of parsing will likely be garbage. | ||||
| 1205 | |||||
| 1206 | =item Parsing of undecoded UTF-16 | ||||
| 1207 | |||||
| 1208 | (W) The parser found the Unicode UTF-16 BOM signature at the start of | ||||
| 1209 | the document. The result of parsing will likely be garbage. | ||||
| 1210 | |||||
| 1211 | =back | ||||
| 1212 | |||||
| 1213 | =head1 SEE ALSO | ||||
| 1214 | |||||
| 1215 | L<HTML::Entities>, L<HTML::PullParser>, L<HTML::TokeParser>, L<HTML::HeadParser>, | ||||
| 1216 | L<HTML::LinkExtor>, L<HTML::Form> | ||||
| 1217 | |||||
| 1218 | L<HTML::TreeBuilder> (part of the I<HTML-Tree> distribution) | ||||
| 1219 | |||||
| 1220 | http://www.w3.org/TR/html4 | ||||
| 1221 | |||||
| 1222 | More information about marked sections and processing instructions may | ||||
| 1223 | be found at C<http://www.sgml.u-net.com/book/sgml-8.htm>. | ||||
| 1224 | |||||
| 1225 | =head1 COPYRIGHT | ||||
| 1226 | |||||
| 1227 | Copyright 1996-2007 Gisle Aas. All rights reserved. | ||||
| 1228 | Copyright 1999-2000 Michael A. Chase. All rights reserved. | ||||
| 1229 | |||||
| 1230 | This library is free software; you can redistribute it and/or | ||||
| 1231 | modify it under the same terms as Perl itself. | ||||
| 1232 | |||||
| 1233 | =cut | ||||
# spent 6.65ms within HTML::Parser::CORE:match which was called 922 times, avg 7µs/call:
# 922 times (6.65ms+0s) by HTML::Parser::init at line 72 of HTML/Parser.pm, avg 7µs/call | |||||
# spent 7.42ms within HTML::Parser::_alloc_pstate which was called 461 times, avg 16µs/call:
# 461 times (7.42ms+0s) by HTML::Parser::init at line 30 of HTML/Parser.pm, avg 16µs/call | |||||
# spent 48µs within HTML::Parser::eof which was called 7 times, avg 7µs/call:
# 7 times (48µs+0s) by HTML::HeadParser::text at line 216 of HTML/HeadParser.pm, avg 7µs/call | |||||
# spent 25.5ms within HTML::Parser::handler which was called 2766 times, avg 9µs/call:
# 461 times (5.89ms+0s) by HTML::Parser::init at line 42 of HTML/Parser.pm, avg 13µs/call
# 461 times (4.47ms+0s) by HTML::Parser::init at line 45 of HTML/Parser.pm, avg 10µs/call
# 461 times (4.13ms+0s) by HTML::Parser::init at line 43 of HTML/Parser.pm, avg 9µs/call
# 461 times (3.98ms+0s) by HTML::Parser::init at line 44 of HTML/Parser.pm, avg 9µs/call
# 461 times (3.52ms+0s) by HTML::Parser::init at line 54 of HTML/Parser.pm, avg 8µs/call
# 461 times (3.48ms+0s) by HTML::Parser::init at line 60 of HTML/Parser.pm, avg 8µs/call | |||||
# spent 5.15ms within HTML::Parser::ignore_elements which was called 461 times, avg 11µs/call:
# 461 times (5.15ms+0s) by HTML::Parser::init at line 80 of HTML/Parser.pm, avg 11µs/call | |||||
# spent 986µs (283+703) within HTML::Parser::parse which was called 7 times, avg 141µs/call:
# 7 times (283µs+703µs) by LWP::Protocol::collect at line 114 of LWP/Protocol.pm, avg 141µs/call |