| File | /project/perl/lib/HTML/HeadParser.pm |
| Statements Executed | 3289 |
| Statement Execution Time | 38.6ms |
| Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
|---|---|---|---|---|---|
| 461 | 1 | 1 | 34.9ms | 195ms | HTML::HeadParser::new |
| 7 | 1 | 1 | 543µs | 703µs | HTML::HeadParser::text |
| 8 | 2 | 2 | 83µs | 83µs | HTML::HeadParser::CORE:match (opcode) |
| 7 | 1 | 2 | 49µs | 49µs | HTML::HeadParser::CORE:subst (opcode) |
| 0 | 0 | 0 | 0s | 0s | HTML::HeadParser::BEGIN |
| 0 | 0 | 0 | 0s | 0s | HTML::HeadParser::as_string |
| 0 | 0 | 0 | 0s | 0s | HTML::HeadParser::end |
| 0 | 0 | 0 | 0s | 0s | HTML::HeadParser::flush_text |
| 0 | 0 | 0 | 0s | 0s | HTML::HeadParser::header |
| 0 | 0 | 0 | 0s | 0s | HTML::HeadParser::start |
| Line | State ments |
Time on line |
Calls | Time in subs |
Code |
|---|---|---|---|---|---|
| 1 | package HTML::HeadParser; | ||||
| 2 | |||||
| 3 | =head1 NAME | ||||
| 4 | |||||
| 5 | HTML::HeadParser - Parse <HEAD> section of a HTML document | ||||
| 6 | |||||
| 7 | =head1 SYNOPSIS | ||||
| 8 | |||||
| 9 | require HTML::HeadParser; | ||||
| 10 | $p = HTML::HeadParser->new; | ||||
| 11 | $p->parse($text) and print "not finished"; | ||||
| 12 | |||||
| 13 | $p->header('Title') # to access <title>....</title> | ||||
| 14 | $p->header('Content-Base') # to access <base href="http://..."> | ||||
| 15 | $p->header('Foo') # to access <meta http-equiv="Foo" content="..."> | ||||
| 16 | |||||
| 17 | =head1 DESCRIPTION | ||||
| 18 | |||||
| 19 | The C<HTML::HeadParser> is a specialized (and lightweight) | ||||
| 20 | C<HTML::Parser> that will only parse the E<lt>HEAD>...E<lt>/HEAD> | ||||
| 21 | section of an HTML document. The parse() method | ||||
| 22 | will return a FALSE value as soon as some E<lt>BODY> element or body | ||||
| 23 | text are found, and should not be called again after this. | ||||
| 24 | |||||
| 25 | Note that the C<HTML::HeadParser> might get confused if raw undecoded | ||||
| 26 | UTF-8 is passed to the parse() method. Make sure the strings are | ||||
| 27 | properly decoded before passing them on. | ||||
| 28 | |||||
| 29 | The C<HTML::HeadParser> keeps a reference to a header object, and the | ||||
| 30 | parser will update this header object as the various elements of the | ||||
| 31 | E<lt>HEAD> section of the HTML document are recognized. The following | ||||
| 32 | header fields are affected: | ||||
| 33 | |||||
| 34 | =over 4 | ||||
| 35 | |||||
| 36 | =item Content-Base: | ||||
| 37 | |||||
| 38 | The I<Content-Base> header is initialized from the E<lt>base | ||||
| 39 | href="..."> element. | ||||
| 40 | |||||
| 41 | =item Title: | ||||
| 42 | |||||
| 43 | The I<Title> header is initialized from the E<lt>title>...E<lt>/title> | ||||
| 44 | element. | ||||
| 45 | |||||
| 46 | =item Isindex: | ||||
| 47 | |||||
| 48 | The I<Isindex> header will be added if there is a E<lt>isindex> | ||||
| 49 | element in the E<lt>head>. The header value is initialized from the | ||||
| 50 | I<prompt> attribute if it is present. If no I<prompt> attribute is | ||||
| 51 | given it will have '?' as the value. | ||||
| 52 | |||||
| 53 | =item X-Meta-Foo: | ||||
| 54 | |||||
| 55 | All E<lt>meta> elements will initialize headers with the prefix | ||||
| 56 | "C<X-Meta->" on the name. If the E<lt>meta> element contains a | ||||
| 57 | C<http-equiv> attribute, then it will be honored as the header name. | ||||
| 58 | |||||
| 59 | =back | ||||
| 60 | |||||
| 61 | =head1 METHODS | ||||
| 62 | |||||
| 63 | The following methods (in addition to those provided by the | ||||
| 64 | superclass) are available: | ||||
| 65 | |||||
| 66 | =over 4 | ||||
| 67 | |||||
| 68 | =cut | ||||
| 69 | |||||
| 70 | |||||
| 71 | 1 | 6µs | require HTML::Parser; | ||
| 72 | 1 | 10µs | @ISA = qw(HTML::Parser); | ||
| 73 | |||||
| 74 | 3 | 660µs | use HTML::Entities (); | ||
| 75 | |||||
| 76 | 3 | 102µs | 1 | 26µs | use strict; # spent 26µs making 1 call to strict::import |
| 77 | 3 | 1.60ms | 1 | 210µs | use vars qw($VERSION $DEBUG); # spent 210µs making 1 call to vars::import |
| 78 | #$DEBUG = 1; | ||||
| 79 | 1 | 62µs | 1 | 20µs | $VERSION = sprintf("%d.%02d", q$Revision: 2.22 $ =~ /(\d+)\.(\d+)/); # spent 20µs making 1 call to HTML::HeadParser::CORE:match |
| 80 | |||||
| 81 | =item $hp = HTML::HeadParser->new | ||||
| 82 | |||||
| 83 | =item $hp = HTML::HeadParser->new( $header ) | ||||
| 84 | |||||
| 85 | The object constructor. The optional $header argument should be a | ||||
| 86 | reference to an object that implement the header() and push_header() | ||||
| 87 | methods as defined by the C<HTTP::Headers> class. Normally it will be | ||||
| 88 | of some class that isa or delegates to the C<HTTP::Headers> class. | ||||
| 89 | |||||
| 90 | If no $header is given C<HTML::HeadParser> will create an | ||||
| 91 | C<HTTP::Header> object by itself (initially empty). | ||||
| 92 | |||||
| 93 | =cut | ||||
| 94 | |||||
| 95 | sub new | ||||
| 96 | # spent 195ms (34.9+160) within HTML::HeadParser::new which was called 461 times, avg 423µs/call:
# 461 times (34.9ms+160ms) by LWP::Protocol::collect at line 106 of LWP/Protocol.pm, avg 423µs/call | ||||
| 97 | 3227 | 35.4ms | my($class, $header) = @_; | ||
| 98 | unless ($header) { | ||||
| 99 | require HTTP::Headers; | ||||
| 100 | $header = HTTP::Headers->new; | ||||
| 101 | } | ||||
| 102 | |||||
| 103 | my $self = $class->SUPER::new(api_version => 2, # spent 160ms making 461 calls to HTML::Parser::new, avg 347µs/call | ||||
| 104 | ignore_elements => [qw(script style)], | ||||
| 105 | ); | ||||
| 106 | $self->{'header'} = $header; | ||||
| 107 | $self->{'tag'} = ''; # name of active element that takes textual content | ||||
| 108 | $self->{'text'} = ''; # the accumulated text associated with the element | ||||
| 109 | $self; | ||||
| 110 | } | ||||
| 111 | |||||
| 112 | =item $hp->header; | ||||
| 113 | |||||
| 114 | Returns a reference to the header object. | ||||
| 115 | |||||
| 116 | =item $hp->header( $key ) | ||||
| 117 | |||||
| 118 | Returns a header value. It is just a shorter way to write | ||||
| 119 | C<$hp-E<gt>header-E<gt>header($key)>. | ||||
| 120 | |||||
| 121 | =cut | ||||
| 122 | |||||
| 123 | sub header | ||||
| 124 | { | ||||
| 125 | my $self = shift; | ||||
| 126 | return $self->{'header'} unless @_; | ||||
| 127 | $self->{'header'}->header(@_); | ||||
| 128 | } | ||||
| 129 | |||||
| 130 | sub as_string # legacy | ||||
| 131 | { | ||||
| 132 | my $self = shift; | ||||
| 133 | $self->{'header'}->as_string; | ||||
| 134 | } | ||||
| 135 | |||||
| 136 | sub flush_text # internal | ||||
| 137 | { | ||||
| 138 | my $self = shift; | ||||
| 139 | my $tag = $self->{'tag'}; | ||||
| 140 | my $text = $self->{'text'}; | ||||
| 141 | $text =~ s/^\s+//; | ||||
| 142 | $text =~ s/\s+$//; | ||||
| 143 | $text =~ s/\s+/ /g; | ||||
| 144 | print "FLUSH $tag => '$text'\n" if $DEBUG; | ||||
| 145 | if ($tag eq 'title') { | ||||
| 146 | HTML::Entities::decode($text); | ||||
| 147 | $self->{'header'}->push_header(Title => $text); | ||||
| 148 | } | ||||
| 149 | $self->{'tag'} = $self->{'text'} = ''; | ||||
| 150 | } | ||||
| 151 | |||||
| 152 | # This is an quote from the HTML3.2 DTD which shows which elements | ||||
| 153 | # that might be present in a <HEAD>...</HEAD>. Also note that the | ||||
| 154 | # <HEAD> tags themselves might be missing: | ||||
| 155 | # | ||||
| 156 | # <!ENTITY % head.content "TITLE & ISINDEX? & BASE? & STYLE? & | ||||
| 157 | # SCRIPT* & META* & LINK*"> | ||||
| 158 | # | ||||
| 159 | # <!ELEMENT HEAD O O (%head.content)> | ||||
| 160 | |||||
| 161 | |||||
| 162 | sub start | ||||
| 163 | { | ||||
| 164 | my($self, $tag, $attr) = @_; # $attr is reference to a HASH | ||||
| 165 | print "START[$tag]\n" if $DEBUG; | ||||
| 166 | $self->flush_text if $self->{'tag'}; | ||||
| 167 | if ($tag eq 'meta') { | ||||
| 168 | my $key = $attr->{'http-equiv'}; | ||||
| 169 | if (!defined($key) || !length($key)) { | ||||
| 170 | return unless $attr->{'name'}; | ||||
| 171 | $key = "X-Meta-\u$attr->{'name'}"; | ||||
| 172 | } | ||||
| 173 | $self->{'header'}->push_header($key => $attr->{content}); | ||||
| 174 | } elsif ($tag eq 'base') { | ||||
| 175 | return unless exists $attr->{href}; | ||||
| 176 | $self->{'header'}->push_header('Content-Base' => $attr->{href}); | ||||
| 177 | } elsif ($tag eq 'isindex') { | ||||
| 178 | # This is a non-standard header. Perhaps we should just ignore | ||||
| 179 | # this element | ||||
| 180 | $self->{'header'}->push_header(Isindex => $attr->{prompt} || '?'); | ||||
| 181 | } elsif ($tag =~ /^(?:title|script|style)$/) { | ||||
| 182 | # Just remember tag. Initialize header when we see the end tag. | ||||
| 183 | $self->{'tag'} = $tag; | ||||
| 184 | } elsif ($tag eq 'link') { | ||||
| 185 | return unless exists $attr->{href}; | ||||
| 186 | # <link href="http:..." rel="xxx" rev="xxx" title="xxx"> | ||||
| 187 | my $h_val = "<" . delete($attr->{href}) . ">"; | ||||
| 188 | for (sort keys %{$attr}) { | ||||
| 189 | $h_val .= qq(; $_="$attr->{$_}"); | ||||
| 190 | } | ||||
| 191 | $self->{'header'}->push_header(Link => $h_val); | ||||
| 192 | } elsif ($tag eq 'head' || $tag eq 'html') { | ||||
| 193 | # ignore | ||||
| 194 | } else { | ||||
| 195 | # stop parsing | ||||
| 196 | $self->eof; | ||||
| 197 | } | ||||
| 198 | } | ||||
| 199 | |||||
| 200 | sub end | ||||
| 201 | { | ||||
| 202 | my($self, $tag) = @_; | ||||
| 203 | print "END[$tag]\n" if $DEBUG; | ||||
| 204 | $self->flush_text if $self->{'tag'}; | ||||
| 205 | $self->eof if $tag eq 'head'; | ||||
| 206 | } | ||||
| 207 | |||||
| 208 | sub text | ||||
| 209 | # spent 703µs (543+160) within HTML::HeadParser::text which was called 7 times, avg 100µs/call:
# 7 times (543µs+160µs) by HTML::Parser::parse at line 114 of LWP/Protocol.pm, avg 100µs/call | ||||
| 210 | 49 | 710µs | my($self, $text) = @_; | ||
| 211 | $text =~ s/\x{FEFF}//; # drop Unicode BOM if found # spent 49µs making 7 calls to HTML::HeadParser::CORE:subst, avg 7µs/call | ||||
| 212 | print "TEXT[$text]\n" if $DEBUG; | ||||
| 213 | my $tag = $self->{tag}; | ||||
| 214 | if (!$tag && $text =~ /\S/) { # spent 63µs making 7 calls to HTML::HeadParser::CORE:match, avg 9µs/call | ||||
| 215 | # Normal text means start of body | ||||
| 216 | $self->eof; # spent 48µs making 7 calls to HTML::Parser::eof, avg 7µs/call | ||||
| 217 | return; | ||||
| 218 | } | ||||
| 219 | return if $tag ne 'title'; | ||||
| 220 | $self->{'text'} .= $text; | ||||
| 221 | } | ||||
| 222 | |||||
| 223 | 1 | 18µs | 1; | ||
| 224 | |||||
| 225 | __END__ | ||||
| 226 | |||||
| 227 | =back | ||||
| 228 | |||||
| 229 | =head1 EXAMPLE | ||||
| 230 | |||||
| 231 | $h = HTTP::Headers->new; | ||||
| 232 | $p = HTML::HeadParser->new($h); | ||||
| 233 | $p->parse(<<EOT); | ||||
| 234 | <title>Stupid example</title> | ||||
| 235 | <base href="http://www.linpro.no/lwp/"> | ||||
| 236 | Normal text starts here. | ||||
| 237 | EOT | ||||
| 238 | undef $p; | ||||
| 239 | print $h->title; # should print "Stupid example" | ||||
| 240 | |||||
| 241 | =head1 SEE ALSO | ||||
| 242 | |||||
| 243 | L<HTML::Parser>, L<HTTP::Headers> | ||||
| 244 | |||||
| 245 | The C<HTTP::Headers> class is distributed as part of the | ||||
| 246 | I<libwww-perl> package. If you don't have that distribution installed | ||||
| 247 | you need to provide the $header argument to the C<HTML::HeadParser> | ||||
| 248 | constructor with your own object that implements the documented | ||||
| 249 | protocol. | ||||
| 250 | |||||
| 251 | =head1 COPYRIGHT | ||||
| 252 | |||||
| 253 | Copyright 1996-2001 Gisle Aas. All rights reserved. | ||||
| 254 | |||||
| 255 | This library is free software; you can redistribute it and/or | ||||
| 256 | modify it under the same terms as Perl itself. | ||||
| 257 | |||||
| 258 | =cut | ||||
| 259 | |||||
# spent 83µs within HTML::HeadParser::CORE:match which was called 8 times, avg 10µs/call:
# 7 times (63µs+0s) by HTML::HeadParser::text at line 214 of HTML/HeadParser.pm, avg 9µs/call
# once (20µs+0s) by LWP::Protocol::collect at line 79 of HTML/HeadParser.pm | |||||
# spent 49µs within HTML::HeadParser::CORE:subst which was called 7 times, avg 7µs/call:
# 7 times (49µs+0s) by HTML::HeadParser::text at line 211 of HTML/HeadParser.pm, avg 7µs/call |