File | /project/perl/lib/HTML/HeadParser.pm |
Statements Executed | 3289 |
Statement Execution Time | 38.6ms |
Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
---|---|---|---|---|---|
461 | 1 | 1 | 34.9ms | 195ms | new | HTML::HeadParser::
7 | 1 | 1 | 543µs | 703µs | text | HTML::HeadParser::
8 | 2 | 2 | 83µs | 83µs | CORE:match (opcode) | HTML::HeadParser::
7 | 1 | 2 | 49µs | 49µs | CORE:subst (opcode) | HTML::HeadParser::
0 | 0 | 0 | 0s | 0s | BEGIN | HTML::HeadParser::
0 | 0 | 0 | 0s | 0s | as_string | HTML::HeadParser::
0 | 0 | 0 | 0s | 0s | end | HTML::HeadParser::
0 | 0 | 0 | 0s | 0s | flush_text | HTML::HeadParser::
0 | 0 | 0 | 0s | 0s | header | HTML::HeadParser::
0 | 0 | 0 | 0s | 0s | start | HTML::HeadParser::
Line | State ments |
Time on line |
Calls | Time in subs |
Code |
---|---|---|---|---|---|
1 | package HTML::HeadParser; | ||||
2 | |||||
3 | =head1 NAME | ||||
4 | |||||
5 | HTML::HeadParser - Parse <HEAD> section of a HTML document | ||||
6 | |||||
7 | =head1 SYNOPSIS | ||||
8 | |||||
9 | require HTML::HeadParser; | ||||
10 | $p = HTML::HeadParser->new; | ||||
11 | $p->parse($text) and print "not finished"; | ||||
12 | |||||
13 | $p->header('Title') # to access <title>....</title> | ||||
14 | $p->header('Content-Base') # to access <base href="http://..."> | ||||
15 | $p->header('Foo') # to access <meta http-equiv="Foo" content="..."> | ||||
16 | |||||
17 | =head1 DESCRIPTION | ||||
18 | |||||
19 | The C<HTML::HeadParser> is a specialized (and lightweight) | ||||
20 | C<HTML::Parser> that will only parse the E<lt>HEAD>...E<lt>/HEAD> | ||||
21 | section of an HTML document. The parse() method | ||||
22 | will return a FALSE value as soon as some E<lt>BODY> element or body | ||||
23 | text are found, and should not be called again after this. | ||||
24 | |||||
25 | Note that the C<HTML::HeadParser> might get confused if raw undecoded | ||||
26 | UTF-8 is passed to the parse() method. Make sure the strings are | ||||
27 | properly decoded before passing them on. | ||||
28 | |||||
29 | The C<HTML::HeadParser> keeps a reference to a header object, and the | ||||
30 | parser will update this header object as the various elements of the | ||||
31 | E<lt>HEAD> section of the HTML document are recognized. The following | ||||
32 | header fields are affected: | ||||
33 | |||||
34 | =over 4 | ||||
35 | |||||
36 | =item Content-Base: | ||||
37 | |||||
38 | The I<Content-Base> header is initialized from the E<lt>base | ||||
39 | href="..."> element. | ||||
40 | |||||
41 | =item Title: | ||||
42 | |||||
43 | The I<Title> header is initialized from the E<lt>title>...E<lt>/title> | ||||
44 | element. | ||||
45 | |||||
46 | =item Isindex: | ||||
47 | |||||
48 | The I<Isindex> header will be added if there is a E<lt>isindex> | ||||
49 | element in the E<lt>head>. The header value is initialized from the | ||||
50 | I<prompt> attribute if it is present. If no I<prompt> attribute is | ||||
51 | given it will have '?' as the value. | ||||
52 | |||||
53 | =item X-Meta-Foo: | ||||
54 | |||||
55 | All E<lt>meta> elements will initialize headers with the prefix | ||||
56 | "C<X-Meta->" on the name. If the E<lt>meta> element contains a | ||||
57 | C<http-equiv> attribute, then it will be honored as the header name. | ||||
58 | |||||
59 | =back | ||||
60 | |||||
61 | =head1 METHODS | ||||
62 | |||||
63 | The following methods (in addition to those provided by the | ||||
64 | superclass) are available: | ||||
65 | |||||
66 | =over 4 | ||||
67 | |||||
68 | =cut | ||||
69 | |||||
70 | |||||
71 | 1 | 6µs | require HTML::Parser; | ||
72 | 1 | 10µs | @ISA = qw(HTML::Parser); | ||
73 | |||||
74 | 3 | 660µs | use HTML::Entities (); | ||
75 | |||||
76 | 3 | 102µs | 1 | 26µs | use strict; # spent 26µs making 1 call to strict::import |
77 | 3 | 1.60ms | 1 | 210µs | use vars qw($VERSION $DEBUG); # spent 210µs making 1 call to vars::import |
78 | #$DEBUG = 1; | ||||
79 | 1 | 62µs | 1 | 20µs | $VERSION = sprintf("%d.%02d", q$Revision: 2.22 $ =~ /(\d+)\.(\d+)/); # spent 20µs making 1 call to HTML::HeadParser::CORE:match |
80 | |||||
81 | =item $hp = HTML::HeadParser->new | ||||
82 | |||||
83 | =item $hp = HTML::HeadParser->new( $header ) | ||||
84 | |||||
85 | The object constructor. The optional $header argument should be a | ||||
86 | reference to an object that implement the header() and push_header() | ||||
87 | methods as defined by the C<HTTP::Headers> class. Normally it will be | ||||
88 | of some class that isa or delegates to the C<HTTP::Headers> class. | ||||
89 | |||||
90 | If no $header is given C<HTML::HeadParser> will create an | ||||
91 | C<HTTP::Header> object by itself (initially empty). | ||||
92 | |||||
93 | =cut | ||||
94 | |||||
95 | sub new | ||||
96 | # spent 195ms (34.9+160) within HTML::HeadParser::new which was called 461 times, avg 423µs/call:
# 461 times (34.9ms+160ms) by LWP::Protocol::collect at line 106 of LWP/Protocol.pm, avg 423µs/call | ||||
97 | 3227 | 35.4ms | my($class, $header) = @_; | ||
98 | unless ($header) { | ||||
99 | require HTTP::Headers; | ||||
100 | $header = HTTP::Headers->new; | ||||
101 | } | ||||
102 | |||||
103 | my $self = $class->SUPER::new(api_version => 2, # spent 160ms making 461 calls to HTML::Parser::new, avg 347µs/call | ||||
104 | ignore_elements => [qw(script style)], | ||||
105 | ); | ||||
106 | $self->{'header'} = $header; | ||||
107 | $self->{'tag'} = ''; # name of active element that takes textual content | ||||
108 | $self->{'text'} = ''; # the accumulated text associated with the element | ||||
109 | $self; | ||||
110 | } | ||||
111 | |||||
112 | =item $hp->header; | ||||
113 | |||||
114 | Returns a reference to the header object. | ||||
115 | |||||
116 | =item $hp->header( $key ) | ||||
117 | |||||
118 | Returns a header value. It is just a shorter way to write | ||||
119 | C<$hp-E<gt>header-E<gt>header($key)>. | ||||
120 | |||||
121 | =cut | ||||
122 | |||||
123 | sub header | ||||
124 | { | ||||
125 | my $self = shift; | ||||
126 | return $self->{'header'} unless @_; | ||||
127 | $self->{'header'}->header(@_); | ||||
128 | } | ||||
129 | |||||
130 | sub as_string # legacy | ||||
131 | { | ||||
132 | my $self = shift; | ||||
133 | $self->{'header'}->as_string; | ||||
134 | } | ||||
135 | |||||
136 | sub flush_text # internal | ||||
137 | { | ||||
138 | my $self = shift; | ||||
139 | my $tag = $self->{'tag'}; | ||||
140 | my $text = $self->{'text'}; | ||||
141 | $text =~ s/^\s+//; | ||||
142 | $text =~ s/\s+$//; | ||||
143 | $text =~ s/\s+/ /g; | ||||
144 | print "FLUSH $tag => '$text'\n" if $DEBUG; | ||||
145 | if ($tag eq 'title') { | ||||
146 | HTML::Entities::decode($text); | ||||
147 | $self->{'header'}->push_header(Title => $text); | ||||
148 | } | ||||
149 | $self->{'tag'} = $self->{'text'} = ''; | ||||
150 | } | ||||
151 | |||||
152 | # This is an quote from the HTML3.2 DTD which shows which elements | ||||
153 | # that might be present in a <HEAD>...</HEAD>. Also note that the | ||||
154 | # <HEAD> tags themselves might be missing: | ||||
155 | # | ||||
156 | # <!ENTITY % head.content "TITLE & ISINDEX? & BASE? & STYLE? & | ||||
157 | # SCRIPT* & META* & LINK*"> | ||||
158 | # | ||||
159 | # <!ELEMENT HEAD O O (%head.content)> | ||||
160 | |||||
161 | |||||
162 | sub start | ||||
163 | { | ||||
164 | my($self, $tag, $attr) = @_; # $attr is reference to a HASH | ||||
165 | print "START[$tag]\n" if $DEBUG; | ||||
166 | $self->flush_text if $self->{'tag'}; | ||||
167 | if ($tag eq 'meta') { | ||||
168 | my $key = $attr->{'http-equiv'}; | ||||
169 | if (!defined($key) || !length($key)) { | ||||
170 | return unless $attr->{'name'}; | ||||
171 | $key = "X-Meta-\u$attr->{'name'}"; | ||||
172 | } | ||||
173 | $self->{'header'}->push_header($key => $attr->{content}); | ||||
174 | } elsif ($tag eq 'base') { | ||||
175 | return unless exists $attr->{href}; | ||||
176 | $self->{'header'}->push_header('Content-Base' => $attr->{href}); | ||||
177 | } elsif ($tag eq 'isindex') { | ||||
178 | # This is a non-standard header. Perhaps we should just ignore | ||||
179 | # this element | ||||
180 | $self->{'header'}->push_header(Isindex => $attr->{prompt} || '?'); | ||||
181 | } elsif ($tag =~ /^(?:title|script|style)$/) { | ||||
182 | # Just remember tag. Initialize header when we see the end tag. | ||||
183 | $self->{'tag'} = $tag; | ||||
184 | } elsif ($tag eq 'link') { | ||||
185 | return unless exists $attr->{href}; | ||||
186 | # <link href="http:..." rel="xxx" rev="xxx" title="xxx"> | ||||
187 | my $h_val = "<" . delete($attr->{href}) . ">"; | ||||
188 | for (sort keys %{$attr}) { | ||||
189 | $h_val .= qq(; $_="$attr->{$_}"); | ||||
190 | } | ||||
191 | $self->{'header'}->push_header(Link => $h_val); | ||||
192 | } elsif ($tag eq 'head' || $tag eq 'html') { | ||||
193 | # ignore | ||||
194 | } else { | ||||
195 | # stop parsing | ||||
196 | $self->eof; | ||||
197 | } | ||||
198 | } | ||||
199 | |||||
200 | sub end | ||||
201 | { | ||||
202 | my($self, $tag) = @_; | ||||
203 | print "END[$tag]\n" if $DEBUG; | ||||
204 | $self->flush_text if $self->{'tag'}; | ||||
205 | $self->eof if $tag eq 'head'; | ||||
206 | } | ||||
207 | |||||
208 | sub text | ||||
209 | # spent 703µs (543+160) within HTML::HeadParser::text which was called 7 times, avg 100µs/call:
# 7 times (543µs+160µs) by HTML::Parser::parse at line 114 of LWP/Protocol.pm, avg 100µs/call | ||||
210 | 35 | 439µs | my($self, $text) = @_; | ||
211 | $text =~ s/\x{FEFF}//; # drop Unicode BOM if found # spent 49µs making 7 calls to HTML::HeadParser::CORE:subst, avg 7µs/call | ||||
212 | print "TEXT[$text]\n" if $DEBUG; | ||||
213 | my $tag = $self->{tag}; | ||||
214 | 14 | 271µs | 7 | 63µs | if (!$tag && $text =~ /\S/) { # spent 63µs making 7 calls to HTML::HeadParser::CORE:match, avg 9µs/call |
215 | # Normal text means start of body | ||||
216 | $self->eof; # spent 48µs making 7 calls to HTML::Parser::eof, avg 7µs/call | ||||
217 | return; | ||||
218 | } | ||||
219 | return if $tag ne 'title'; | ||||
220 | $self->{'text'} .= $text; | ||||
221 | } | ||||
222 | |||||
223 | 1 | 18µs | 1; | ||
224 | |||||
225 | __END__ | ||||
226 | |||||
227 | =back | ||||
228 | |||||
229 | =head1 EXAMPLE | ||||
230 | |||||
231 | $h = HTTP::Headers->new; | ||||
232 | $p = HTML::HeadParser->new($h); | ||||
233 | $p->parse(<<EOT); | ||||
234 | <title>Stupid example</title> | ||||
235 | <base href="http://www.linpro.no/lwp/"> | ||||
236 | Normal text starts here. | ||||
237 | EOT | ||||
238 | undef $p; | ||||
239 | print $h->title; # should print "Stupid example" | ||||
240 | |||||
241 | =head1 SEE ALSO | ||||
242 | |||||
243 | L<HTML::Parser>, L<HTTP::Headers> | ||||
244 | |||||
245 | The C<HTTP::Headers> class is distributed as part of the | ||||
246 | I<libwww-perl> package. If you don't have that distribution installed | ||||
247 | you need to provide the $header argument to the C<HTML::HeadParser> | ||||
248 | constructor with your own object that implements the documented | ||||
249 | protocol. | ||||
250 | |||||
251 | =head1 COPYRIGHT | ||||
252 | |||||
253 | Copyright 1996-2001 Gisle Aas. All rights reserved. | ||||
254 | |||||
255 | This library is free software; you can redistribute it and/or | ||||
256 | modify it under the same terms as Perl itself. | ||||
257 | |||||
258 | =cut | ||||
259 | |||||
# spent 83µs within HTML::HeadParser::CORE:match which was called 8 times, avg 10µs/call:
# 7 times (63µs+0s) by HTML::HeadParser::text at line 214 of HTML/HeadParser.pm, avg 9µs/call
# once (20µs+0s) by LWP::Protocol::collect at line 79 of HTML/HeadParser.pm | |||||
# spent 49µs within HTML::HeadParser::CORE:subst which was called 7 times, avg 7µs/call:
# 7 times (49µs+0s) by HTML::HeadParser::text at line 211 of HTML/HeadParser.pm, avg 7µs/call |