Profile of /project/perl/lib/HTML/HeadParser.pm

File	/project/perl/lib/HTML/HeadParser.pm
Statements Executed	3289
Statement Execution Time	38.6ms

Subroutines — ordered by exclusive time
Calls	P	F	Exclusive Time	Inclusive Time	Subroutine
461	1	1	34.9ms	195ms	HTML::HeadParser::::newHTML::HeadParser::new
7	1	1	543µs	703µs	HTML::HeadParser::::textHTML::HeadParser::text
8	2	2	83µs	83µs	HTML::HeadParser::::CORE:matchHTML::HeadParser::CORE:match (opcode)
7	1	2	49µs	49µs	HTML::HeadParser::::CORE:substHTML::HeadParser::CORE:subst (opcode)
0	0	0	0s	0s	HTML::HeadParser::::BEGINHTML::HeadParser::BEGIN
0	0	0	0s	0s	HTML::HeadParser::::as_stringHTML::HeadParser::as_string
0	0	0	0s	0s	HTML::HeadParser::::endHTML::HeadParser::end
0	0	0	0s	0s	HTML::HeadParser::::flush_textHTML::HeadParser::flush_text
0	0	0	0s	0s	HTML::HeadParser::::headerHTML::HeadParser::header
0	0	0	0s	0s	HTML::HeadParser::::startHTML::HeadParser::start

Call graph for these subroutines as a Graphviz dot language file.

Line	State ments	Time on line	Calls	Time in subs	Code
1					package HTML::HeadParser;
2
3					=head1 NAME
4
5					HTML::HeadParser - Parse <HEAD> section of a HTML document
6
7					=head1 SYNOPSIS
8
9					require HTML::HeadParser;
10					$p = HTML::HeadParser->new;
11					$p->parse($text) and print "not finished";
12
13					$p->header('Title') # to access <title>....</title>
14					$p->header('Content-Base') # to access <base href="http://...">
15					$p->header('Foo') # to access <meta http-equiv="Foo" content="...">
16
17					=head1 DESCRIPTION
18
19					The C<HTML::HeadParser> is a specialized (and lightweight)
20					C<HTML::Parser> that will only parse the E<lt>HEAD>...E<lt>/HEAD>
21					section of an HTML document. The parse() method
22					will return a FALSE value as soon as some E<lt>BODY> element or body
23					text are found, and should not be called again after this.
24
25					Note that the C<HTML::HeadParser> might get confused if raw undecoded
26					UTF-8 is passed to the parse() method. Make sure the strings are
27					properly decoded before passing them on.
28
29					The C<HTML::HeadParser> keeps a reference to a header object, and the
30					parser will update this header object as the various elements of the
31					E<lt>HEAD> section of the HTML document are recognized. The following
32					header fields are affected:
33
34					=over 4
35
36					=item Content-Base:
37
38					The I<Content-Base> header is initialized from the E<lt>base
39					href="..."> element.
40
41					=item Title:
42
43					The I<Title> header is initialized from the E<lt>title>...E<lt>/title>
44					element.
45
46					=item Isindex:
47
48					The I<Isindex> header will be added if there is a E<lt>isindex>
49					element in the E<lt>head>. The header value is initialized from the
50					I<prompt> attribute if it is present. If no I<prompt> attribute is
51					given it will have '?' as the value.
52
53					=item X-Meta-Foo:
54
55					All E<lt>meta> elements will initialize headers with the prefix
56					"C<X-Meta->" on the name. If the E<lt>meta> element contains a
57					C<http-equiv> attribute, then it will be honored as the header name.
58
59					=back
60
61					=head1 METHODS
62
63					The following methods (in addition to those provided by the
64					superclass) are available:
65
66					=over 4
67
68					=cut
69
70
71	1	6µs			require HTML::Parser;
72	1	10µs			@ISA = qw(HTML::Parser);
73
74	3	660µs			use HTML::Entities ();
75
76	3	102µs	1	26µs	use strict; # spent 26µs making 1 call to strict::import
77	3	1.60ms	1	210µs	use vars qw($VERSION $DEBUG); # spent 210µs making 1 call to vars::import
78					#$DEBUG = 1;
79	1	62µs	1	20µs	$VERSION = sprintf("%d.%02d", q$Revision: 2.22 $ =~ /(\d+)\.(\d+)/); # spent 20µs making 1 call to HTML::HeadParser::CORE:match
80
81					=item $hp = HTML::HeadParser->new
82
83					=item $hp = HTML::HeadParser->new( $header )
84
85					The object constructor. The optional $header argument should be a
86					reference to an object that implement the header() and push_header()
87					methods as defined by the C<HTTP::Headers> class. Normally it will be
88					of some class that isa or delegates to the C<HTTP::Headers> class.
89
90					If no $header is given C<HTML::HeadParser> will create an
91					C<HTTP::Header> object by itself (initially empty).
92
93					=cut
94
95					sub new
96					# spent 195ms (34.9+160) within HTML::HeadParser::new which was called 461 times, avg 423µs/call: # 461 times (34.9ms+160ms) by LWP::Protocol::collect at line 106 of LWP/Protocol.pm, avg 423µs/call {
97	3227	35.4ms			my($class, $header) = @_;
98					unless ($header) {
99					require HTTP::Headers;
100					$header = HTTP::Headers->new;
101					}
102
103					my $self = $class->SUPER::new(api_version => 2, # spent 160ms making 461 calls to HTML::Parser::new, avg 347µs/call
104					ignore_elements => [qw(script style)],
105					);
106					$self->{'header'} = $header;
107					$self->{'tag'} = ''; # name of active element that takes textual content
108					$self->{'text'} = ''; # the accumulated text associated with the element
109					$self;
110					}
111
112					=item $hp->header;
113
114					Returns a reference to the header object.
115
116					=item $hp->header( $key )
117
118					Returns a header value. It is just a shorter way to write
119					C<$hp-E<gt>header-E<gt>header($key)>.
120
121					=cut
122
123					sub header
124					{
125					my $self = shift;
126					return $self->{'header'} unless @_;
127					$self->{'header'}->header(@_);
128					}
129
130					sub as_string # legacy
131					{
132					my $self = shift;
133					$self->{'header'}->as_string;
134					}
135
136					sub flush_text # internal
137					{
138					my $self = shift;
139					my $tag = $self->{'tag'};
140					my $text = $self->{'text'};
141					$text =~ s/^\s+//;
142					$text =~ s/\s+$//;
143					$text =~ s/\s+/ /g;
144					print "FLUSH $tag => '$text'\n" if $DEBUG;
145					if ($tag eq 'title') {
146					HTML::Entities::decode($text);
147					$self->{'header'}->push_header(Title => $text);
148					}
149					$self->{'tag'} = $self->{'text'} = '';
150					}
151
152					# This is an quote from the HTML3.2 DTD which shows which elements
153					# that might be present in a <HEAD>...</HEAD>. Also note that the
154					# <HEAD> tags themselves might be missing:
155					#
156					# <!ENTITY % head.content "TITLE & ISINDEX? & BASE? & STYLE? &
157					# SCRIPT* & META* & LINK*">
158					#
159					# <!ELEMENT HEAD O O (%head.content)>
160
161
162					sub start
163					{
164					my($self, $tag, $attr) = @_; # $attr is reference to a HASH
165					print "START[$tag]\n" if $DEBUG;
166					$self->flush_text if $self->{'tag'};
167					if ($tag eq 'meta') {
168					my $key = $attr->{'http-equiv'};
169					if (!defined($key) \|\| !length($key)) {
170					return unless $attr->{'name'};
171					$key = "X-Meta-\u$attr->{'name'}";
172					}
173					$self->{'header'}->push_header($key => $attr->{content});
174					} elsif ($tag eq 'base') {
175					return unless exists $attr->{href};
176					$self->{'header'}->push_header('Content-Base' => $attr->{href});
177					} elsif ($tag eq 'isindex') {
178					# This is a non-standard header. Perhaps we should just ignore
179					# this element
180					$self->{'header'}->push_header(Isindex => $attr->{prompt} \|\| '?');
181					} elsif ($tag =~ /^(?:title\|script\|style)$/) {
182					# Just remember tag. Initialize header when we see the end tag.
183					$self->{'tag'} = $tag;
184					} elsif ($tag eq 'link') {
185					return unless exists $attr->{href};
186					# <link href="http:..." rel="xxx" rev="xxx" title="xxx">
187					my $h_val = "<" . delete($attr->{href}) . ">";
188					for (sort keys %{$attr}) {
189					$h_val .= qq(; $_="$attr->{$_}");
190					}
191					$self->{'header'}->push_header(Link => $h_val);
192					} elsif ($tag eq 'head' \|\| $tag eq 'html') {
193					# ignore
194					} else {
195					# stop parsing
196					$self->eof;
197					}
198					}
199
200					sub end
201					{
202					my($self, $tag) = @_;
203					print "END[$tag]\n" if $DEBUG;
204					$self->flush_text if $self->{'tag'};
205					$self->eof if $tag eq 'head';
206					}
207
208					sub text
209					# spent 703µs (543+160) within HTML::HeadParser::text which was called 7 times, avg 100µs/call: # 7 times (543µs+160µs) by HTML::Parser::parse at line 114 of LWP/Protocol.pm, avg 100µs/call {
210	35	439µs			my($self, $text) = @_;
211					$text =~ s/\x{FEFF}//; # drop Unicode BOM if found # spent 49µs making 7 calls to HTML::HeadParser::CORE:subst, avg 7µs/call
212					print "TEXT[$text]\n" if $DEBUG;
213					my $tag = $self->{tag};
214	14	271µs	7	63µs	if (!$tag && $text =~ /\S/) { # spent 63µs making 7 calls to HTML::HeadParser::CORE:match, avg 9µs/call
215					# Normal text means start of body
216					$self->eof; # spent 48µs making 7 calls to HTML::Parser::eof, avg 7µs/call
217					return;
218					}
219					return if $tag ne 'title';
220					$self->{'text'} .= $text;
221					}
222
223	1	18µs			1;
224
225					__END__
226
227					=back
228
229					=head1 EXAMPLE
230
231					$h = HTTP::Headers->new;
232					$p = HTML::HeadParser->new($h);
233					$p->parse(<<EOT);
234					<title>Stupid example</title>
235					<base href="http://www.linpro.no/lwp/">
236					Normal text starts here.
237					EOT
238					undef $p;
239					print $h->title; # should print "Stupid example"
240
241					=head1 SEE ALSO
242
243					L<HTML::Parser>, L<HTTP::Headers>
244
245					The C<HTTP::Headers> class is distributed as part of the
246					I<libwww-perl> package. If you don't have that distribution installed
247					you need to provide the $header argument to the C<HTML::HeadParser>
248					constructor with your own object that implements the documented
249					protocol.
250
251					=head1 COPYRIGHT
252
253					Copyright 1996-2001 Gisle Aas. All rights reserved.
254
255					This library is free software; you can redistribute it and/or
256					modify it under the same terms as Perl itself.
257
258					=cut
259
					# spent 83µs within HTML::HeadParser::CORE:match which was called 8 times, avg 10µs/call: # 7 times (63µs+0s) by HTML::HeadParser::text at line 214 of HTML/HeadParser.pm, avg 9µs/call # once (20µs+0s) by LWP::Protocol::collect at line 79 of HTML/HeadParser.pm sub HTML::HeadParser::CORE:match; # xsub
					# spent 49µs within HTML::HeadParser::CORE:subst which was called 7 times, avg 7µs/call: # 7 times (49µs+0s) by HTML::HeadParser::text at line 211 of HTML/HeadParser.pm, avg 7µs/call sub HTML::HeadParser::CORE:subst; # xsub