Profile of /project/perl/lib/HTML/Entities.pm

File	/project/perl/lib/HTML/Entities.pm
Statements Executed	944
Statement Execution Time	16.1ms

Subroutines — ordered by exclusive time
Calls	P	F	Exclusive Time	Inclusive Time	Subroutine
253	1	2	1.27ms	1.27ms	HTML::Entities::::CORE:substHTML::Entities::CORE:subst (opcode)
1	1	2	18µs	18µs	HTML::Entities::::CORE:matchHTML::Entities::CORE:match (opcode)
0	0	0	0s	0s	HTML::Entities::::BEGINHTML::Entities::BEGIN
0	0	0	0s	0s	HTML::Entities::::VersionHTML::Entities::Version
0	0	0	0s	0s	HTML::Entities::::decode_entities_oldHTML::Entities::decode_entities_old
0	0	0	0s	0s	HTML::Entities::::encode_entitiesHTML::Entities::encode_entities
0	0	0	0s	0s	HTML::Entities::encode_entities_numeric
0	0	0	0s	0s	HTML::Entities::::num_entityHTML::Entities::num_entity

Call graph for these subroutines as a Graphviz dot language file.

Line	State ments	Time on line	Calls	Time in subs	Code
1					package HTML::Entities;
2
3					# $Id: Entities.pm,v 1.35 2006/03/22 09:15:23 gisle Exp $
4
5					=head1 NAME
6
7					HTML::Entities - Encode or decode strings with HTML entities
8
9					=head1 SYNOPSIS
10
11					use HTML::Entities;
12
13					$a = "Våre norske tegn bør &#230res";
14					decode_entities($a);
15					encode_entities($a, "\200-\377");
16
17					For example, this:
18
19					$input = "vis-�-vis Beyonc�'s na�ve\npapier-m�ch� r�sum�";
20					print encode_entities($input), "\n"
21
22					Prints this out:
23
24					vis-à-vis Beyoncé's naïve
25					papier-mâché résumé
26
27					=head1 DESCRIPTION
28
29					This module deals with encoding and decoding of strings with HTML
30					character entities. The module provides the following functions:
31
32					=over 4
33
34					=item decode_entities( $string, ... )
35
36					This routine replaces HTML entities found in the $string with the
37					corresponding Unicode character. Under perl 5.6 and earlier only
38					characters in the Latin-1 range are replaced. Unrecognized
39					entities are left alone.
40
41					If multiple strings are provided as argument they are each decoded
42					separately and the same number of strings are returned.
43
44					If called in void context the arguments are decoded in-place.
45
46					This routine is exported by default.
47
48					=item _decode_entities( $string, \%entity2char )
49
50					=item _decode_entities( $string, \%entity2char, $expand_prefix )
51
52					This will in-place replace HTML entities in $string. The %entity2char
53					hash must be provided. Named entities not found in the %entity2char
54					hash are left alone. Numeric entities are expanded unless their value
55					overflow.
56
57					The keys in %entity2char are the entity names to be expanded and their
58					values are what they should expand into. The values do not have to be
59					single character strings. If a key has ";" as suffix,
60					then occurrences in $string are only expanded if properly terminated
61					with ";". Entities without ";" will be expanded regardless of how
62					they are terminated for compatiblity with how common browsers treat
63					entities in the Latin-1 range.
64
65					If $expand_prefix is TRUE then entities without trailing ";" in
66					%entity2char will even be expanded as a prefix of a longer
67					unrecognized name. The longest matching name in %entity2char will be
68					used. This is mainly present for compatibility with an MSIE
69					misfeature.
70
71					$string = "foo&nbspbar";
72					_decode_entities($string, { nb => "@", nbsp => "\xA0" }, 1);
73					print $string; # will print "foo�bar"
74
75					This routine is exported by default.
76
77					=item encode_entities( $string )
78
79					=item encode_entities( $string, $unsafe_chars )
80
81					This routine replaces unsafe characters in $string with their entity
82					representation. A second argument can be given to specify which
83					characters to consider unsafe (i.e., which to escape). The default set
84					of characters to encode are control chars, high-bit chars, and the
85					C<< < >>, C<< & >>, C<< > >>, C<< ' >> and C<< " >>
86					characters. But this, for example, would encode I<just> the
87					C<< < >>, C<< & >>, C<< > >>, and C<< " >> characters:
88
89					$encoded = encode_entities($input, '<>&"');
90
91					This routine is exported by default.
92
93					=item encode_entities_numeric( $string )
94
95					=item encode_entities_numeric( $string, $unsafe_chars )
96
97					This routine works just like encode_entities, except that the replacement
98					entities are always C<&#xI<hexnum>;> and never C<&I<entname>;>. For
99					example, C<encode_entities("r\xF4le")> returns "rôle", but
100					C<encode_entities_numeric("r\xF4le")> returns "rôle".
101
102					This routine is I<not> exported by default. But you can always
103					export it with C<use HTML::Entities qw(encode_entities_numeric);>
104					or even C<use HTML::Entities qw(:DEFAULT encode_entities_numeric);>
105
106					=back
107
108					All these routines modify the string passed as the first argument, if
109					called in a void context. In scalar and array contexts, the encoded or
110					decoded string is returned (without changing the input string).
111
112					If you prefer not to import these routines into your namespace, you can
113					call them as:
114
115					use HTML::Entities ();
116					$decoded = HTML::Entities::decode($a);
117					$encoded = HTML::Entities::encode($a);
118					$encoded = HTML::Entities::encode_numeric($a);
119
120					The module can also export the %char2entity and the %entity2char
121					hashes, which contain the mapping from all characters to the
122					corresponding entities (and vice versa, respectively).
123
124					=head1 COPYRIGHT
125
126					Copyright 1995-2006 Gisle Aas. All rights reserved.
127
128					This library is free software; you can redistribute it and/or
129					modify it under the same terms as Perl itself.
130
131					=cut
132
133	3	111µs	1	44µs	use strict; # spent 44µs making 1 call to strict::import
134	3	94µs	1	362µs	use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION); # spent 362µs making 1 call to vars::import
135	3	2.57ms	1	215µs	use vars qw(%entity2char %char2entity); # spent 215µs making 1 call to vars::import
136
137	1	5µs			require 5.004;
138	1	7µs			require Exporter;
139	1	12µs			@ISA = qw(Exporter);
140
141	1	6µs			@EXPORT = qw(encode_entities decode_entities _decode_entities);
142	1	5µs			@EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric);
143
144	1	63µs	1	18µs	$VERSION = sprintf("%d.%02d", q$Revision: 1.35 $ =~ /(\d+)\.(\d+)/); # spent 18µs making 1 call to HTML::Entities::CORE:match
145					sub Version { $VERSION; }
146
147	1	729µs			require HTML::Parser; # for fast XS implemented decode_entities
148
149
150	1	1.23ms			%entity2char = (
151					# Some normal chars that have special meaning in SGML context
152					amp => '&', # ampersand
153					'gt' => '>', # greater than
154					'lt' => '<', # less than
155					quot => '"', # double quote
156					apos => "'", # single quote
157
158					# PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
159					AElig => chr(198), # capital AE diphthong (ligature)
160					Aacute => chr(193), # capital A, acute accent
161					Acirc => chr(194), # capital A, circumflex accent
162					Agrave => chr(192), # capital A, grave accent
163					Aring => chr(197), # capital A, ring
164					Atilde => chr(195), # capital A, tilde
165					Auml => chr(196), # capital A, dieresis or umlaut mark
166					Ccedil => chr(199), # capital C, cedilla
167					ETH => chr(208), # capital Eth, Icelandic
168					Eacute => chr(201), # capital E, acute accent
169					Ecirc => chr(202), # capital E, circumflex accent
170					Egrave => chr(200), # capital E, grave accent
171					Euml => chr(203), # capital E, dieresis or umlaut mark
172					Iacute => chr(205), # capital I, acute accent
173					Icirc => chr(206), # capital I, circumflex accent
174					Igrave => chr(204), # capital I, grave accent
175					Iuml => chr(207), # capital I, dieresis or umlaut mark
176					Ntilde => chr(209), # capital N, tilde
177					Oacute => chr(211), # capital O, acute accent
178					Ocirc => chr(212), # capital O, circumflex accent
179					Ograve => chr(210), # capital O, grave accent
180					Oslash => chr(216), # capital O, slash
181					Otilde => chr(213), # capital O, tilde
182					Ouml => chr(214), # capital O, dieresis or umlaut mark
183					THORN => chr(222), # capital THORN, Icelandic
184					Uacute => chr(218), # capital U, acute accent
185					Ucirc => chr(219), # capital U, circumflex accent
186					Ugrave => chr(217), # capital U, grave accent
187					Uuml => chr(220), # capital U, dieresis or umlaut mark
188					Yacute => chr(221), # capital Y, acute accent
189					aacute => chr(225), # small a, acute accent
190					acirc => chr(226), # small a, circumflex accent
191					aelig => chr(230), # small ae diphthong (ligature)
192					agrave => chr(224), # small a, grave accent
193					aring => chr(229), # small a, ring
194					atilde => chr(227), # small a, tilde
195					auml => chr(228), # small a, dieresis or umlaut mark
196					ccedil => chr(231), # small c, cedilla
197					eacute => chr(233), # small e, acute accent
198					ecirc => chr(234), # small e, circumflex accent
199					egrave => chr(232), # small e, grave accent
200					eth => chr(240), # small eth, Icelandic
201					euml => chr(235), # small e, dieresis or umlaut mark
202					iacute => chr(237), # small i, acute accent
203					icirc => chr(238), # small i, circumflex accent
204					igrave => chr(236), # small i, grave accent
205					iuml => chr(239), # small i, dieresis or umlaut mark
206					ntilde => chr(241), # small n, tilde
207					oacute => chr(243), # small o, acute accent
208					ocirc => chr(244), # small o, circumflex accent
209					ograve => chr(242), # small o, grave accent
210					oslash => chr(248), # small o, slash
211					otilde => chr(245), # small o, tilde
212					ouml => chr(246), # small o, dieresis or umlaut mark
213					szlig => chr(223), # small sharp s, German (sz ligature)
214					thorn => chr(254), # small thorn, Icelandic
215					uacute => chr(250), # small u, acute accent
216					ucirc => chr(251), # small u, circumflex accent
217					ugrave => chr(249), # small u, grave accent
218					uuml => chr(252), # small u, dieresis or umlaut mark
219					yacute => chr(253), # small y, acute accent
220					yuml => chr(255), # small y, dieresis or umlaut mark
221
222					# Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
223					copy => chr(169), # copyright sign
224					reg => chr(174), # registered sign
225					nbsp => chr(160), # non breaking space
226
227					# Additional ISO-8859/1 entities listed in rfc1866 (section 14)
228					iexcl => chr(161),
229					cent => chr(162),
230					pound => chr(163),
231					curren => chr(164),
232					yen => chr(165),
233					brvbar => chr(166),
234					sect => chr(167),
235					uml => chr(168),
236					ordf => chr(170),
237					laquo => chr(171),
238					'not' => chr(172), # not is a keyword in perl
239					shy => chr(173),
240					macr => chr(175),
241					deg => chr(176),
242					plusmn => chr(177),
243					sup1 => chr(185),
244					sup2 => chr(178),
245					sup3 => chr(179),
246					acute => chr(180),
247					micro => chr(181),
248					para => chr(182),
249					middot => chr(183),
250					cedil => chr(184),
251					ordm => chr(186),
252					raquo => chr(187),
253					frac14 => chr(188),
254					frac12 => chr(189),
255					frac34 => chr(190),
256					iquest => chr(191),
257					'times' => chr(215), # times is a keyword in perl
258					divide => chr(247),
259
260					( $] > 5.007 ? (
261					'OElig;' => chr(338),
262					'oelig;' => chr(339),
263					'Scaron;' => chr(352),
264					'scaron;' => chr(353),
265					'Yuml;' => chr(376),
266					'fnof;' => chr(402),
267					'circ;' => chr(710),
268					'tilde;' => chr(732),
269					'Alpha;' => chr(913),
270					'Beta;' => chr(914),
271					'Gamma;' => chr(915),
272					'Delta;' => chr(916),
273					'Epsilon;' => chr(917),
274					'Zeta;' => chr(918),
275					'Eta;' => chr(919),
276					'Theta;' => chr(920),
277					'Iota;' => chr(921),
278					'Kappa;' => chr(922),
279					'Lambda;' => chr(923),
280					'Mu;' => chr(924),
281					'Nu;' => chr(925),
282					'Xi;' => chr(926),
283					'Omicron;' => chr(927),
284					'Pi;' => chr(928),
285					'Rho;' => chr(929),
286					'Sigma;' => chr(931),
287					'Tau;' => chr(932),
288					'Upsilon;' => chr(933),
289					'Phi;' => chr(934),
290					'Chi;' => chr(935),
291					'Psi;' => chr(936),
292					'Omega;' => chr(937),
293					'alpha;' => chr(945),
294					'beta;' => chr(946),
295					'gamma;' => chr(947),
296					'delta;' => chr(948),
297					'epsilon;' => chr(949),
298					'zeta;' => chr(950),
299					'eta;' => chr(951),
300					'theta;' => chr(952),
301					'iota;' => chr(953),
302					'kappa;' => chr(954),
303					'lambda;' => chr(955),
304					'mu;' => chr(956),
305					'nu;' => chr(957),
306					'xi;' => chr(958),
307					'omicron;' => chr(959),
308					'pi;' => chr(960),
309					'rho;' => chr(961),
310					'sigmaf;' => chr(962),
311					'sigma;' => chr(963),
312					'tau;' => chr(964),
313					'upsilon;' => chr(965),
314					'phi;' => chr(966),
315					'chi;' => chr(967),
316					'psi;' => chr(968),
317					'omega;' => chr(969),
318					'thetasym;' => chr(977),
319					'upsih;' => chr(978),
320					'piv;' => chr(982),
321					'ensp;' => chr(8194),
322					'emsp;' => chr(8195),
323					'thinsp;' => chr(8201),
324					'zwnj;' => chr(8204),
325					'zwj;' => chr(8205),
326					'lrm;' => chr(8206),
327					'rlm;' => chr(8207),
328					'ndash;' => chr(8211),
329					'mdash;' => chr(8212),
330					'lsquo;' => chr(8216),
331					'rsquo;' => chr(8217),
332					'sbquo;' => chr(8218),
333					'ldquo;' => chr(8220),
334					'rdquo;' => chr(8221),
335					'bdquo;' => chr(8222),
336					'dagger;' => chr(8224),
337					'Dagger;' => chr(8225),
338					'bull;' => chr(8226),
339					'hellip;' => chr(8230),
340					'permil;' => chr(8240),
341					'prime;' => chr(8242),
342					'Prime;' => chr(8243),
343					'lsaquo;' => chr(8249),
344					'rsaquo;' => chr(8250),
345					'oline;' => chr(8254),
346					'frasl;' => chr(8260),
347					'euro;' => chr(8364),
348					'image;' => chr(8465),
349					'weierp;' => chr(8472),
350					'real;' => chr(8476),
351					'trade;' => chr(8482),
352					'alefsym;' => chr(8501),
353					'larr;' => chr(8592),
354					'uarr;' => chr(8593),
355					'rarr;' => chr(8594),
356					'darr;' => chr(8595),
357					'harr;' => chr(8596),
358					'crarr;' => chr(8629),
359					'lArr;' => chr(8656),
360					'uArr;' => chr(8657),
361					'rArr;' => chr(8658),
362					'dArr;' => chr(8659),
363					'hArr;' => chr(8660),
364					'forall;' => chr(8704),
365					'part;' => chr(8706),
366					'exist;' => chr(8707),
367					'empty;' => chr(8709),
368					'nabla;' => chr(8711),
369					'isin;' => chr(8712),
370					'notin;' => chr(8713),
371					'ni;' => chr(8715),
372					'prod;' => chr(8719),
373					'sum;' => chr(8721),
374					'minus;' => chr(8722),
375					'lowast;' => chr(8727),
376					'radic;' => chr(8730),
377					'prop;' => chr(8733),
378					'infin;' => chr(8734),
379					'ang;' => chr(8736),
380					'and;' => chr(8743),
381					'or;' => chr(8744),
382					'cap;' => chr(8745),
383					'cup;' => chr(8746),
384					'int;' => chr(8747),
385					'there4;' => chr(8756),
386					'sim;' => chr(8764),
387					'cong;' => chr(8773),
388					'asymp;' => chr(8776),
389					'ne;' => chr(8800),
390					'equiv;' => chr(8801),
391					'le;' => chr(8804),
392					'ge;' => chr(8805),
393					'sub;' => chr(8834),
394					'sup;' => chr(8835),
395					'nsub;' => chr(8836),
396					'sube;' => chr(8838),
397					'supe;' => chr(8839),
398					'oplus;' => chr(8853),
399					'otimes;' => chr(8855),
400					'perp;' => chr(8869),
401					'sdot;' => chr(8901),
402					'lceil;' => chr(8968),
403					'rceil;' => chr(8969),
404					'lfloor;' => chr(8970),
405					'rfloor;' => chr(8971),
406					'lang;' => chr(9001),
407					'rang;' => chr(9002),
408					'loz;' => chr(9674),
409					'spades;' => chr(9824),
410					'clubs;' => chr(9827),
411					'hearts;' => chr(9829),
412					'diams;' => chr(9830),
413					) : ())
414					);
415
416
417					# Make the opposite mapping
418	1	20µs			while (my($entity, $char) = each(%entity2char)) {
419	253	4.28ms	253	1.27ms	$entity =~ s/;\z//; # spent 1.27ms making 253 calls to HTML::Entities::CORE:subst, avg 5µs/call
420	253	3.63ms			$char2entity{$char} = "&$entity;";
421					}
422	1	7µs			delete $char2entity{"'"}; # only one-way decoding
423
424					# Fill in missing entities
425	1	438µs			for (0 .. 255) {
426	256	1.12ms			next if exists $char2entity{chr($_)};
427	156	1.57ms			$char2entity{chr($_)} = "&#$_;";
428					}
429
430	1	5µs			my %subst; # compiled encoding regexps
431
432					sub decode_entities_old
433					{
434					my $array;
435					if (defined wantarray) {
436					$array = [@_]; # copy
437					} else {
438					$array = \@_; # modify in-place
439					}
440					my $c;
441					for (@$array) {
442					s/(&\#(\d+);?)/$2 < 256 ? chr($2) : $1/eg;
443					s/(&\#[xX]([0-9a-fA-F]+);?)/$c = hex($2); $c < 256 ? chr($c) : $1/eg;
444					s/(&(\w+);?)/$entity2char{$2} \|\| $1/eg;
445					}
446					wantarray ? @$array : $array->[0];
447					}
448
449					sub encode_entities
450					{
451					my $ref;
452					if (defined wantarray) {
453					my $x = $_[0];
454					$ref = \$x; # copy
455					} else {
456					$ref = \$_[0]; # modify in-place
457					}
458					if (defined $_[1] and length $_[1]) {
459					unless (exists $subst{$_[1]}) {
460					# Because we can't compile regex we fake it with a cached sub
461					my $code = "sub {\$_[0] =~ s/([$_[1]])/\$char2entity{\$1} \|\| num_entity(\$1)/ge; }";
462					$subst{$_[1]} = eval $code;
463					die( $@ . " while trying to turn range: \"$_[1]\"\n "
464					. "into code: $code\n "
465					) if $@;
466					}
467					&{$subst{$_[1]}}($$ref);
468					} else {
469					# Encode control chars, high bit chars and '<', '&', '>', ''' and '"'
470					$$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} \|\| num_entity($1)/ge;
471					}
472					$$ref;
473					}
474
475					sub encode_entities_numeric {
476					local %char2entity;
477					return &encode_entities; # a goto &encode_entities wouldn't work
478					}
479
480
481					sub num_entity {
482					sprintf "&#x%X;", ord($_[0]);
483					}
484
485					# Set up aliases
486	1	10µs			*encode = \&encode_entities;
487	1	5µs			*encode_numeric = \&encode_entities_numeric;
488	1	5µs			*encode_numerically = \&encode_entities_numeric;
489	1	5µs			*decode = \&decode_entities;
490
491	1	125µs			1;
					# spent 18µs within HTML::Entities::CORE:match which was called # once (18µs+0s) by LWP::Protocol::collect at line 144 of HTML/Entities.pm sub HTML::Entities::CORE:match; # xsub
					# spent 1.27ms within HTML::Entities::CORE:subst which was called 253 times, avg 5µs/call: # 253 times (1.27ms+0s) by LWP::Protocol::collect at line 419 of HTML/Entities.pm, avg 5µs/call sub HTML::Entities::CORE:subst; # xsub