| File | /project/perl/lib/HTML/Entities.pm |
| Statements Executed | 944 |
| Statement Execution Time | 16.1ms |
| Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
|---|---|---|---|---|---|
| 253 | 1 | 2 | 1.27ms | 1.27ms | HTML::Entities::CORE:subst (opcode) |
| 1 | 1 | 2 | 18µs | 18µs | HTML::Entities::CORE:match (opcode) |
| 0 | 0 | 0 | 0s | 0s | HTML::Entities::BEGIN |
| 0 | 0 | 0 | 0s | 0s | HTML::Entities::Version |
| 0 | 0 | 0 | 0s | 0s | HTML::Entities::decode_entities_old |
| 0 | 0 | 0 | 0s | 0s | HTML::Entities::encode_entities |
| 0 | 0 | 0 | 0s | 0s | HTML::Entities::encode_entities_numeric |
| 0 | 0 | 0 | 0s | 0s | HTML::Entities::num_entity |
| Line | State ments |
Time on line |
Calls | Time in subs |
Code |
|---|---|---|---|---|---|
| 1 | package HTML::Entities; | ||||
| 2 | |||||
| 3 | # $Id: Entities.pm,v 1.35 2006/03/22 09:15:23 gisle Exp $ | ||||
| 4 | |||||
| 5 | =head1 NAME | ||||
| 6 | |||||
| 7 | HTML::Entities - Encode or decode strings with HTML entities | ||||
| 8 | |||||
| 9 | =head1 SYNOPSIS | ||||
| 10 | |||||
| 11 | use HTML::Entities; | ||||
| 12 | |||||
| 13 | $a = "Våre norske tegn bør æres"; | ||||
| 14 | decode_entities($a); | ||||
| 15 | encode_entities($a, "\200-\377"); | ||||
| 16 | |||||
| 17 | For example, this: | ||||
| 18 | |||||
| 19 | $input = "vis-à-vis Beyoncé's naïve\npapier-mâché résumé"; | ||||
| 20 | print encode_entities($input), "\n" | ||||
| 21 | |||||
| 22 | Prints this out: | ||||
| 23 | |||||
| 24 | vis-à-vis Beyoncé's naïve | ||||
| 25 | papier-mâché résumé | ||||
| 26 | |||||
| 27 | =head1 DESCRIPTION | ||||
| 28 | |||||
| 29 | This module deals with encoding and decoding of strings with HTML | ||||
| 30 | character entities. The module provides the following functions: | ||||
| 31 | |||||
| 32 | =over 4 | ||||
| 33 | |||||
| 34 | =item decode_entities( $string, ... ) | ||||
| 35 | |||||
| 36 | This routine replaces HTML entities found in the $string with the | ||||
| 37 | corresponding Unicode character. Under perl 5.6 and earlier only | ||||
| 38 | characters in the Latin-1 range are replaced. Unrecognized | ||||
| 39 | entities are left alone. | ||||
| 40 | |||||
| 41 | If multiple strings are provided as argument they are each decoded | ||||
| 42 | separately and the same number of strings are returned. | ||||
| 43 | |||||
| 44 | If called in void context the arguments are decoded in-place. | ||||
| 45 | |||||
| 46 | This routine is exported by default. | ||||
| 47 | |||||
| 48 | =item _decode_entities( $string, \%entity2char ) | ||||
| 49 | |||||
| 50 | =item _decode_entities( $string, \%entity2char, $expand_prefix ) | ||||
| 51 | |||||
| 52 | This will in-place replace HTML entities in $string. The %entity2char | ||||
| 53 | hash must be provided. Named entities not found in the %entity2char | ||||
| 54 | hash are left alone. Numeric entities are expanded unless their value | ||||
| 55 | overflow. | ||||
| 56 | |||||
| 57 | The keys in %entity2char are the entity names to be expanded and their | ||||
| 58 | values are what they should expand into. The values do not have to be | ||||
| 59 | single character strings. If a key has ";" as suffix, | ||||
| 60 | then occurrences in $string are only expanded if properly terminated | ||||
| 61 | with ";". Entities without ";" will be expanded regardless of how | ||||
| 62 | they are terminated for compatiblity with how common browsers treat | ||||
| 63 | entities in the Latin-1 range. | ||||
| 64 | |||||
| 65 | If $expand_prefix is TRUE then entities without trailing ";" in | ||||
| 66 | %entity2char will even be expanded as a prefix of a longer | ||||
| 67 | unrecognized name. The longest matching name in %entity2char will be | ||||
| 68 | used. This is mainly present for compatibility with an MSIE | ||||
| 69 | misfeature. | ||||
| 70 | |||||
| 71 | $string = "foo bar"; | ||||
| 72 | _decode_entities($string, { nb => "@", nbsp => "\xA0" }, 1); | ||||
| 73 | print $string; # will print "foo bar" | ||||
| 74 | |||||
| 75 | This routine is exported by default. | ||||
| 76 | |||||
| 77 | =item encode_entities( $string ) | ||||
| 78 | |||||
| 79 | =item encode_entities( $string, $unsafe_chars ) | ||||
| 80 | |||||
| 81 | This routine replaces unsafe characters in $string with their entity | ||||
| 82 | representation. A second argument can be given to specify which | ||||
| 83 | characters to consider unsafe (i.e., which to escape). The default set | ||||
| 84 | of characters to encode are control chars, high-bit chars, and the | ||||
| 85 | C<< < >>, C<< & >>, C<< > >>, C<< ' >> and C<< " >> | ||||
| 86 | characters. But this, for example, would encode I<just> the | ||||
| 87 | C<< < >>, C<< & >>, C<< > >>, and C<< " >> characters: | ||||
| 88 | |||||
| 89 | $encoded = encode_entities($input, '<>&"'); | ||||
| 90 | |||||
| 91 | This routine is exported by default. | ||||
| 92 | |||||
| 93 | =item encode_entities_numeric( $string ) | ||||
| 94 | |||||
| 95 | =item encode_entities_numeric( $string, $unsafe_chars ) | ||||
| 96 | |||||
| 97 | This routine works just like encode_entities, except that the replacement | ||||
| 98 | entities are always C<&#xI<hexnum>;> and never C<&I<entname>;>. For | ||||
| 99 | example, C<encode_entities("r\xF4le")> returns "rôle", but | ||||
| 100 | C<encode_entities_numeric("r\xF4le")> returns "rôle". | ||||
| 101 | |||||
| 102 | This routine is I<not> exported by default. But you can always | ||||
| 103 | export it with C<use HTML::Entities qw(encode_entities_numeric);> | ||||
| 104 | or even C<use HTML::Entities qw(:DEFAULT encode_entities_numeric);> | ||||
| 105 | |||||
| 106 | =back | ||||
| 107 | |||||
| 108 | All these routines modify the string passed as the first argument, if | ||||
| 109 | called in a void context. In scalar and array contexts, the encoded or | ||||
| 110 | decoded string is returned (without changing the input string). | ||||
| 111 | |||||
| 112 | If you prefer not to import these routines into your namespace, you can | ||||
| 113 | call them as: | ||||
| 114 | |||||
| 115 | use HTML::Entities (); | ||||
| 116 | $decoded = HTML::Entities::decode($a); | ||||
| 117 | $encoded = HTML::Entities::encode($a); | ||||
| 118 | $encoded = HTML::Entities::encode_numeric($a); | ||||
| 119 | |||||
| 120 | The module can also export the %char2entity and the %entity2char | ||||
| 121 | hashes, which contain the mapping from all characters to the | ||||
| 122 | corresponding entities (and vice versa, respectively). | ||||
| 123 | |||||
| 124 | =head1 COPYRIGHT | ||||
| 125 | |||||
| 126 | Copyright 1995-2006 Gisle Aas. All rights reserved. | ||||
| 127 | |||||
| 128 | This library is free software; you can redistribute it and/or | ||||
| 129 | modify it under the same terms as Perl itself. | ||||
| 130 | |||||
| 131 | =cut | ||||
| 132 | |||||
| 133 | 3 | 111µs | 1 | 44µs | use strict; # spent 44µs making 1 call to strict::import |
| 134 | 3 | 94µs | 1 | 362µs | use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION); # spent 362µs making 1 call to vars::import |
| 135 | 3 | 2.57ms | 1 | 215µs | use vars qw(%entity2char %char2entity); # spent 215µs making 1 call to vars::import |
| 136 | |||||
| 137 | 1 | 5µs | require 5.004; | ||
| 138 | 1 | 7µs | require Exporter; | ||
| 139 | 1 | 12µs | @ISA = qw(Exporter); | ||
| 140 | |||||
| 141 | 1 | 6µs | @EXPORT = qw(encode_entities decode_entities _decode_entities); | ||
| 142 | 1 | 5µs | @EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric); | ||
| 143 | |||||
| 144 | 1 | 63µs | 1 | 18µs | $VERSION = sprintf("%d.%02d", q$Revision: 1.35 $ =~ /(\d+)\.(\d+)/); # spent 18µs making 1 call to HTML::Entities::CORE:match |
| 145 | sub Version { $VERSION; } | ||||
| 146 | |||||
| 147 | 1 | 729µs | require HTML::Parser; # for fast XS implemented decode_entities | ||
| 148 | |||||
| 149 | |||||
| 150 | 1 | 1.23ms | %entity2char = ( | ||
| 151 | # Some normal chars that have special meaning in SGML context | ||||
| 152 | amp => '&', # ampersand | ||||
| 153 | 'gt' => '>', # greater than | ||||
| 154 | 'lt' => '<', # less than | ||||
| 155 | quot => '"', # double quote | ||||
| 156 | apos => "'", # single quote | ||||
| 157 | |||||
| 158 | # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML | ||||
| 159 | AElig => chr(198), # capital AE diphthong (ligature) | ||||
| 160 | Aacute => chr(193), # capital A, acute accent | ||||
| 161 | Acirc => chr(194), # capital A, circumflex accent | ||||
| 162 | Agrave => chr(192), # capital A, grave accent | ||||
| 163 | Aring => chr(197), # capital A, ring | ||||
| 164 | Atilde => chr(195), # capital A, tilde | ||||
| 165 | Auml => chr(196), # capital A, dieresis or umlaut mark | ||||
| 166 | Ccedil => chr(199), # capital C, cedilla | ||||
| 167 | ETH => chr(208), # capital Eth, Icelandic | ||||
| 168 | Eacute => chr(201), # capital E, acute accent | ||||
| 169 | Ecirc => chr(202), # capital E, circumflex accent | ||||
| 170 | Egrave => chr(200), # capital E, grave accent | ||||
| 171 | Euml => chr(203), # capital E, dieresis or umlaut mark | ||||
| 172 | Iacute => chr(205), # capital I, acute accent | ||||
| 173 | Icirc => chr(206), # capital I, circumflex accent | ||||
| 174 | Igrave => chr(204), # capital I, grave accent | ||||
| 175 | Iuml => chr(207), # capital I, dieresis or umlaut mark | ||||
| 176 | Ntilde => chr(209), # capital N, tilde | ||||
| 177 | Oacute => chr(211), # capital O, acute accent | ||||
| 178 | Ocirc => chr(212), # capital O, circumflex accent | ||||
| 179 | Ograve => chr(210), # capital O, grave accent | ||||
| 180 | Oslash => chr(216), # capital O, slash | ||||
| 181 | Otilde => chr(213), # capital O, tilde | ||||
| 182 | Ouml => chr(214), # capital O, dieresis or umlaut mark | ||||
| 183 | THORN => chr(222), # capital THORN, Icelandic | ||||
| 184 | Uacute => chr(218), # capital U, acute accent | ||||
| 185 | Ucirc => chr(219), # capital U, circumflex accent | ||||
| 186 | Ugrave => chr(217), # capital U, grave accent | ||||
| 187 | Uuml => chr(220), # capital U, dieresis or umlaut mark | ||||
| 188 | Yacute => chr(221), # capital Y, acute accent | ||||
| 189 | aacute => chr(225), # small a, acute accent | ||||
| 190 | acirc => chr(226), # small a, circumflex accent | ||||
| 191 | aelig => chr(230), # small ae diphthong (ligature) | ||||
| 192 | agrave => chr(224), # small a, grave accent | ||||
| 193 | aring => chr(229), # small a, ring | ||||
| 194 | atilde => chr(227), # small a, tilde | ||||
| 195 | auml => chr(228), # small a, dieresis or umlaut mark | ||||
| 196 | ccedil => chr(231), # small c, cedilla | ||||
| 197 | eacute => chr(233), # small e, acute accent | ||||
| 198 | ecirc => chr(234), # small e, circumflex accent | ||||
| 199 | egrave => chr(232), # small e, grave accent | ||||
| 200 | eth => chr(240), # small eth, Icelandic | ||||
| 201 | euml => chr(235), # small e, dieresis or umlaut mark | ||||
| 202 | iacute => chr(237), # small i, acute accent | ||||
| 203 | icirc => chr(238), # small i, circumflex accent | ||||
| 204 | igrave => chr(236), # small i, grave accent | ||||
| 205 | iuml => chr(239), # small i, dieresis or umlaut mark | ||||
| 206 | ntilde => chr(241), # small n, tilde | ||||
| 207 | oacute => chr(243), # small o, acute accent | ||||
| 208 | ocirc => chr(244), # small o, circumflex accent | ||||
| 209 | ograve => chr(242), # small o, grave accent | ||||
| 210 | oslash => chr(248), # small o, slash | ||||
| 211 | otilde => chr(245), # small o, tilde | ||||
| 212 | ouml => chr(246), # small o, dieresis or umlaut mark | ||||
| 213 | szlig => chr(223), # small sharp s, German (sz ligature) | ||||
| 214 | thorn => chr(254), # small thorn, Icelandic | ||||
| 215 | uacute => chr(250), # small u, acute accent | ||||
| 216 | ucirc => chr(251), # small u, circumflex accent | ||||
| 217 | ugrave => chr(249), # small u, grave accent | ||||
| 218 | uuml => chr(252), # small u, dieresis or umlaut mark | ||||
| 219 | yacute => chr(253), # small y, acute accent | ||||
| 220 | yuml => chr(255), # small y, dieresis or umlaut mark | ||||
| 221 | |||||
| 222 | # Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96) | ||||
| 223 | copy => chr(169), # copyright sign | ||||
| 224 | reg => chr(174), # registered sign | ||||
| 225 | nbsp => chr(160), # non breaking space | ||||
| 226 | |||||
| 227 | # Additional ISO-8859/1 entities listed in rfc1866 (section 14) | ||||
| 228 | iexcl => chr(161), | ||||
| 229 | cent => chr(162), | ||||
| 230 | pound => chr(163), | ||||
| 231 | curren => chr(164), | ||||
| 232 | yen => chr(165), | ||||
| 233 | brvbar => chr(166), | ||||
| 234 | sect => chr(167), | ||||
| 235 | uml => chr(168), | ||||
| 236 | ordf => chr(170), | ||||
| 237 | laquo => chr(171), | ||||
| 238 | 'not' => chr(172), # not is a keyword in perl | ||||
| 239 | shy => chr(173), | ||||
| 240 | macr => chr(175), | ||||
| 241 | deg => chr(176), | ||||
| 242 | plusmn => chr(177), | ||||
| 243 | sup1 => chr(185), | ||||
| 244 | sup2 => chr(178), | ||||
| 245 | sup3 => chr(179), | ||||
| 246 | acute => chr(180), | ||||
| 247 | micro => chr(181), | ||||
| 248 | para => chr(182), | ||||
| 249 | middot => chr(183), | ||||
| 250 | cedil => chr(184), | ||||
| 251 | ordm => chr(186), | ||||
| 252 | raquo => chr(187), | ||||
| 253 | frac14 => chr(188), | ||||
| 254 | frac12 => chr(189), | ||||
| 255 | frac34 => chr(190), | ||||
| 256 | iquest => chr(191), | ||||
| 257 | 'times' => chr(215), # times is a keyword in perl | ||||
| 258 | divide => chr(247), | ||||
| 259 | |||||
| 260 | ( $] > 5.007 ? ( | ||||
| 261 | 'OElig;' => chr(338), | ||||
| 262 | 'oelig;' => chr(339), | ||||
| 263 | 'Scaron;' => chr(352), | ||||
| 264 | 'scaron;' => chr(353), | ||||
| 265 | 'Yuml;' => chr(376), | ||||
| 266 | 'fnof;' => chr(402), | ||||
| 267 | 'circ;' => chr(710), | ||||
| 268 | 'tilde;' => chr(732), | ||||
| 269 | 'Alpha;' => chr(913), | ||||
| 270 | 'Beta;' => chr(914), | ||||
| 271 | 'Gamma;' => chr(915), | ||||
| 272 | 'Delta;' => chr(916), | ||||
| 273 | 'Epsilon;' => chr(917), | ||||
| 274 | 'Zeta;' => chr(918), | ||||
| 275 | 'Eta;' => chr(919), | ||||
| 276 | 'Theta;' => chr(920), | ||||
| 277 | 'Iota;' => chr(921), | ||||
| 278 | 'Kappa;' => chr(922), | ||||
| 279 | 'Lambda;' => chr(923), | ||||
| 280 | 'Mu;' => chr(924), | ||||
| 281 | 'Nu;' => chr(925), | ||||
| 282 | 'Xi;' => chr(926), | ||||
| 283 | 'Omicron;' => chr(927), | ||||
| 284 | 'Pi;' => chr(928), | ||||
| 285 | 'Rho;' => chr(929), | ||||
| 286 | 'Sigma;' => chr(931), | ||||
| 287 | 'Tau;' => chr(932), | ||||
| 288 | 'Upsilon;' => chr(933), | ||||
| 289 | 'Phi;' => chr(934), | ||||
| 290 | 'Chi;' => chr(935), | ||||
| 291 | 'Psi;' => chr(936), | ||||
| 292 | 'Omega;' => chr(937), | ||||
| 293 | 'alpha;' => chr(945), | ||||
| 294 | 'beta;' => chr(946), | ||||
| 295 | 'gamma;' => chr(947), | ||||
| 296 | 'delta;' => chr(948), | ||||
| 297 | 'epsilon;' => chr(949), | ||||
| 298 | 'zeta;' => chr(950), | ||||
| 299 | 'eta;' => chr(951), | ||||
| 300 | 'theta;' => chr(952), | ||||
| 301 | 'iota;' => chr(953), | ||||
| 302 | 'kappa;' => chr(954), | ||||
| 303 | 'lambda;' => chr(955), | ||||
| 304 | 'mu;' => chr(956), | ||||
| 305 | 'nu;' => chr(957), | ||||
| 306 | 'xi;' => chr(958), | ||||
| 307 | 'omicron;' => chr(959), | ||||
| 308 | 'pi;' => chr(960), | ||||
| 309 | 'rho;' => chr(961), | ||||
| 310 | 'sigmaf;' => chr(962), | ||||
| 311 | 'sigma;' => chr(963), | ||||
| 312 | 'tau;' => chr(964), | ||||
| 313 | 'upsilon;' => chr(965), | ||||
| 314 | 'phi;' => chr(966), | ||||
| 315 | 'chi;' => chr(967), | ||||
| 316 | 'psi;' => chr(968), | ||||
| 317 | 'omega;' => chr(969), | ||||
| 318 | 'thetasym;' => chr(977), | ||||
| 319 | 'upsih;' => chr(978), | ||||
| 320 | 'piv;' => chr(982), | ||||
| 321 | 'ensp;' => chr(8194), | ||||
| 322 | 'emsp;' => chr(8195), | ||||
| 323 | 'thinsp;' => chr(8201), | ||||
| 324 | 'zwnj;' => chr(8204), | ||||
| 325 | 'zwj;' => chr(8205), | ||||
| 326 | 'lrm;' => chr(8206), | ||||
| 327 | 'rlm;' => chr(8207), | ||||
| 328 | 'ndash;' => chr(8211), | ||||
| 329 | 'mdash;' => chr(8212), | ||||
| 330 | 'lsquo;' => chr(8216), | ||||
| 331 | 'rsquo;' => chr(8217), | ||||
| 332 | 'sbquo;' => chr(8218), | ||||
| 333 | 'ldquo;' => chr(8220), | ||||
| 334 | 'rdquo;' => chr(8221), | ||||
| 335 | 'bdquo;' => chr(8222), | ||||
| 336 | 'dagger;' => chr(8224), | ||||
| 337 | 'Dagger;' => chr(8225), | ||||
| 338 | 'bull;' => chr(8226), | ||||
| 339 | 'hellip;' => chr(8230), | ||||
| 340 | 'permil;' => chr(8240), | ||||
| 341 | 'prime;' => chr(8242), | ||||
| 342 | 'Prime;' => chr(8243), | ||||
| 343 | 'lsaquo;' => chr(8249), | ||||
| 344 | 'rsaquo;' => chr(8250), | ||||
| 345 | 'oline;' => chr(8254), | ||||
| 346 | 'frasl;' => chr(8260), | ||||
| 347 | 'euro;' => chr(8364), | ||||
| 348 | 'image;' => chr(8465), | ||||
| 349 | 'weierp;' => chr(8472), | ||||
| 350 | 'real;' => chr(8476), | ||||
| 351 | 'trade;' => chr(8482), | ||||
| 352 | 'alefsym;' => chr(8501), | ||||
| 353 | 'larr;' => chr(8592), | ||||
| 354 | 'uarr;' => chr(8593), | ||||
| 355 | 'rarr;' => chr(8594), | ||||
| 356 | 'darr;' => chr(8595), | ||||
| 357 | 'harr;' => chr(8596), | ||||
| 358 | 'crarr;' => chr(8629), | ||||
| 359 | 'lArr;' => chr(8656), | ||||
| 360 | 'uArr;' => chr(8657), | ||||
| 361 | 'rArr;' => chr(8658), | ||||
| 362 | 'dArr;' => chr(8659), | ||||
| 363 | 'hArr;' => chr(8660), | ||||
| 364 | 'forall;' => chr(8704), | ||||
| 365 | 'part;' => chr(8706), | ||||
| 366 | 'exist;' => chr(8707), | ||||
| 367 | 'empty;' => chr(8709), | ||||
| 368 | 'nabla;' => chr(8711), | ||||
| 369 | 'isin;' => chr(8712), | ||||
| 370 | 'notin;' => chr(8713), | ||||
| 371 | 'ni;' => chr(8715), | ||||
| 372 | 'prod;' => chr(8719), | ||||
| 373 | 'sum;' => chr(8721), | ||||
| 374 | 'minus;' => chr(8722), | ||||
| 375 | 'lowast;' => chr(8727), | ||||
| 376 | 'radic;' => chr(8730), | ||||
| 377 | 'prop;' => chr(8733), | ||||
| 378 | 'infin;' => chr(8734), | ||||
| 379 | 'ang;' => chr(8736), | ||||
| 380 | 'and;' => chr(8743), | ||||
| 381 | 'or;' => chr(8744), | ||||
| 382 | 'cap;' => chr(8745), | ||||
| 383 | 'cup;' => chr(8746), | ||||
| 384 | 'int;' => chr(8747), | ||||
| 385 | 'there4;' => chr(8756), | ||||
| 386 | 'sim;' => chr(8764), | ||||
| 387 | 'cong;' => chr(8773), | ||||
| 388 | 'asymp;' => chr(8776), | ||||
| 389 | 'ne;' => chr(8800), | ||||
| 390 | 'equiv;' => chr(8801), | ||||
| 391 | 'le;' => chr(8804), | ||||
| 392 | 'ge;' => chr(8805), | ||||
| 393 | 'sub;' => chr(8834), | ||||
| 394 | 'sup;' => chr(8835), | ||||
| 395 | 'nsub;' => chr(8836), | ||||
| 396 | 'sube;' => chr(8838), | ||||
| 397 | 'supe;' => chr(8839), | ||||
| 398 | 'oplus;' => chr(8853), | ||||
| 399 | 'otimes;' => chr(8855), | ||||
| 400 | 'perp;' => chr(8869), | ||||
| 401 | 'sdot;' => chr(8901), | ||||
| 402 | 'lceil;' => chr(8968), | ||||
| 403 | 'rceil;' => chr(8969), | ||||
| 404 | 'lfloor;' => chr(8970), | ||||
| 405 | 'rfloor;' => chr(8971), | ||||
| 406 | 'lang;' => chr(9001), | ||||
| 407 | 'rang;' => chr(9002), | ||||
| 408 | 'loz;' => chr(9674), | ||||
| 409 | 'spades;' => chr(9824), | ||||
| 410 | 'clubs;' => chr(9827), | ||||
| 411 | 'hearts;' => chr(9829), | ||||
| 412 | 'diams;' => chr(9830), | ||||
| 413 | ) : ()) | ||||
| 414 | ); | ||||
| 415 | |||||
| 416 | |||||
| 417 | # Make the opposite mapping | ||||
| 418 | 1 | 20µs | while (my($entity, $char) = each(%entity2char)) { | ||
| 419 | 253 | 4.28ms | 253 | 1.27ms | $entity =~ s/;\z//; # spent 1.27ms making 253 calls to HTML::Entities::CORE:subst, avg 5µs/call |
| 420 | 253 | 3.63ms | $char2entity{$char} = "&$entity;"; | ||
| 421 | } | ||||
| 422 | 1 | 7µs | delete $char2entity{"'"}; # only one-way decoding | ||
| 423 | |||||
| 424 | # Fill in missing entities | ||||
| 425 | 1 | 438µs | for (0 .. 255) { | ||
| 426 | 256 | 1.12ms | next if exists $char2entity{chr($_)}; | ||
| 427 | 156 | 1.57ms | $char2entity{chr($_)} = "&#$_;"; | ||
| 428 | } | ||||
| 429 | |||||
| 430 | 1 | 5µs | my %subst; # compiled encoding regexps | ||
| 431 | |||||
| 432 | sub decode_entities_old | ||||
| 433 | { | ||||
| 434 | my $array; | ||||
| 435 | if (defined wantarray) { | ||||
| 436 | $array = [@_]; # copy | ||||
| 437 | } else { | ||||
| 438 | $array = \@_; # modify in-place | ||||
| 439 | } | ||||
| 440 | my $c; | ||||
| 441 | for (@$array) { | ||||
| 442 | s/(&\#(\d+);?)/$2 < 256 ? chr($2) : $1/eg; | ||||
| 443 | s/(&\#[xX]([0-9a-fA-F]+);?)/$c = hex($2); $c < 256 ? chr($c) : $1/eg; | ||||
| 444 | s/(&(\w+);?)/$entity2char{$2} || $1/eg; | ||||
| 445 | } | ||||
| 446 | wantarray ? @$array : $array->[0]; | ||||
| 447 | } | ||||
| 448 | |||||
| 449 | sub encode_entities | ||||
| 450 | { | ||||
| 451 | my $ref; | ||||
| 452 | if (defined wantarray) { | ||||
| 453 | my $x = $_[0]; | ||||
| 454 | $ref = \$x; # copy | ||||
| 455 | } else { | ||||
| 456 | $ref = \$_[0]; # modify in-place | ||||
| 457 | } | ||||
| 458 | if (defined $_[1] and length $_[1]) { | ||||
| 459 | unless (exists $subst{$_[1]}) { | ||||
| 460 | # Because we can't compile regex we fake it with a cached sub | ||||
| 461 | my $code = "sub {\$_[0] =~ s/([$_[1]])/\$char2entity{\$1} || num_entity(\$1)/ge; }"; | ||||
| 462 | $subst{$_[1]} = eval $code; | ||||
| 463 | die( $@ . " while trying to turn range: \"$_[1]\"\n " | ||||
| 464 | . "into code: $code\n " | ||||
| 465 | ) if $@; | ||||
| 466 | } | ||||
| 467 | &{$subst{$_[1]}}($$ref); | ||||
| 468 | } else { | ||||
| 469 | # Encode control chars, high bit chars and '<', '&', '>', ''' and '"' | ||||
| 470 | $$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} || num_entity($1)/ge; | ||||
| 471 | } | ||||
| 472 | $$ref; | ||||
| 473 | } | ||||
| 474 | |||||
| 475 | sub encode_entities_numeric { | ||||
| 476 | local %char2entity; | ||||
| 477 | return &encode_entities; # a goto &encode_entities wouldn't work | ||||
| 478 | } | ||||
| 479 | |||||
| 480 | |||||
| 481 | sub num_entity { | ||||
| 482 | sprintf "&#x%X;", ord($_[0]); | ||||
| 483 | } | ||||
| 484 | |||||
| 485 | # Set up aliases | ||||
| 486 | 1 | 10µs | *encode = \&encode_entities; | ||
| 487 | 1 | 5µs | *encode_numeric = \&encode_entities_numeric; | ||
| 488 | 1 | 5µs | *encode_numerically = \&encode_entities_numeric; | ||
| 489 | 1 | 5µs | *decode = \&decode_entities; | ||
| 490 | |||||
| 491 | 1 | 125µs | 1; | ||
# spent 18µs within HTML::Entities::CORE:match which was called
# once (18µs+0s) by LWP::Protocol::collect at line 144 of HTML/Entities.pm | |||||
# spent 1.27ms within HTML::Entities::CORE:subst which was called 253 times, avg 5µs/call:
# 253 times (1.27ms+0s) by LWP::Protocol::collect at line 419 of HTML/Entities.pm, avg 5µs/call |