File | /project/perl/lib/URI/Escape.pm |
Statements Executed | 7652 |
Statement Execution Time | 143ms |
Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
---|---|---|---|---|---|
461 | 1 | 1 | 89.3ms | 120ms | uri_escape | URI::Escape::
3688 | 1 | 2 | 24.7ms | 24.7ms | CORE:substcont (opcode) | URI::Escape::
461 | 1 | 1 | 17.0ms | 19.3ms | uri_unescape | URI::Escape::
922 | 2 | 2 | 8.43ms | 8.43ms | CORE:subst (opcode) | URI::Escape::
1 | 1 | 2 | 19µs | 19µs | CORE:match (opcode) | URI::Escape::
0 | 0 | 0 | 0s | 0s | BEGIN | URI::Escape::
0 | 0 | 0 | 0s | 0s | _fail_hi | URI::Escape::
0 | 0 | 0 | 0s | 0s | uri_escape_utf8 | URI::Escape::
Line | State ments |
Time on line |
Calls | Time in subs |
Code |
---|---|---|---|---|---|
1 | # | ||||
2 | # $Id: Escape.pm,v 3.28 2004/11/05 13:58:31 gisle Exp $ | ||||
3 | # | ||||
4 | |||||
5 | package URI::Escape; | ||||
6 | 3 | 171µs | 1 | 28µs | use strict; # spent 28µs making 1 call to strict::import |
7 | |||||
8 | =head1 NAME | ||||
9 | |||||
10 | URI::Escape - Escape and unescape unsafe characters | ||||
11 | |||||
12 | =head1 SYNOPSIS | ||||
13 | |||||
14 | use URI::Escape; | ||||
15 | $safe = uri_escape("10% is enough\n"); | ||||
16 | $verysafe = uri_escape("foo", "\0-\377"); | ||||
17 | $str = uri_unescape($safe); | ||||
18 | |||||
19 | =head1 DESCRIPTION | ||||
20 | |||||
21 | This module provides functions to escape and unescape URI strings as | ||||
22 | defined by RFC 2396 (and updated by RFC 2732). | ||||
23 | A URI consists of a restricted set of characters, | ||||
24 | denoted as C<uric> in RFC 2396. The restricted set of characters | ||||
25 | consists of digits, letters, and a few graphic symbols chosen from | ||||
26 | those common to most of the character encodings and input facilities | ||||
27 | available to Internet users: | ||||
28 | |||||
29 | "A" .. "Z", "a" .. "z", "0" .. "9", | ||||
30 | ";", "/", "?", ":", "@", "&", "=", "+", "$", ",", "[", "]", # reserved | ||||
31 | "-", "_", ".", "!", "~", "*", "'", "(", ")" | ||||
32 | |||||
33 | In addition, any byte (octet) can be represented in a URI by an escape | ||||
34 | sequence: a triplet consisting of the character "%" followed by two | ||||
35 | hexadecimal digits. A byte can also be represented directly by a | ||||
36 | character, using the US-ASCII character for that octet (iff the | ||||
37 | character is part of C<uric>). | ||||
38 | |||||
39 | Some of the C<uric> characters are I<reserved> for use as delimiters | ||||
40 | or as part of certain URI components. These must be escaped if they are | ||||
41 | to be treated as ordinary data. Read RFC 2396 for further details. | ||||
42 | |||||
43 | The functions provided (and exported by default) from this module are: | ||||
44 | |||||
45 | =over 4 | ||||
46 | |||||
47 | =item uri_escape( $string ) | ||||
48 | |||||
49 | =item uri_escape( $string, $unsafe ) | ||||
50 | |||||
51 | Replaces each unsafe character in the $string with the corresponding | ||||
52 | escape sequence and returns the result. The $string argument should | ||||
53 | be a string of bytes. The uri_escape() function will croak if given a | ||||
54 | characters with code above 255. Use uri_escape_utf8() if you know you | ||||
55 | have such chars or/and want chars in the 128 .. 255 range treated as | ||||
56 | UTF-8. | ||||
57 | |||||
58 | The uri_escape() function takes an optional second argument that | ||||
59 | overrides the set of characters that are to be escaped. The set is | ||||
60 | specified as a string that can be used in a regular expression | ||||
61 | character class (between [ ]). E.g.: | ||||
62 | |||||
63 | "\x00-\x1f\x7f-\xff" # all control and hi-bit characters | ||||
64 | "a-z" # all lower case characters | ||||
65 | "^A-Za-z" # everything not a letter | ||||
66 | |||||
67 | The default set of characters to be escaped is all those which are | ||||
68 | I<not> part of the C<uric> character class shown above as well as the | ||||
69 | reserved characters. I.e. the default is: | ||||
70 | |||||
71 | "^A-Za-z0-9\-_.!~*'()" | ||||
72 | |||||
73 | =item uri_escape_utf8( $string ) | ||||
74 | |||||
75 | =item uri_escape_utf8( $string, $unsafe ) | ||||
76 | |||||
77 | Works like uri_escape(), but will encode chars as UTF-8 before | ||||
78 | escaping them. This makes this function able do deal with characters | ||||
79 | with code above 255 in $string. Note that chars in the 128 .. 255 | ||||
80 | range will be escaped differently by this function compared to what | ||||
81 | uri_escape() would. For chars in the 0 .. 127 range there is no | ||||
82 | difference. | ||||
83 | |||||
84 | The call: | ||||
85 | |||||
86 | $uri = uri_escape_utf8($string); | ||||
87 | |||||
88 | will be the same as: | ||||
89 | |||||
90 | use Encode qw(encode); | ||||
91 | $uri = uri_escape(encode("UTF-8", $string)); | ||||
92 | |||||
93 | but will even work for perl-5.6 for chars in the 128 .. 255 range. | ||||
94 | |||||
95 | Note: Javascript has a function called escape() that produce the | ||||
96 | sequence "%uXXXX" for chars in the 256 .. 65535 range. This function | ||||
97 | has really nothing to do with URI escaping but some folks got confused | ||||
98 | since it "does the right thing" in the 0 .. 255 range. Because of | ||||
99 | this you sometimes see "URIs" with these kind of escapes. The | ||||
100 | JavaScript encodeURI() function is similar to uri_escape_utf8(). | ||||
101 | |||||
102 | =item uri_unescape($string,...) | ||||
103 | |||||
104 | Returns a string with each %XX sequence replaced with the actual byte | ||||
105 | (octet). | ||||
106 | |||||
107 | This does the same as: | ||||
108 | |||||
109 | $string =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; | ||||
110 | |||||
111 | but does not modify the string in-place as this RE would. Using the | ||||
112 | uri_unescape() function instead of the RE might make the code look | ||||
113 | cleaner and is a few characters less to type. | ||||
114 | |||||
115 | In a simple benchmark test I did, | ||||
116 | calling the function (instead of the inline RE above) if a few chars | ||||
117 | were unescaped was something like 40% slower, and something like 700% slower if none were. If | ||||
118 | you are going to unescape a lot of times it might be a good idea to | ||||
119 | inline the RE. | ||||
120 | |||||
121 | If the uri_unescape() function is passed multiple strings, then each | ||||
122 | one is returned unescaped. | ||||
123 | |||||
124 | =back | ||||
125 | |||||
126 | The module can also export the C<%escapes> hash, which contains the | ||||
127 | mapping from all 256 bytes to the corresponding escape codes. Lookup | ||||
128 | in this hash is faster than evaluating C<sprintf("%%%02X", ord($byte))> | ||||
129 | each time. | ||||
130 | |||||
131 | =head1 SEE ALSO | ||||
132 | |||||
133 | L<URI> | ||||
134 | |||||
135 | |||||
136 | =head1 COPYRIGHT | ||||
137 | |||||
138 | Copyright 1995-2004 Gisle Aas. | ||||
139 | |||||
140 | This program is free software; you can redistribute it and/or modify | ||||
141 | it under the same terms as Perl itself. | ||||
142 | |||||
143 | =cut | ||||
144 | |||||
145 | 3 | 99µs | 1 | 385µs | use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION); # spent 385µs making 1 call to vars::import |
146 | 3 | 199µs | 1 | 131µs | use vars qw(%escapes); # spent 131µs making 1 call to vars::import |
147 | |||||
148 | 1 | 6µs | require Exporter; | ||
149 | 1 | 12µs | @ISA = qw(Exporter); | ||
150 | 1 | 6µs | @EXPORT = qw(uri_escape uri_unescape); | ||
151 | 1 | 5µs | @EXPORT_OK = qw(%escapes uri_escape_utf8); | ||
152 | 1 | 58µs | 1 | 19µs | $VERSION = sprintf("%d.%02d", q$Revision: 3.28 $ =~ /(\d+)\.(\d+)/); # spent 19µs making 1 call to URI::Escape::CORE:match |
153 | |||||
154 | 3 | 912µs | use Carp (); | ||
155 | |||||
156 | # Build a char->hex map | ||||
157 | 1 | 13µs | for (0..255) { | ||
158 | 256 | 2.64ms | $escapes{chr($_)} = sprintf("%%%02X", $_); | ||
159 | } | ||||
160 | |||||
161 | 1 | 5µs | my %subst; # compiled patternes | ||
162 | |||||
163 | sub uri_escape | ||||
164 | # spent 120ms (89.3+30.8) within URI::Escape::uri_escape which was called 461 times, avg 261µs/call:
# 461 times (89.3ms+30.8ms) by WWW::Google::PageRank::get at line 33 of WWW/Google/PageRank.pm, avg 261µs/call | ||||
165 | 461 | 2.96ms | my($text, $patn) = @_; | ||
166 | 461 | 2.28ms | return undef unless defined $text; | ||
167 | 461 | 5.08ms | if (defined $patn){ | ||
168 | unless (exists $subst{$patn}) { | ||||
169 | # Because we can't compile the regex we fake it with a cached sub | ||||
170 | (my $tmp = $patn) =~ s,/,\\/,g; | ||||
171 | eval "\$subst{\$patn} = sub {\$_[0] =~ s/([$tmp])/\$escapes{\$1} || _fail_hi(\$1)/ge; }"; | ||||
172 | Carp::croak("uri_escape: $@") if $@; | ||||
173 | } | ||||
174 | &{$subst{$patn}}($text); | ||||
175 | } else { | ||||
176 | # Default unsafe characters. RFC 2732 ^(uric - reserved) | ||||
177 | 3688 | 103ms | 4149 | 30.8ms | $text =~ s/([^A-Za-z0-9\-_.!~*'()])/$escapes{$1} || _fail_hi($1)/ge; # spent 24.7ms making 3688 calls to URI::Escape::CORE:substcont, avg 7µs/call
# spent 6.08ms making 461 calls to URI::Escape::CORE:subst, avg 13µs/call |
178 | } | ||||
179 | 461 | 6.98ms | $text; | ||
180 | } | ||||
181 | |||||
182 | sub _fail_hi { | ||||
183 | my $chr = shift; | ||||
184 | Carp::croak(sprintf "Can't escape \\x{%04X}, try uri_escape_utf8() instead", ord($chr)); | ||||
185 | } | ||||
186 | |||||
187 | sub uri_escape_utf8 | ||||
188 | { | ||||
189 | my $text = shift; | ||||
190 | if ($] < 5.008) { | ||||
191 | $text =~ s/([^\0-\x7F])/do {my $o = ord($1); sprintf("%c%c", 0xc0 | ($o >> 6), 0x80 | ($o & 0x3f)) }/ge; | ||||
192 | } | ||||
193 | else { | ||||
194 | utf8::encode($text); | ||||
195 | } | ||||
196 | |||||
197 | return uri_escape($text, @_); | ||||
198 | } | ||||
199 | |||||
200 | sub uri_unescape | ||||
201 | # spent 19.3ms (17.0+2.35) within URI::Escape::uri_unescape which was called 461 times, avg 42µs/call:
# 461 times (17.0ms+2.35ms) by URI::_server::host at line 48 of URI/_server.pm, avg 42µs/call | ||||
202 | # Note from RFC1630: "Sequences which start with a percent sign | ||||
203 | # but are not followed by two hexadecimal characters are reserved | ||||
204 | # for future extension" | ||||
205 | 461 | 2.47ms | my $str = shift; | ||
206 | 461 | 2.06ms | if (@_ && wantarray) { | ||
207 | # not executed for the common case of a single argument | ||||
208 | my @str = ($str, @_); # need to copy | ||||
209 | foreach (@str) { | ||||
210 | s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; | ||||
211 | } | ||||
212 | return @str; | ||||
213 | } | ||||
214 | 461 | 8.41ms | 461 | 2.35ms | $str =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg if defined $str; # spent 2.35ms making 461 calls to URI::Escape::CORE:subst, avg 5µs/call |
215 | 461 | 6.10ms | $str; | ||
216 | } | ||||
217 | |||||
218 | 1 | 31µs | 1; | ||
# spent 19µs within URI::Escape::CORE:match which was called
# once (19µs+0s) by URI::BEGIN at line 152 of URI/Escape.pm | |||||
# spent 8.43ms within URI::Escape::CORE:subst which was called 922 times, avg 9µs/call:
# 461 times (6.08ms+0s) by URI::Escape::uri_escape at line 177 of URI/Escape.pm, avg 13µs/call
# 461 times (2.35ms+0s) by URI::Escape::uri_unescape at line 214 of URI/Escape.pm, avg 5µs/call | |||||
# spent 24.7ms within URI::Escape::CORE:substcont which was called 3688 times, avg 7µs/call:
# 3688 times (24.7ms+0s) by URI::Escape::uri_escape at line 177 of URI/Escape.pm, avg 7µs/call |