Spaces:
Sleeping
Sleeping
File size: 8,144 Bytes
1d777c4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 |
# Text::ParseWords: split strings into tokens on a delimiter while taking
# quotes and backslash escapes into account.
package Text::ParseWords;
use strict;
# Minimum perl version required to load this module.
require 5.006;
our $VERSION = "3.30";
use Exporter;
our @ISA = qw(Exporter);
# Imported into the caller's namespace by default.
our @EXPORT = qw(shellwords quotewords nested_quotewords parse_line);
# Imported only when explicitly requested.
our @EXPORT_OK = qw(old_shellwords);
# Package flag read by parse_line(): when true (and $keep is false), the
# escapes \\ and \' inside single-quoted text are resolved; when false,
# single-quoted text is left exactly as written.
our $PERL_SINGLE_QUOTE;
# shellwords(@lines)
#
# Tokenise every line on runs of whitespace via parse_line('\s+', 0, ...),
# so quotes and backslashes are removed from the resulting words.
#
# Returns the words of all lines as one flat list, or an empty list as soon
# as a non-empty line yields no words (i.e. it could not be parsed).
sub shellwords {
    my @inputs = @_;    # private copies: the caller's strings stay untouched
    my @collected;

    for my $text (@inputs) {
        # Strip leading whitespace before handing the line to the parser.
        $text =~ s/^\s+//;

        my @parsed = parse_line('\s+', 0, $text);

        # parse_line() ends its result with undef when the line finishes on
        # a delimiter; that placeholder is not a word.
        if (@parsed and !defined $parsed[-1]) {
            pop @parsed;
        }

        # Text was present but nothing came back: treat as a parse failure.
        if (!@parsed && length($text)) {
            return ();
        }

        push @collected, @parsed;
    }

    return @collected;
}
# quotewords($delim, $keep, @lines)
#
# Run parse_line($delim, $keep, ...) over each line and concatenate the
# tokens into a single flat list.
#
# Returns that list, or an empty list as soon as a non-empty line produces
# no tokens.
sub quotewords {
    my ($delim, $keep, @lines) = @_;
    my @collected;

    for my $text (@lines) {
        my @tokens = parse_line($delim, $keep, $text);

        # A non-empty line that yields nothing means the parse failed.
        return () if !@tokens && length($text);

        push @collected, @tokens;
    }

    return @collected;
}
# nested_quotewords($delim, $keep, @lines)
#
# Like quotewords(), but keeps the tokens of each line separate: the result
# holds one array reference per input line, in input order.
#
# Returns that list of array references, or an empty list as soon as a
# non-empty line produces no tokens.
sub nested_quotewords {
    my ($delim, $keep, @lines) = @_;
    my @token_lists;

    for my $text (@lines) {
        # A fresh array per iteration, so every line gets its own reference.
        my @tokens = parse_line($delim, $keep, $text);

        # A non-empty line that yields nothing means the parse failed.
        return () if !@tokens && length($text);

        push @token_lists, \@tokens;
    }

    return @token_lists;
}
# parse_line($delimiter, $keep, $line)
#
# Split one string into tokens.
#
#   $delimiter - regular expression source, interpolated into the tokenising
#                pattern; text matching it outside quotes separates tokens.
#   $keep      - false: surrounding quotes are dropped and backslash escapes
#                are resolved in unquoted and double-quoted text (in
#                single-quoted text only \\ and \' are resolved, and only
#                when $PERL_SINGLE_QUOTE is true).
#                true: quotes and backslashes are kept in the tokens.
#                The string 'delimiters' additionally returns every matched
#                delimiter as a token of its own.
#   $line      - the string to parse; the local copy is consumed from the
#                front as parsing proceeds.
#
# Returns the list of tokens. Returns an empty list when the remaining text
# cannot be matched (for example an unterminated quote) or when a match
# consumes nothing. If the line ends with a delimiter, the last element of
# the returned list is undef.
sub parse_line {
my($delimiter, $keep, $line) = @_;
my($word, @pieces);
no warnings 'uninitialized'; # we will be testing undef strings
# Each iteration removes one quoted or unquoted piece from the front of $line.
while (length($line)) {
# This pattern is optimised to be stack conservative on older perls.
# Do not refactor without being careful and testing it on very long strings.
# See Perl bug #42980 for an example of a stack busting input.
# If nothing matches at the front of $line, the whole parse is abandoned.
$line =~ s/^
(?:
# double quoted string
(") # $quote
((?>[^\\"]*(?:\\.[^\\"]*)*))" # $quoted
| # --OR--
# singe quoted string
(') # $quote
((?>[^\\']*(?:\\.[^\\']*)*))' # $quoted
| # --OR--
# unquoted string
( # $unquoted
(?:\\.|[^\\"'])*?
)
# followed by
( # $delim
\Z(?!\n) # EOL
| # --OR--
(?-x:$delimiter) # delimiter
| # --OR--
(?!^)(?=["']) # a quote
)
)//xs or return; # extended layout
# $1 and $2 are set by the double-quoted alternative, $3 and $4 by the
# single-quoted one; $5 and $6 belong to the unquoted alternative.
my ($quote, $quoted, $unquoted, $delim) = (($1 ? ($1,$2) : ($3,$4)), $5, $6);
# A match with no quote, no unquoted text and no delimiter removed nothing
# from $line, so the loop could not make progress; give up.
return() unless( defined($quote) || length($unquoted) || length($delim));
if ($keep) {
# Keep mode: put the quote characters back around the quoted text.
$quoted = "$quote$quoted$quote";
}
else {
# Strip mode: resolve backslash escapes.
$unquoted =~ s/\\(.)/$1/sg;
if (defined $quote) {
$quoted =~ s/\\(.)/$1/sg if ($quote eq '"');
# Single-quoted text is only touched when $PERL_SINGLE_QUOTE is set.
$quoted =~ s/\\([\\'])/$1/g if ( $PERL_SINGLE_QUOTE && $quote eq "'");
}
}
$word .= substr($line, 0, 0); # leave results tainted
# Pieces that follow each other without a delimiter accumulate in one token.
$word .= defined $quote ? $quoted : $unquoted;
if (length($delim)) {
# A delimiter ends the current token.
push(@pieces, $word);
push(@pieces, $delim) if ($keep eq 'delimiters');
undef $word;
}
if (!length($line)) {
# End of input: flush the token in progress. It is undef here when the
# line ended on a delimiter.
push(@pieces, $word);
}
}
return(@pieces);
}
# old_shellwords(@lines) -- legacy whitespace tokeniser
#
# Usage:
#     @words = old_shellwords($line);
#     @words = old_shellwords(@lines);   # arguments are joined into one string
#     @words = old_shellwords();         # defaults to $_ (and clobbers it)
#
# Words are separated by whitespace. Text inside "..." or '...' belongs to
# the current word, and in both quote styles a backslash followed by a
# character is replaced by that character. Outside quotes a backslash
# escapes the following character.
#
# Returns the list of words. On an unmatched quote it warns through
# Carp::carp and returns an empty list.
sub old_shellwords {
    no warnings 'uninitialized';    # we will be testing undef strings

    # With arguments, alias $_ to a scratch string holding their
    # concatenation; without arguments the caller's $_ is consumed directly.
    local *_ = \join('', @_) if @_;

    my @tokens;

    s/\A\s+//;
    until ($_ eq '') {
        my $token = substr($_, 0, 0);    # leave results tainted

        # Glue adjacent quoted and unquoted pieces together until whitespace
        # (or the end of the string) terminates the word.
        PIECE: while (1) {
            my $piece;

            if (s/\A"(([^"\\]|\\.)*)"//s) {
                $piece = $1;
                $piece =~ s#\\(.)#$1#sg;
            }
            elsif (/\A"/) {
                require Carp;
                Carp::carp("Unmatched double quote: $_");
                return ();
            }
            elsif (s/\A'(([^'\\]|\\.)*)'//s) {
                $piece = $1;
                $piece =~ s#\\(.)#$1#sg;
            }
            elsif (/\A'/) {
                require Carp;
                Carp::carp("Unmatched single quote: $_");
                return ();
            }
            elsif (s/\A\\(.?)//s) {
                $piece = $1;
            }
            elsif (s/\A([^\s\\'"]+)//) {
                $piece = $1;
            }
            else {
                # Nothing word-like is left at the front: skip the separator.
                s/\A\s+//;
                last PIECE;
            }

            $token .= $piece;
        }

        push @tokens, $token;
    }

    return @tokens;
}
1;
__END__
=head1 NAME
Text::ParseWords - parse text into an array of tokens or array of arrays
=head1 SYNOPSIS
use Text::ParseWords;
@lists = nested_quotewords($delim, $keep, @lines);
@words = quotewords($delim, $keep, @lines);
@words = shellwords(@lines);
@words = parse_line($delim, $keep, $line);
@words = old_shellwords(@lines); # DEPRECATED!
=head1 DESCRIPTION
The &nested_quotewords() and &quotewords() functions accept a delimiter
(which can be a regular expression)
and a list of lines and then break those lines up into a list of
words ignoring delimiters that appear inside quotes. &quotewords()
returns all of the tokens in a single long list, while &nested_quotewords()
returns a list of token lists corresponding to the elements of @lines.
&parse_line() does tokenizing on a single string. The &*quotewords()
functions simply call &parse_line(), so if you're only splitting
one line you can call &parse_line() directly and save a function
call.
The $keep argument is a boolean flag. If true, then the tokens are
split on the specified delimiter, but all other characters (including
quotes and backslashes) are kept in the tokens. If $keep is false then the
&*quotewords() functions remove all quotes and backslashes that are
not themselves backslash-escaped or inside of single quotes (i.e.,
&quotewords() tries to interpret these characters just like the Bourne
shell). NB: these semantics are significantly different from the
original version of this module shipped with Perl 5.000 through 5.004.
As an additional feature, $keep may be the keyword "delimiters" which
causes the functions to preserve the delimiters in each string as
tokens in the token lists, in addition to preserving quote and
backslash characters.
&shellwords() is written as a special case of &quotewords(), and it
does token parsing with whitespace as a delimiter-- similar to most
Unix shells.
=head1 EXAMPLES
The sample program:
use Text::ParseWords;
@words = quotewords('\s+', 0, q{this is "a test" of\ quotewords \"for you});
$i = 0;
foreach (@words) {
print "$i: <$_>\n";
$i++;
}
produces:
0: <this>
1: <is>
2: <a test>
3: <of quotewords>
4: <"for>
5: <you>
demonstrating:
=over 4
=item 0Z<>
a simple word
=item 1Z<>
multiple spaces are skipped because of our $delim
=item 2Z<>
use of quotes to include a space in a word
=item 3Z<>
use of a backslash to include a space in a word
=item 4Z<>
use of a backslash to remove the special meaning of a double-quote
=item 5Z<>
another simple word (note the lack of effect of the
backslashed double-quote)
=back
Replacing C<quotewords('\s+', 0, q{this is...})>
with C<shellwords(q{this is...})>
is a simpler way to accomplish the same thing.
=head1 SEE ALSO
L<Text::CSV> - for parsing CSV files
=head1 AUTHORS
Maintainer: Alexandr Ciornii <alexchornyATgmail.com>.
Previous maintainer: Hal Pomeranz <[email protected]>, 1994-1997 (Original
author unknown). Much of the code for &parse_line() (including the
primary regexp) from Joerk Behrends <[email protected]>.
Examples section and other documentation provided by John Heidemann
<[email protected]>
Bug reports, patches, and nagging provided by lots of folks-- thanks
everybody! Special thanks to Michael Schwern <[email protected]>
for assuring me that a &nested_quotewords() would be useful, and to
Jeff Friedl <[email protected]> for telling me not to worry about
error-checking (sort of-- you had to be there).
=head1 COPYRIGHT AND LICENSE
This library is free software; you may redistribute and/or modify it
under the same terms as Perl itself.
=cut
|