# Copyright (C) 1997-2001 Damian Conway.  All rights reserved.
# Copyright (C) 2009 Adam Kennedy.
# Copyright (C) 2015 Steve Hay.  All rights reserved.

# This module is free software; you can redistribute it and/or modify it under
# the same terms as Perl itself, i.e. under the terms of either the GNU General
# Public License or the Artistic License, as specified in the F<LICENCE> file.

package Text::Balanced;

# EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS.
# FOR FULL DOCUMENTATION SEE Balanced.pod

use 5.008001;
use strict;
use Exporter ();

use vars qw { $VERSION @ISA %EXPORT_TAGS };
BEGIN {
    $VERSION     = '2.04';
    @ISA         = 'Exporter';
    %EXPORT_TAGS = (
        ALL => [ qw{
            &extract_delimited
            &extract_bracketed
            &extract_quotelike
            &extract_codeblock
            &extract_variable
            &extract_tagged
            &extract_multiple
            &gen_delimited_pat
            &gen_extract_tagged
            &delimited_pat
        } ],
    );
}

# Add every name listed under the :ALL tag to @EXPORT_OK so that callers can
# import them individually or via the tag.
Exporter::export_ok_tags('ALL');

## no critic (Subroutines::ProhibitSubroutinePrototypes)

# PROTOTYPES
# Forward declarations: these helpers are called (some of them recursively)
# before their definitions appear further down the file.
sub _match_bracketed($$$$$$);
sub _match_variable($$);
sub _match_codeblock($$$$$$$);
sub _match_quotelike($$$$);

# HANDLE RETURN VALUES IN VARIOUS CONTEXTS

# Record a failure in $@ as a Text::Balanced::ErrorMsg object.
#   $message - human-readable description of the failure
#   $pos     - offset in the text at which the failure was detected
# The blessed hash is also the return value (value of the assignment).
sub _failmsg {
    my ($message, $pos) = @_;
    $@ = bless {
        error => $message,
        pos   => $pos,
    }, 'Text::Balanced::ErrorMsg';
}

# Produce the standard failure return value for the extract_* functions.
#   $wantarray - the caller's wantarray() value
#   $textref   - reference to the text being parsed
#   $message   - optional failure message; when true it is stored in $@
#   $pos       - offset to store with the message
# Returns (undef, original text, undef) when $wantarray is true, otherwise
# an empty list / undef.
sub _fail {
    my ($wantarray, $textref, $message, $pos) = @_;
    _failmsg($message, $pos) if $message;
    return (undef, $$textref, undef) if $wantarray;
    return;
}

# Produce the standard success return value for the extract_* functions and
# clear $@.
#
# Arguments: ($wantarray, $textref, then a flat list of (offset, length)
# pairs into $$textref):
#   match, remainder, prefix, [further pieces...]
# When more than 18 values follow $textref (only extract_quotelike passes
# that many), the final pair is the here-document "fillet": the text that
# followed the <<LABEL operator on its own line.
#
# List context:   returns the substring for every pair, in order, and sets
#                 pos($$textref) to the start of the remainder.
# Scalar context: returns the matched substring and tries to remove prefix
#                 and match from $$textref, leaving pos at the prefix start.
sub _succeed {
    $@ = undef;
    my ($wantarray,$textref) = splice @_, 0, 2;
    my ($extrapos, $extralen) = @_ > 18
        ? splice(@_, -2, 2)
        : (0, 0);
    # $_[5] is the prefix length; $_[6] is the offset of the first extra
    # piece (the operator position when called from extract_quotelike).
    my ($startlen, $oppos) = @_[5,6];
    my $remainderpos = $_[2];
    if ( $wantarray ) {
        my @res;
        while (my ($from, $len) = splice @_, 0, 2) {
            push @res, substr($$textref, $from, $len);
        }
        if ( $extralen ) { # CORRECT FILLET
            # Cut the fillet out of the match and prepend it to the remainder.
            my $extra = substr($res[0], $extrapos-$oppos, $extralen, "\n");
            $res[1] = "$extra$res[1]";
            # REARRANGE HERE DOC AND FILLET IF POSSIBLE
            # (eval: the edit is best-effort; presumably it fails when the
            # text is not modifiable)
            eval { substr($$textref,$remainderpos,0) = $extra;
                   substr($$textref,$extrapos,$extralen,"\n")} ;
            pos($$textref) = $remainderpos-$extralen+1; # RESET \G
        } else {
            pos($$textref) = $remainderpos;             # RESET \G
        }
        return @res;
    } else {
        my $match = substr($$textref,$_[0],$_[1]);
        substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen;
        my $extra = $extralen
            ? substr($$textref, $extrapos, $extralen)."\n" : "";
        # CHOP OUT PREFIX & MATCH, IF POSSIBLE
        eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ;
        pos($$textref) = $_[4];                         # RESET \G
        return $match;
    }
}

# BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING

# gen_delimited_pat($delimiters; $escapes)
#   $delimiters - string whose characters are each an allowed delimiter
#   $escapes    - escape character for each delimiter, position by position;
#                 defaults to a backslash, and the last escape is repeated
#                 when fewer escapes than delimiters are given
# Returns a string holding a non-capturing regex alternation with one branch
# per delimiter, or "" when $delimiters contains no non-whitespace character.
sub gen_delimited_pat($;$)  # ($delimiters;$escapes)
{
    my ($dels, $escs) = @_;
    return "" unless $dels =~ /\S/;
    $escs = '\\' unless $escs;
    # Pad the escape list with its last character up to the delimiter count.
    $escs .= substr($escs,-1) x (length($dels)-length($escs));
    my @pat = ();
    for my $i (0 .. length($dels) - 1)
    {
        my $del = quotemeta substr($dels,$i,1);
        my $esc = quotemeta substr($escs,$i,1);
        if ($del eq $esc)
        {
            # Delimiter is its own escape: a doubled delimiter is a literal.
            push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del";
        }
        else
        {
            push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del";
        }
    }
    my $pat = join '|', @pat;
    return "(?:$pat)";
}

# Alternative name for gen_delimited_pat (also listed in the :ALL export tag).
*delimited_pat = \&gen_delimited_pat;

# THE EXTRACTION FUNCTIONS

# extract_delimited($text, $delimiters, $prefix_pattern, $escapes)
# Extract a delimited string at the current pos() of $text (or of $_ when
# $text is undefined).  Defaults: delimiters ' " `, prefix \s*, escape \.
# On failure the error is reported at offset 0.
# Return value follows _succeed / _fail (match, remainder, prefix).
sub extract_delimited (;$$$$)
{
    my $textref = defined $_[0] ? \$_[0] : \$_;
    my $wantarray = wantarray;
    my $del  = defined $_[1] ? $_[1] : qq{\'\"\`};
    my $pre  = defined $_[2] ? $_[2] : '\s*';
    my $esc  = defined $_[3] ? $_[3] : qq{\\};
    my $pat = gen_delimited_pat($del, $esc);
    my $startpos = pos $$textref || 0;
    return _fail($wantarray, $textref, "Not a delimited pattern", 0)
        unless $$textref =~ m/\G($pre)($pat)/gc;
    my $prelen = length($1);
    my $matchpos = $startpos+$prelen;
    my $endpos = pos $$textref;
    return _succeed($wantarray, $textref,
                    $matchpos, $endpos-$matchpos,               # MATCH
                    $endpos,   length($$textref)-$endpos,       # REMAINDER
                    $startpos, $prelen);                        # PREFIX
}

# extract_bracketed($text, $delimiters, $prefix_pattern)
# Extract a balanced bracketed substring at the current pos() of $text (or
# of $_ when $text is undefined).
#   $delimiters - any of the bracket characters (){}[]<>, optionally with
#                 ' " ` (quoted runs inside are skipped) and/or q (quotelike
#                 constructs inside are skipped).  Default '{([<'.
#   $prefix     - pattern to skip first, default \s*
# Return value follows _succeed / _fail (match, remainder, prefix).
sub extract_bracketed (;$$$)
{
    my $textref = defined $_[0] ? \$_[0] : \$_;
    my $ldel = defined $_[1] ? $_[1] : '{([<';
    my $pre = defined $_[2] ? $_[2] : '\s*';
    my $wantarray = wantarray;

    my $qdel = "";
    my $quotelike;
    # Strip the quote/quotelike flags out of the delimiter specification.
    $ldel =~ s/'//g and $qdel .= q{'};
    $ldel =~ s/"//g and $qdel .= q{"};
    $ldel =~ s/`//g and $qdel .= q{`};
    $ldel =~ s/q//g and $quotelike = 1;
    # Map closing brackets to their opening form, delete everything else,
    # and squeeze repeats.
    $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds;
    my $rdel = $ldel;
    unless ($rdel =~ tr/[({</])}>/)
    {
        return _fail($wantarray, $textref,
                     "Did not find a suitable bracket in delimiter: \"$_[1]\"",
                     0);
    }
    # Save and restore pos($_) around the split/map/join below.
    my $posbug = pos;
    $ldel = join('|', map { quotemeta $_ } split('', $ldel));
    $rdel = join('|', map { quotemeta $_ } split('', $rdel));
    pos = $posbug;

    my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel);

    return _fail($wantarray, $textref) unless @match;

    return _succeed($wantarray, $textref,
                    $match[2], $match[5]+2,     # MATCH (contents plus both brackets)
                    @match[8,9],                # REMAINDER
                    @match[0,1],                # PREFIX
                   );
}

# Core matcher behind extract_bracketed.
#   $textref   - reference to the text; matching starts at pos($$textref)
#   $pre       - prefix pattern
#   $ldel      - regex alternation of opening brackets
#   $qdel      - quote characters whose quoted runs are skipped ("" for none)
#   $quotelike - true to skip Perl quotelike constructs as well
#   $rdel      - regex alternation of closing brackets
# Returns five (offset, length) pairs -- prefix, opening bracket, contents,
# closing bracket, remainder -- or an empty list on failure, in which case
# $@ is set via _failmsg.
sub _match_bracketed($$$$$$)    # $textref, $pre, $ldel, $qdel, $quotelike, $rdel
{
    my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_;
    # Normalise pos to a defined value; only $startpos is initialised here.
    my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0);
    unless ($$textref =~ m/\G$pre/gc)
    {
        _failmsg "Did not find prefix: /$pre/", $startpos;
        return;
    }

    $ldelpos = pos $$textref;

    unless ($$textref =~ m/\G($ldel)/gc)
    {
        _failmsg "Did not find opening bracket after prefix: \"$pre\"",
                 pos $$textref;
        pos $$textref = $startpos;
        return;
    }

    # Stack of currently open brackets.
    my @nesting = ( $1 );
    my $textlen = length $$textref;
    while (pos $$textref < $textlen)
    {
        # Skip backslash-escaped characters.
        next if $$textref =~ m/\G\\./gcs;

        if ($$textref =~ m/\G($ldel)/gc)
        {
            push @nesting, $1;
        }
        elsif ($$textref =~ m/\G($rdel)/gc)
        {
            my ($found, $brackettype) = ($1, $1);
            if ($#nesting < 0)
            {
                _failmsg "Unmatched closing bracket: \"$found\"",
                         pos $$textref;
                pos $$textref = $startpos;
                return;
            }
            my $expected = pop(@nesting);
            $expected =~ tr/({[</)}]>/;
            if ($expected ne $brackettype)
            {
                _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"},
                         pos $$textref;
                pos $$textref = $startpos;
                return;
            }
            # Outermost bracket closed: done.
            last if $#nesting < 0;
        }
        elsif ($qdel && $$textref =~ m/\G([$qdel])/gc)
        {
            # Skip to the matching unescaped closing quote.
            $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next;
            _failmsg "Unmatched embedded quote ($1)",
                     pos $$textref;
            pos $$textref = $startpos;
            return;
        }
        elsif ($quotelike && _match_quotelike($textref,"",1,0))
        {
            next;
        }

        # Otherwise consume one alphanumeric run or one character.
        else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs }
    }
    if ($#nesting>=0)
    {
        _failmsg "Unmatched opening bracket(s): "
                 . join("..",@nesting)."..",
                 pos $$textref;
        pos $$textref = $startpos;
        return;
    }

    $endpos = pos $$textref;

    return (
        $startpos,  $ldelpos-$startpos,         # PREFIX
        $ldelpos,   1,                          # OPENING BRACKET
        $ldelpos+1, $endpos-$ldelpos-2,         # CONTENTS
        $endpos-1,  1,                          # CLOSING BRACKET
        $endpos,    length($$textref)-$endpos,  # REMAINDER
    );
}

# Return the closing counterpart of a run of opening brackets: the string is
# reversed and each of [ ( { < is replaced by ] ) } >.
sub _revbracket($)
{
    my $brack = reverse $_[0];
    $brack =~ tr/[({</])}>/;
    return $brack;
}

# Pattern (as a string) for a tag name; used by _match_tagged when it has to
# derive a closing tag from the opening tag.
my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*};

# extract_tagged($text, $opentag, $closetag, $prefix_pattern, \%options)
# Extract a substring bounded by an opening and a closing tag, starting at
# the current pos() of $text (or of $_ when $text is undefined).
#   $opentag  - pattern for the opening tag; default matches <word ...>
#   $closetag - pattern for the closing tag; when undefined it is derived
#               from the opening tag that was actually matched
#   $prefix   - pattern to skip first, default \s*
#   %options  - reject => pattern or array ref of patterns that abort the match
#               ignore => pattern or array ref of patterns that are skipped
#               fail   => 'MAX' or 'PARA' to return a partial match instead
#                         of failing
# Returns (via _succeed) match, remainder, prefix, opening tag, text,
# closing tag; or the _fail value.
sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options)
{
    my $textref = defined $_[0] ? \$_[0] : \$_;
    my $ldel    = $_[1];
    my $rdel    = $_[2];
    my $pre     = defined $_[3] ? $_[3] : '\s*';
    my %options = defined $_[4] ? %{$_[4]} : ();
    my $omode   = defined $options{fail} ? $options{fail} : '';
    my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
                : defined($options{reject})        ? $options{reject}
                :                                    ''
                ;
    my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
                : defined($options{ignore})        ? $options{ignore}
                :                                    ''
                ;

    if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
    $@ = undef;

    my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);

    return _fail(wantarray, $textref) unless @match;
    return _succeed(wantarray, $textref,
                    $match[2], $match[3]+$match[5]+$match[7],    # MATCH
                    @match[8..9,0..1,2..7]);                     # REM, PRE, BITS
}

# Core matcher behind extract_tagged and gen_extract_tagged.
#   $textref - reference to the text; matching starts at pos($$textref)
#   $pre     - prefix pattern
#   $ldel    - opening-tag pattern
#   $rdel    - closing-tag specification, or undef to derive it from the
#              opening tag that was matched
#   $omode   - '' (fail), 'MAX' or 'PARA' (return a shortened match)
#   $bad     - pattern that aborts the match ('' for none)
#   $ignore  - pattern that is skipped over ('' for none)
# Returns five (offset, length) pairs -- prefix, opening tag, text, closing
# tag, remainder -- or an empty list on failure with $@ set and pos restored.
sub _match_tagged       # ($$$$$$$)
{
    my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_;
    my $rdelspec;

    # Normalise pos to a defined value; only $startpos is initialised here.
    my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 );

    unless ($$textref =~ m/\G($pre)/gc)
    {
        _failmsg "Did not find prefix: /$pre/", pos $$textref;
        goto failed;
    }

    $opentagpos = pos($$textref);

    unless ($$textref =~ m/\G$ldel/gc)
    {
        _failmsg "Did not find opening tag: /$ldel/", pos $$textref;
        goto failed;
    }

    $textpos = pos($$textref);

    if (!defined $rdel)
    {
        # Derive the closing tag from the opening tag text just matched:
        # leading brackets + "/" + name + mirrored brackets.
        $rdelspec = substr($$textref, $-[0], $+[0] - $-[0]);
        unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". _revbracket($1) /oes)
        {
            # $rdel is undef in this branch; report the opening tag text
            # (still in $rdelspec because the substitution failed).
            _failmsg "Unable to construct closing tag to match: $rdelspec",
                     pos $$textref;
            goto failed;
        }
    }
    else
    {
        # NOTE(review): $rdel is evaluated as Perl string-interpolation code;
        # it must never come from untrusted input.
        ## no critic (BuiltinFunctions::ProhibitStringyEval)
        $rdelspec = eval "qq{$rdel}" || do {
            # qq{} failed or gave a false value: retry with the first
            # delimiter character that does not occur in $rdel.
            my $del;
            for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',)
                { next if $rdel =~ /\Q$_/; $del = $_; last }
            unless ($del) {
                use Carp;
                croak "Can't interpolate right delimiter $rdel"
            }
            eval "qq$del$rdel$del";
        };
    }

    while (pos($$textref) < length($$textref))
    {
        # Skip backslash-escaped characters.
        next if $$textref =~ m/\G\\./gc;

        if ($$textref =~ m/\G(\n[ \t]*\n)/gc )
        {
            # Remember the first paragraph break for 'PARA' mode.
            $parapos = pos($$textref) - length($1)
                unless defined $parapos;
        }
        elsif ($$textref =~ m/\G($rdelspec)/gc )
        {
            $closetagpos = pos($$textref)-length($1);
            goto matched;
        }
        elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc)
        {
            next;
        }
        elsif ($bad && $$textref =~ m/\G($bad)/gcs)
        {
            pos($$textref) -= length($1);       # CUT OFF WHATEVER CAUSED THE SHORTNESS
            goto short if ($omode eq 'PARA' || $omode eq 'MAX');
            _failmsg "Found invalid nested tag: $1", pos $$textref;
            goto failed;
        }
        elsif ($$textref =~ m/\G($ldel)/gc)
        {
            my $tag = $1;
            pos($$textref) -= length($tag);     # REWIND TO NESTED TAG
            unless (_match_tagged(@_))          # MATCH NESTED TAG
            {
                goto short if $omode eq 'PARA' || $omode eq 'MAX';
                _failmsg "Found unbalanced nested tag: $tag",
                         pos $$textref;
                goto failed;
            }
        }
        else { $$textref =~ m/./gcs }
    }

short:
    # Reached on end of text (fall-through) or via "goto short".
    $closetagpos = pos($$textref);
    goto matched if $omode eq 'MAX';
    goto failed unless $omode eq 'PARA';

    # 'PARA' mode: cut the text at the first paragraph break, if any.
    if (defined $parapos) { pos($$textref) = $parapos }
    else                  { $parapos = pos($$textref) }

    return (
        $startpos,    $opentagpos-$startpos,            # PREFIX
        $opentagpos,  $textpos-$opentagpos,             # OPENING TAG
        $textpos,     $parapos-$textpos,                # TEXT
        $parapos,     0,                                # NO CLOSING TAG
        $parapos,     length($$textref)-$parapos,       # REMAINDER
    );

matched:
    $endpos = pos($$textref);
    return (
        $startpos,    $opentagpos-$startpos,            # PREFIX
        $opentagpos,  $textpos-$opentagpos,             # OPENING TAG
        $textpos,     $closetagpos-$textpos,            # TEXT
        $closetagpos, $endpos-$closetagpos,             # CLOSING TAG
        $endpos,      length($$textref)-$endpos,        # REMAINDER
    );

failed:
    _failmsg "Did not find closing tag", pos $$textref unless $@;
    pos($$textref) = $startpos;
    return;
}

# extract_variable($text, $prefix_pattern)
# Extract a Perl variable expression at the current pos() of $text (or of
# $_ when $text is undefined).  The prefix pattern defaults to \s*.
# Returns ("","","") when the text itself is undefined; otherwise the
# _succeed / _fail value (match, remainder, prefix).
sub extract_variable (;$$)
{
    my $textref = defined $_[0] ? \$_[0] : \$_;
    return ("","","") unless defined $$textref;
    my $pre  = defined $_[1] ? $_[1] : '\s*';

    my @match = _match_variable($textref,$pre);

    return _fail(wantarray, $textref) unless @match;

    return _succeed(wantarray, $textref,
                    @match[2..3,4..5,0..1]);    # MATCH, REMAINDER, PREFIX
}

# Core matcher behind extract_variable.
#   $textref - reference to the text; matching starts at pos($$textref)
#   $pre     - prefix pattern
# Returns three (offset, length) pairs -- prefix, variable, remainder -- or
# an empty list on failure with $@ set via _failmsg.
sub _match_variable($$)
{
#  $#
#  $^
#  $$
    my ($textref, $pre) = @_;
    my $startpos = pos($$textref) = pos($$textref)||0;
    unless ($$textref =~ m/\G($pre)/gc)
    {
        _failmsg "Did not find prefix: /$pre/", pos $$textref;
        return;
    }
    my $varpos = pos($$textref);
    # First try a punctuation / numeric / caret variable such as $1, $& or $^W.
    unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci)
    {
        # Otherwise require one or more sigils / dereferencers ...
        unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc)
        {
            _failmsg "Did not find leading dereferencer", pos $$textref;
            pos $$textref = $startpos;
            return;
        }
        my $deref = $1;

        # ... followed by an identifier or a {...} block; a bare $# or $$ is
        # accepted without either.
        unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
            or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0)
            or $deref eq '$#' or $deref eq '$$' )
        {
            _failmsg "Bad identifier after dereferencer", pos $$textref;
            pos $$textref = $startpos;
            return;
        }
    }

    # Consume any trailing subscripts, method calls and dereferences.
    while (1)
    {
        next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc;
        next if _match_codeblock($textref,
                                 qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/,
                                 qr/[({[]/, qr/[)}\]]/,
                                 qr/[({[]/, qr/[)}\]]/, 0);
        next if _match_codeblock($textref,
                                 qr/\s*/, qr/[{[]/, qr/[}\]]/,
                                 qr/[{[]/, qr/[}\]]/, 0);
        next if _match_variable($textref,'\s*->\s*');
        next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc;
        last;
    }

    my $endpos = pos($$textref);
    return ($startpos, $varpos-$startpos,
            $varpos,   $endpos-$varpos,
            $endpos,   length($$textref)-$endpos
           );
}

# extract_codeblock($text, $inner_delims, $prefix_pattern, $outer_delims, $rd)
# Extract a bracketed block of Perl-like code at the current pos() of $text
# (or of $_ when $text is undefined).
#   $inner_delims - bracket characters allowed for nested blocks, default '{'
#   $prefix       - pattern to skip first, default \s*
#   $outer_delims - bracket characters allowed for the outermost block;
#                   defaults to $inner_delims
#   $rd           - passed through to _match_codeblock, where a true value
#                   makes it step over the tokens (?) (s?) and (s)
# Return value follows _succeed / _fail (match, remainder, prefix).
sub extract_codeblock (;$$$$$)
{
    my $textref = defined $_[0] ? \$_[0] : \$_;
    my $wantarray = wantarray;
    my $ldel_inner = defined $_[1] ? $_[1] : '{';
    my $pre        = defined $_[2] ? $_[2] : '\s*';
    my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner;
    my $rd         = $_[4];
    my $rdel_inner = $ldel_inner;
    my $rdel_outer = $ldel_outer;
    # Save and restore pos($_): the loops below alias and rewrite $_.
    my $posbug = pos;
    # Normalise to opening / closing forms, deleting every non-bracket character.
    for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds }
    for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds }
    # Turn each bracket set into a capturing alternation.
    for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer)
    {
        $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')'
    }
    pos = $posbug;

    my @match = _match_codeblock($textref, $pre,
                                 $ldel_outer, $rdel_outer,
                                 $ldel_inner, $rdel_inner,
                                 $rd);
    return _fail($wantarray, $textref) unless @match;
    return _succeed($wantarray, $textref,
                    @match[2..3,4..5,0..1]      # MATCH, REMAINDER, PREFIX
                   );
}

# Core matcher behind extract_codeblock.
#   $textref     - reference to the text; matching starts at pos($$textref)
#   $pre         - prefix pattern
#   $ldel_outer, $rdel_outer - patterns for the outermost bracket pair
#   $ldel_inner, $rdel_inner - patterns for nested bracket pairs
#   $rd          - when true, the tokens (?) (s?) and (s) are stepped over
# Returns three (offset, length) pairs -- prefix, code block, remainder --
# or an empty list on failure with $@ set via _failmsg.
sub _match_codeblock($$$$$$$)
{
    my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_;
    my $startpos = pos($$textref) = pos($$textref) || 0;
    unless ($$textref =~ m/\G($pre)/gc)
    {
        _failmsg qq{Did not match prefix /$pre/ at "} .
                     substr($$textref,pos($$textref),20) .
                     q{..."},
                 pos $$textref;
        return;
    }
    my $codepos = pos($$textref);
    unless ($$textref =~ m/\G($ldel_outer)/gc)  # OUTERMOST DELIMITER
    {
        _failmsg qq{Did not find expected opening bracket at "} .
                     substr($$textref,pos($$textref),20) .
                     q{..."},
                 pos $$textref;
        pos $$textref = $startpos;
        return;
    }
    # The closing bracket that must terminate this block.
    my $closing = $1;
       $closing =~ tr/([<{/)]>}/;
    my $matched;
    # True while a following / or ? may start a pattern match (it is passed
    # to _match_quotelike as both $rawmatch and $qmark).
    my $patvalid = 1;
    while (pos($$textref) < length($$textref))
    {
        $matched = '';
        if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc)
        {
            $patvalid = 0;
            next;
        }

        # Skip comments.
        if ($$textref =~ m/\G\s*#.*/gc)
        {
            next;
        }

        if ($$textref =~ m/\G\s*($rdel_outer)/gc)
        {
            unless ($matched = ($closing && $1 eq $closing) )
            {
                next if $1 eq '>';      # MIGHT BE A "LESS THAN"
                _failmsg q{Mismatched closing bracket at "} .
                             substr($$textref,pos($$textref),20) .
                             qq{...". Expected '$closing'},
                         pos $$textref;
            }
            # "last" bypasses the continue block, so $@ set above survives.
            last;
        }

        if (_match_variable($textref,'\s*') ||
            _match_quotelike($textref,'\s*',$patvalid,$patvalid) )
        {
            $patvalid = 0;
            next;
        }

        # NEED TO COVER MANY MORE CASES HERE!!!
        # Operators and keywords after which a pattern may follow.
        if ($$textref =~ m#\G\s*(?!$ldel_inner)
                                ( [-+*x/%^&|.]=?
                                | [!=]~
                                | =(?!>)
                                | (\*\*|&&|\|\||<<|>>)=?
                                | split|grep|map|return
                                | [([]
                                )#gcx)
        {
            $patvalid = 1;
            next;
        }

        # Nested block.
        if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) )
        {
            $patvalid = 1;
            next;
        }

        if ($$textref =~ m/\G\s*$ldel_outer/gc)
        {
            _failmsg q{Improperly nested codeblock at "} .
                         substr($$textref,pos($$textref),20) .
                         q{..."},
                     pos $$textref;
            last;
        }

        # Anything else: consume one token.
        $patvalid = 0;
        $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc;
    }
    # Errors recorded by the failed sub-matchers above are not failures of
    # this block; clear them after each "next".
    continue { $@ = undef }

    unless ($matched)
    {
        _failmsg 'No match found for opening bracket', pos $$textref
            unless $@;
        return;
    }

    my $endpos = pos($$textref);
    return ( $startpos, $codepos-$startpos,
             $codepos,  $endpos-$codepos,
             $endpos,   length($$textref)-$endpos,
           );
}

# Modifier-letter pattern allowed after each quotelike operator; 'none' is
# used for bare /.../ and ?...? matches.
my %mods   = (
    'none' => '[cgimsox]*',
    'm'    => '[cgimsox]*',
    's'    => '[cegimsox]*',
    'tr'   => '[cds]*',
    'y'    => '[cds]*',
    'qq'   => '',
    'qx'   => '',
    'qw'   => '',
    'qr'   => '[imsx]*',
    'q'    => '',
);

# extract_quotelike($text, $prefix_pattern)
# Extract a Perl quote or quotelike operation (quoted string, m//, s///,
# tr///, q{} family, here-document) at the current pos() of $text (or of $_
# when $text is undefined).  The prefix pattern defaults to \s*.
# Returns (via _succeed) match, remainder, prefix, followed by the pieces
# reported by _match_quotelike; or the _fail value.
sub extract_quotelike (;$$)
{
    # Test definedness, not truth, so that "" and "0" are treated as text
    # rather than silently falling back to $_ (as the other extractors do).
    my $textref = defined $_[0] ? \$_[0] : \$_;
    my $wantarray = wantarray;
    my $pre  = defined $_[1] ? $_[1] : '\s*';

    my @match = _match_quotelike($textref,$pre,1,0);
    return _fail($wantarray, $textref) unless @match;
    return _succeed($wantarray, $textref,
                    $match[2], $match[18]-$match[2],    # MATCH
                    @match[18,19],                      # REMAINDER
                    @match[0,1],                        # PREFIX
                    @match[2..17],                      # THE BITS
                    @match[20,21],                      # ANY FILLET?
                   );
}

# Core matcher behind extract_quotelike.
#   $textref  - reference to the text; matching starts at pos($$textref)
#   $pre      - prefix pattern
#   $rawmatch - true to accept a bare /.../ as a pattern match
#   $qmark    - true to accept a bare ?...? as a pattern match
# Returns ten (offset, length) pairs -- prefix, operator, left delimiter,
# string/pattern, right delimiter, 2nd left delimiter, 2nd string, 2nd right
# delimiter, modifiers, remainder -- plus an eleventh "fillet" pair for
# here-documents.  Returns an empty list on failure; most failure paths also
# set $@ via _failmsg.
sub _match_quotelike($$$$)      # ($textref, $prepat, $allow_raw_match, $allow_qmark)
{
    my ($textref, $pre, $rawmatch, $qmark) = @_;

    # Only $textlen and $startpos are initialised here.
    my ($textlen,$startpos,
        $oppos,
        $preld1pos,$ld1pos,$str1pos,$rd1pos,
        $preld2pos,$ld2pos,$str2pos,$rd2pos,
        $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 );

    unless ($$textref =~ m/\G($pre)/gc)
    {
        _failmsg qq{Did not find prefix /$pre/ at "} .
                     substr($$textref, pos($$textref), 20) .
                     q{..."},
                 pos $$textref;
        return;
    }
    $oppos = pos($$textref);

    my $initial = substr($$textref,$oppos,1);

    # Plain quoted string, or a bare /.../ or ?...? match when allowed.
    if ($initial && $initial =~ m|^[\"\'\`]|
                 || $rawmatch && $initial =~ m|^/|
                 || $qmark && $initial =~ m|^\?|)
    {
        unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx)
        {
            _failmsg qq{Did not find closing delimiter to match '$initial' at "} .
                         substr($$textref, $oppos, 20) .
                         q{..."},
                     pos $$textref;
            pos $$textref = $startpos;
            return;
        }
        $modpos= pos($$textref);
        $rd1pos = $modpos-1;

        if ($initial eq '/' || $initial eq '?')
        {
            $$textref =~ m/\G$mods{none}/gc
        }

        my $endpos = pos($$textref);
        return (
            $startpos,  $oppos-$startpos,       # PREFIX
            $oppos,     0,                      # NO OPERATOR
            $oppos,     1,                      # LEFT DEL
            $oppos+1,   $rd1pos-$oppos-1,       # STR/PAT
            $rd1pos,    1,                      # RIGHT DEL
            $modpos,    0,                      # NO 2ND LDEL
            $modpos,    0,                      # NO 2ND STR
            $modpos,    0,                      # NO 2ND RDEL
            $modpos,    $endpos-$modpos,        # MODIFIERS
            $endpos,    $textlen-$endpos,       # REMAINDER
        );
    }

    unless ($$textref =~ m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc)
    {
        _failmsg q{No quotelike operator found after prefix at "} .
                     substr($$textref, pos($$textref), 20) .
                     q{..."},
                 pos $$textref;
        pos $$textref = $startpos;
        return;
    }

    my $op = $1;
    $preld1pos = pos($$textref);
    if ($op eq '<<') {
        # Here-document: find the label, then the body up to the label line.
        $ld1pos = pos($$textref);
        my $label;
        if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) {
            $label = $1;
        }
        elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) '
                             | \G " ([^"\\]* (?:\\.[^"\\]*)*) "
                             | \G ` ([^`\\]* (?:\\.[^`\\]*)*) `
                             }gcsx) {
            $label = $+;
        }
        else {
            $label = "";
        }
        # Everything from here to the end of the line is the "fillet".
        my $extrapos = pos($$textref);
        $$textref =~ m{.*\n}gc;
        # Record the body start, then step pos back onto the newline so the
        # next match can begin there.
        $str1pos = pos($$textref)--;
        unless ($$textref =~ m{.*?\n(?=\Q$label\E\n)}gc) {
            _failmsg qq{Missing here doc terminator ('$label') after "} .
                         substr($$textref, $startpos, 20) .
                         q{..."},
                     pos $$textref;
            pos $$textref = $startpos;
            return;
        }
        $rd1pos = pos($$textref);
        $$textref =~ m{\Q$label\E\n}gc;
        $ld2pos = pos($$textref);
        return (
            $startpos,  $oppos-$startpos,       # PREFIX
            $oppos,     length($op),            # OPERATOR
            $ld1pos,    $extrapos-$ld1pos,      # LEFT DEL
            $str1pos,   $rd1pos-$str1pos,       # STR/PAT
            $rd1pos,    $ld2pos-$rd1pos,        # RIGHT DEL
            $ld2pos,    0,                      # NO 2ND LDEL
            $ld2pos,    0,                      # NO 2ND STR
            $ld2pos,    0,                      # NO 2ND RDEL
            $ld2pos,    0,                      # NO MODIFIERS
            $ld2pos,    $textlen-$ld2pos,       # REMAINDER
            $extrapos,  $str1pos-$extrapos,     # FILLETED BIT
        );
    }

    $$textref =~ m/\G\s*/gc;
    $ld1pos = pos($$textref);
    $str1pos = $ld1pos+1;

    unless ($$textref =~ m/\G(\S)/gc)   # SHOULD USE LOOKAHEAD
    {
        _failmsg "No block delimiter found after quotelike $op",
                 pos $$textref;
        pos $$textref = $startpos;
        return;
    }
    pos($$textref) = $ld1pos;   # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN
    my ($ldel1, $rdel1) = ("\Q$1","\Q$1");
    if ($ldel1 =~ /[[(<{]/)
    {
        # Bracketing delimiter: the first argument is a balanced block.
        $rdel1 =~ tr/[({</])}>/;
        defined(_match_bracketed($textref,"",$ldel1,"","",$rdel1))
            || do { pos $$textref = $startpos; return };
        $ld2pos = pos($$textref);
        $rd1pos = $ld2pos-1;
    }
    else
    {
        $$textref =~ /\G$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs
            || do { pos $$textref = $startpos; return };
        $ld2pos = $rd1pos = pos($$textref)-1;
    }

    # s, tr and y take a second argument.
    my $second_arg = $op =~ /s|tr|y/ ? 1 : 0;
    if ($second_arg)
    {
        my ($ldel2, $rdel2);
        if ($ldel1 =~ /[[(<{]/)
        {
            unless ($$textref =~ /\G\s*(\S)/gc) # SHOULD USE LOOKAHEAD
            {
                _failmsg "Missing second block for quotelike $op",
                         pos $$textref;
                pos $$textref = $startpos;
                return;
            }
            $ldel2 = $rdel2 = "\Q$1";
            $rdel2 =~ tr/[({</])}>/;
        }
        else
        {
            $ldel2 = $rdel2 = $ldel1;
        }
        $str2pos = $ld2pos+1;

        if ($ldel2 =~ /[[(<{]/)
        {
            pos($$textref)--;   # OVERCOME BROKEN LOOKAHEAD
            defined(_match_bracketed($textref,"",$ldel2,"","",$rdel2))
                || do { pos $$textref = $startpos; return };
        }
        else
        {
            # Anchored with \G: without it a failed match at the current
            # position could restart in the middle of an escape sequence and
            # accept an escaped delimiter as the terminator.
            $$textref =~ /\G[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs
                || do { pos $$textref = $startpos; return };
        }
        $rd2pos = pos($$textref)-1;
    }
    else
    {
        $ld2pos = $str2pos = $rd2pos = $rd1pos;
    }

    $modpos = pos $$textref;

    $$textref =~ m/\G($mods{$op})/gc;
    my $endpos = pos $$textref;

    return (
        $startpos,      $oppos-$startpos,       # PREFIX
        $oppos,         length($op),            # OPERATOR
        $ld1pos,        1,                      # LEFT DEL
        $str1pos,       $rd1pos-$str1pos,       # STR/PAT
        $rd1pos,        1,                      # RIGHT DEL
        $ld2pos,        $second_arg,            # 2ND LDEL (MAYBE)
        $str2pos,       $rd2pos-$str2pos,       # 2ND STR (MAYBE)
        $rd2pos,        $second_arg,            # 2ND RDEL (MAYBE)
        $modpos,        $endpos-$modpos,        # MODIFIERS
        $endpos,        $textlen-$endpos,       # REMAINDER
    );
}

# Default extractors for extract_multiple: variables, quotelike constructs
# and {...} code blocks, each tried with an empty prefix.
my $def_func = [
    sub { extract_variable($_[0], '') },
    sub { extract_quotelike($_[0],'') },
    sub { extract_codeblock($_[0],'{}','') },
];

# extract_multiple($text, \@extractors, $max_fields, $ignore_unknown)
# Split $text (or $_ when $text is undefined) into fields by repeatedly
# trying each extractor at the current position.
#   @extractors - each entry is a code ref, a Text::Balanced::Extractor
#                 object, a pattern (string or qr//), or a single-pair hash
#                 { ClassName => extractor }, in which case the extracted
#                 field is returned as a scalar ref blessed into ClassName.
#                 Defaults to $def_func.
#   $max_fields - stop after this many fields (forced to 1 in scalar context)
#   $ignore_unknown - when true, text matched by no extractor is dropped
#                 instead of being returned as a field of its own
# List context:   returns the fields and leaves pos after the last one.
# Scalar context: returns the first field and tries to remove it from $text.
sub extract_multiple (;$$$$)    # ($text, $functions_ref, $max_fields, $ignoreunknown)
{
    my $textref = defined($_[0]) ? \$_[0] : \$_;
    my ($lastpos, $firstpos);
    my @fields = ();

    # Bare block, left with the "last" at its end.
    {
        my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
        my $max  = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
        my $igunk = $_[3];

        pos $$textref ||= 0;

        unless (wantarray)
        {
            use Carp;
            carp "extract_multiple reset maximal count to 1 in scalar context"
                if $^W && defined($_[2]) && $max > 1;
            $max = 1
        }

        my $unkpos;
        my $class;

        # Unwrap { Class => extractor } entries, remembering each class.
        my @class;
        foreach my $func ( @func )
        {
            if (ref($func) eq 'HASH')
            {
                push @class, (keys %$func)[0];
                $func = (values %$func)[0];
            }
            else
            {
                push @class, undef;
            }
        }

        FIELD: while (pos($$textref) < length($$textref))
        {
            my ($field, $rem);
            my @bits;
            foreach my $i ( 0..$#func )
            {
                my $pref;
                my $func = $func[$i];
                $class = $class[$i];
                $lastpos = pos $$textref;
                if (ref($func) eq 'CODE')
                    { ($field,$rem,$pref) = @bits = $func->($$textref) }
                elsif (ref($func) eq 'Text::Balanced::Extractor')
                    { @bits = $field = $func->extract($$textref) }
                elsif( $$textref =~ m/\G$func/gc )
                    { @bits = $field = defined($1)
                        ? $1
                        : substr($$textref, $-[0], $+[0] - $-[0])
                    }
                $pref ||= "";
                if (defined($field) && length($field))
                {
                    if (!$igunk) {
                        # A skipped prefix counts as unrecognised text.
                        $unkpos = $lastpos
                            if length($pref) && !defined($unkpos);
                        if (defined $unkpos)
                        {
                            # Flush the pending unrecognised text first.
                            push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref;
                            $firstpos = $unkpos unless defined $firstpos;
                            undef $unkpos;
                            last FIELD if @fields == $max;
                        }
                    }
                    push @fields, $class
                        ? bless (\$field, $class)
                        : $field;
                    $firstpos = $lastpos unless defined $firstpos;
                    $lastpos = pos $$textref;
                    last FIELD if @fields == $max;
                    next FIELD;
                }
            }
            # No extractor matched here: step over one character and
            # remember where the unrecognised run began.
            if ($$textref =~ /\G(.)/gcs)
            {
                $unkpos = pos($$textref)-1
                    unless $igunk || defined $unkpos;
            }
        }

        # Trailing unrecognised text becomes the final field.
        if (defined $unkpos)
        {
            push @fields, substr($$textref, $unkpos);
            $firstpos = $unkpos unless defined $firstpos;
            $lastpos = length $$textref;
        }
        last;
    }

    pos $$textref = $lastpos;
    return @fields if wantarray;

    # Scalar context: try to cut the returned field out of the text.
    $firstpos ||= 0;
    eval { substr($$textref,$firstpos,$lastpos-$firstpos)="";
           pos $$textref = $firstpos };
    return $fields[0];
}

# gen_extract_tagged($opentag, $closetag, $prefix_pattern, \%options)
# Build a reusable extractor that behaves like extract_tagged for one fixed
# set of arguments.  The opening tag, prefix, reject and ignore patterns are
# compiled once with qr//.  Arguments and options are as for extract_tagged.
# Returns a code ref blessed into Text::Balanced::Extractor; call it with the
# text (or with no argument to operate on $_).
sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options)
{
    my $ldel    = $_[0];
    my $rdel    = $_[1];
    my $pre     = defined $_[2] ? $_[2] : '\s*';
    my %options = defined $_[3] ? %{$_[3]} : ();
    my $omode   = defined $options{fail} ? $options{fail} : '';
    my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
                : defined($options{reject})        ? $options{reject}
                :                                    ''
                ;
    my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
                : defined($options{ignore})        ? $options{ignore}
                :                                    ''
                ;

    if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }

    # Save and restore pos($_): the loop below aliases $_.
    my $posbug = pos;
    # Precompile every pattern that is set (false values are left untouched).
    for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ }
    pos = $posbug;

    my $closure = sub
    {
        my $textref = defined $_[0] ? \$_[0] : \$_;
        my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);

        return _fail(wantarray, $textref) unless @match;
        return _succeed(wantarray, $textref,
                        $match[2], $match[3]+$match[5]+$match[7],   # MATCH
                        @match[8..9,0..1,2..7]);                    # REM, PRE, BITS
    };

    bless $closure, 'Text::Balanced::Extractor';
}

package Text::Balanced::Extractor;

# Method form of an extractor built by gen_extract_tagged: invokes the
# blessed code ref ($self) on $text and returns whatever it returns.
sub extract($$) # ($self, $text)
{
    $_[0]->($_[1]);
}

package Text::Balanced::ErrorMsg;

# Error objects stored in $@ by _failmsg stringify to
# "<message>, detected at offset <pos>".
use overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" };

1;

__END__ | |
=pod | |
=head1 NAME | |
Text::Balanced - Extract delimited text sequences from strings. | |
=head1 SYNOPSIS | |
use Text::Balanced qw ( | |
extract_delimited | |
extract_bracketed | |
extract_quotelike | |
extract_codeblock | |
extract_variable | |
extract_tagged | |
extract_multiple | |
gen_delimited_pat | |
gen_extract_tagged | |
); | |
# Extract the initial substring of $text that is delimited by | |
# two (unescaped) instances of the first character in $delim. | |
($extracted, $remainder) = extract_delimited($text,$delim); | |
# Extract the initial substring of $text that is bracketed | |
# with a delimiter(s) specified by $delim (where the string | |
# in $delim contains one or more of '(){}[]<>'). | |
($extracted, $remainder) = extract_bracketed($text,$delim); | |
# Extract the initial substring of $text that is bounded by | |
# an XML tag. | |
($extracted, $remainder) = extract_tagged($text); | |
# Extract the initial substring of $text that is bounded by
# a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags
($extracted, $remainder) =
extract_tagged($text,"BEGIN","END",undef,{reject=>["BEGIN"]});
# Extract the initial substring of $text that represents a | |
# Perl "quote or quote-like operation" | |
($extracted, $remainder) = extract_quotelike($text); | |
# Extract the initial substring of $text that represents a block | |
# of Perl code, bracketed by any of character(s) specified by $delim | |
# (where the string $delim contains one or more of '(){}[]<>'). | |
($extracted, $remainder) = extract_codeblock($text,$delim); | |
# Extract the initial substrings of $text that would be extracted by | |
# one or more sequential applications of the specified functions | |
# or regular expressions | |
@extracted = extract_multiple($text, | |
[ \&extract_bracketed, | |
\&extract_quotelike, | |
\&some_other_extractor_sub, | |
qr/[xyz]*/, | |
'literal', | |
]); | |
# Create a string representing an optimized pattern (a la Friedl) | |
# that matches a substring delimited by any of the specified characters | |
# (in this case: any type of quote or a slash) | |
$patstring = gen_delimited_pat(q{'"`/}); | |
# Generate a reference to an anonymous sub that is just like extract_tagged | |
# but pre-compiled and optimized for a specific pair of tags, and | |
# consequently much faster (i.e. 3 times faster). It uses qr// for better | |
# performance on repeated calls. | |
$extract_head = gen_extract_tagged('<HEAD>','</HEAD>'); | |
($extracted, $remainder) = $extract_head->($text); | |
=head1 DESCRIPTION | |
The various C<extract_...> subroutines may be used to | |
extract a delimited substring, possibly after skipping a | |
specified prefix string. By default, that prefix is | |
optional whitespace (C</\s*/>), but you can change it to whatever | |
you wish (see below). | |
The substring to be extracted must appear at the | |
current C<pos> location of the string's variable | |
(or at index zero, if no C<pos> position is defined). | |
In other words, the C<extract_...> subroutines I<don't> | |
extract the first occurrence of a substring anywhere | |
in a string (like an unanchored regex would). Rather, | |
they extract an occurrence of the substring appearing | |
immediately at the current matching position in the | |
string (like a C<\G>-anchored regex would). | |
=head2 General Behaviour in List Contexts | |
In a list context, all the subroutines return a list, the first three | |
elements of which are always: | |
=over 4 | |
=item [0] | |
The extracted string, including the specified delimiters. | |
If the extraction fails C<undef> is returned. | |
=item [1] | |
The remainder of the input string (i.e. the characters after the | |
extracted string). On failure, the entire string is returned. | |
=item [2] | |
The skipped prefix (i.e. the characters before the extracted string). | |
On failure, C<undef> is returned. | |
=back | |
Note that in a list context, the contents of the original input text (the first | |
argument) are not modified in any way. | |
However, if the input text was passed in a variable, that variable's | |
C<pos> value is updated to point at the first character after the | |
extracted text. That means that in a list context the various | |
subroutines can be used much like regular expressions. For example: | |
while ( $next = (extract_quotelike($text))[0] ) | |
{ | |
# process next quote-like (in $next) | |
} | |
=head2 General Behaviour in Scalar and Void Contexts | |
In a scalar context, the extracted string is returned, having first been | |
removed from the input text. Thus, the following code also processes | |
each quote-like operation, but actually removes them from $text: | |
while ( $next = extract_quotelike($text) ) | |
{ | |
# process next quote-like (in $next) | |
} | |
Note that if the input text is a read-only string (i.e. a literal), | |
no attempt is made to remove the extracted text. | |
In a void context the behaviour of the extraction subroutines is | |
exactly the same as in a scalar context, except (of course) that the | |
extracted substring is not returned. | |
=head2 A Note About Prefixes | |
Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.).
This can bite you if you're expecting a prefix specification like | |
'.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix | |
pattern will only succeed if the <H1> tag is on the current line, since | |
. normally doesn't match newlines. | |
To overcome this limitation, you need to turn on /s matching within | |
the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)' | |
=head2 Functions | |
=over 4 | |
=item C<extract_delimited> | |
The C<extract_delimited> function formalizes the common idiom | |
of extracting a single-character-delimited substring from the start of | |
a string. For example, to extract a single-quote delimited string, the | |
following code is typically used: | |
($remainder = $text) =~ s/\A('(\\.|[^'])*')//s; | |
$extracted = $1; | |
but with C<extract_delimited> it can be simplified to: | |
($extracted,$remainder) = extract_delimited($text, "'"); | |
C<extract_delimited> takes up to four scalars (the input text, the | |
delimiters, a prefix pattern to be skipped, and any escape characters) | |
and extracts the initial substring of the text that | |
is appropriately delimited. If the delimiter string has multiple | |
characters, the first one encountered in the text is taken to delimit | |
the substring. | |
The third argument specifies a prefix pattern that is to be skipped | |
(but must be present!) before the substring is extracted. | |
The final argument specifies the escape character to be used for each | |
delimiter. | |
All arguments are optional. If the escape characters are not specified, | |
every delimiter is escaped with a backslash (C<\>). | |
If the prefix is not specified, the | |
pattern C<'\s*'> - optional whitespace - is used. If the delimiter set | |
is also not specified, the set C</["'`]/> is used. If the text to be processed | |
is not specified either, C<$_> is used. | |
In list context, C<extract_delimited> returns an array of three
elements, the extracted substring (I<including the surrounding | |
delimiters>), the remainder of the text, and the skipped prefix (if | |
any). If a suitable delimited substring is not found, the first
element of the array is C<undef>, the second is the complete
original text, and the prefix returned in the third element is
also C<undef>.
In a scalar context, just the extracted substring is returned. In | |
a void context, the extracted substring (and any prefix) are simply | |
removed from the beginning of the first argument. | |
Examples: | |
# Remove a single-quoted substring from the very beginning of $text: | |
$substring = extract_delimited($text, "'", ''); | |
# Remove a single-quoted Pascalish substring (i.e. one in which | |
# doubling the quote character escapes it) from the very | |
# beginning of $text: | |
$substring = extract_delimited($text, "'", '', "'"); | |
# Extract a single- or double- quoted substring from the | |
# beginning of $text, optionally after some whitespace | |
# (note the list context to protect $text from modification): | |
($substring) = extract_delimited $text, q{"'}; | |
# Delete the substring delimited by the first '/' in $text: | |
$text = join '', (extract_delimited($text,'/','[^/]*'))[2,1];
Note that this last example is I<not> the same as deleting the first | |
quote-like pattern. For instance, if C<$text> contained the string: | |
"if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }" | |
then after the deletion it would contain: | |
"if ('.$UNIXCMD/s) { $cmd = $1; }" | |
not: | |
"if ('./cmd' =~ ms) { $cmd = $1; }" | |
See L<"extract_quotelike"> for a (partial) solution to this problem. | |
=item C<extract_bracketed> | |
Like C<"extract_delimited">, the C<extract_bracketed> function takes | |
up to three optional scalar arguments: a string to extract from, a delimiter | |
specifier, and a prefix pattern. As before, a missing prefix defaults to | |
optional whitespace and a missing text defaults to C<$_>. However, a missing | |
delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below). | |
C<extract_bracketed> extracts a balanced-bracket-delimited | |
substring (using any one (or more) of the user-specified delimiter | |
brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also | |
respect quoted unbalanced brackets (see below). | |
A "delimiter bracket" is a bracket in list of delimiters passed as | |
C<extract_bracketed>'s second argument. Delimiter brackets are | |
specified by giving either the left or right (or both!) versions | |
of the required bracket(s). Note that the order in which | |
two or more delimiter brackets are specified is not significant. | |
A "balanced-bracket-delimited substring" is a substring bounded by | |
matched brackets, such that any other (left or right) delimiter | |
bracket I<within> the substring is also matched by an opposite | |
(right or left) delimiter bracket I<at the same level of nesting>. Any | |
type of bracket not in the delimiter list is treated as an ordinary | |
character. | |
In other words, each type of bracket specified as a delimiter must be | |
balanced and correctly nested within the substring, and any other kind of | |
("non-delimiter") bracket in the substring is ignored. | |
For example, given the string: | |
$text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }"; | |
then a call to C<extract_bracketed> in a list context: | |
@result = extract_bracketed( $text, '{}' ); | |
would return: | |
( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" ) | |
since both sets of C<'{..}'> brackets are properly nested and evenly balanced. | |
(In a scalar context just the first element of the array would be returned. In | |
a void context, C<$text> would be replaced by an empty string.) | |
Likewise the call in: | |
@result = extract_bracketed( $text, '{[' ); | |
would return the same result, since all sets of both types of specified | |
delimiter brackets are correctly nested and balanced. | |
However, the call in: | |
@result = extract_bracketed( $text, '{([<' ); | |
would fail, returning: | |
( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }" ); | |
because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and | |
the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would
return C<undef>. In a void context, C<$text> would be unchanged.)
Note that the embedded single-quotes in the string don't help in this | |
case, since they have not been specified as acceptable delimiters and are | |
therefore treated as non-delimiter characters (and ignored). | |
However, if a particular species of quote character is included in the | |
delimiter specification, then that type of quote will be correctly handled. | |
For example, if C<$text> is:
$text = '<A HREF=">>>>">link</A>'; | |
then | |
@result = extract_bracketed( $text, '<">' ); | |
returns: | |
( '<A HREF=">>>>">', 'link</A>', "" ) | |
as expected. Without the specification of C<"> as an embedded quoter: | |
@result = extract_bracketed( $text, '<>' ); | |
the result would be: | |
( '<A HREF=">', '>>>">link</A>', "" ) | |
In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like | |
quoting (i.e. q{string}, qq{string}, etc) can be specified by including the | |
letter 'q' as a delimiter. Hence: | |
@result = extract_bracketed( $text, '<q>' ); | |
would correctly match something like this: | |
$text = '<leftop: conj /and/ conj>'; | |
See also: C<"extract_quotelike"> and C<"extract_codeblock">. | |
=item C<extract_variable> | |
C<extract_variable> extracts any valid Perl variable or | |
variable-involved expression, including scalars, arrays, hashes, array | |
accesses, hash look-ups, method calls through objects, subroutine calls | |
through subroutine references, etc. | |
The subroutine takes up to two optional arguments: | |
=over 4 | |
=item 1. | |
A string to be processed (C<$_> if the string is omitted or C<undef>) | |
=item 2. | |
A string specifying a pattern to be matched as a prefix (which is to be | |
skipped). If omitted, optional whitespace is skipped. | |
=back | |
On success in a list context, an array of 3 elements is returned. The | |
elements are: | |
=over 4 | |
=item [0] | |
the extracted variable, or variablish expression | |
=item [1] | |
the remainder of the input text, | |
=item [2] | |
the prefix substring (if any), | |
=back | |
On failure, all of these values (except the remaining text) are C<undef>. | |
In a scalar context, C<extract_variable> returns just the complete | |
substring that matched a variablish expression. C<undef> is returned on | |
failure. In addition, the original input text has the returned substring | |
(and any prefix) removed from it. | |
In a void context, the input text just has the matched substring (and | |
any specified prefix) removed. | |
=item C<extract_tagged> | |
C<extract_tagged> extracts and segments text between (balanced) | |
specified tags. | |
The subroutine takes up to five optional arguments: | |
=over 4 | |
=item 1. | |
A string to be processed (C<$_> if the string is omitted or C<undef>) | |
=item 2. | |
A string specifying a pattern to be matched as the opening tag. | |
If the pattern string is omitted (or C<undef>) then a pattern | |
that matches any standard XML tag is used. | |
=item 3. | |
A string specifying a pattern to be matched at the closing tag. | |
If the pattern string is omitted (or C<undef>) then the closing | |
tag is constructed by inserting a C</> after any leading bracket | |
characters in the actual opening tag that was matched (I<not> the pattern | |
that matched the tag). For example, if the opening tag pattern | |
is specified as C<'{{\w+}}'> and actually matched the opening tag | |
C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">. | |
=item 4. | |
A string specifying a pattern to be matched as a prefix (which is to be | |
skipped). If omitted, optional whitespace is skipped. | |
=item 5. | |
A hash reference containing various parsing options (see below) | |
=back | |
The various options that can be specified are: | |
=over 4 | |
=item C<reject =E<gt> $listref> | |
The list reference contains one or more strings specifying patterns | |
that must I<not> appear within the tagged text. | |
For example, to extract | |
an HTML link (which should not contain nested links) use: | |
extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} ); | |
=item C<ignore =E<gt> $listref> | |
The list reference contains one or more strings specifying patterns | |
that are I<not> to be treated as nested tags within the tagged text | |
(even if they would match the start tag pattern). | |
For example, to extract an arbitrary XML tag, but ignore "empty" elements: | |
extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} ); | |
(also see L<"gen_delimited_pat"> below). | |
=item C<fail =E<gt> $str> | |
The C<fail> option indicates the action to be taken if a matching end | |
tag is not encountered (i.e. before the end of the string or some | |
C<reject> pattern matches). By default, a failure to match a closing | |
tag causes C<extract_tagged> to immediately fail. | |
However, if the string value associated with C<fail> is "MAX", then
C<extract_tagged> returns the complete text up to the point of failure. | |
If the string is "PARA", C<extract_tagged> returns only the first paragraph | |
after the tag (up to the first line that is either empty or contains | |
only whitespace characters). | |
If the string is "", the default behaviour (i.e. failure) is reinstated. | |
For example, suppose the start tag "/para" introduces a paragraph, which then | |
continues until the next "/endpara" tag or until another "/para" tag is | |
encountered: | |
$text = "/para line 1\n\nline 3\n/para line 4"; | |
extract_tagged($text, '/para', '/endpara', undef, | |
{reject => '/para', fail => 'MAX'} );
# EXTRACTED: "/para line 1\n\nline 3\n" | |
Suppose instead, that if no matching "/endpara" tag is found, the "/para" | |
tag refers only to the immediately following paragraph: | |
$text = "/para line 1\n\nline 3\n/para line 4"; | |
extract_tagged($text, '/para', '/endpara', undef, | |
{reject => '/para', fail => 'PARA'} );
# EXTRACTED: "/para line 1\n" | |
Note that the specified C<fail> behaviour applies to nested tags as well. | |
=back | |
On success in a list context, an array of 6 elements is returned. The elements are: | |
=over 4 | |
=item [0] | |
the extracted tagged substring (including the outermost tags), | |
=item [1] | |
the remainder of the input text, | |
=item [2] | |
the prefix substring (if any), | |
=item [3] | |
the opening tag | |
=item [4] | |
the text between the opening and closing tags | |
=item [5] | |
the closing tag (or "" if no closing tag was found) | |
=back | |
On failure, all of these values (except the remaining text) are C<undef>. | |
In a scalar context, C<extract_tagged> returns just the complete | |
substring that matched a tagged text (including the start and end | |
tags). C<undef> is returned on failure. In addition, the original input | |
text has the returned substring (and any prefix) removed from it. | |
In a void context, the input text just has the matched substring (and | |
any specified prefix) removed. | |
=item C<gen_extract_tagged> | |
C<gen_extract_tagged> generates a new anonymous subroutine which | |
extracts text between (balanced) specified tags. In other words, | |
it generates a function identical in function to C<extract_tagged>. | |
The difference between C<extract_tagged> and the anonymous | |
subroutines generated by | |
C<gen_extract_tagged>, is that those generated subroutines: | |
=over 4 | |
=item * | |
do not have to reparse tag specification or parsing options every time | |
they are called (whereas C<extract_tagged> has to effectively rebuild | |
its tag parser on every call); | |
=item * | |
make use of the new qr// construct to pre-compile the regexes they use | |
(whereas C<extract_tagged> uses standard string variable interpolation | |
to create tag-matching patterns). | |
=back | |
The subroutine takes up to four optional arguments (the same set as | |
C<extract_tagged> except for the string to be processed). It returns | |
a reference to a subroutine which in turn takes a single argument (the text to | |
be extracted from). | |
In other words, the implementation of C<extract_tagged> is exactly | |
equivalent to: | |
sub extract_tagged | |
{ | |
my $text = shift; | |
$extractor = gen_extract_tagged(@_); | |
return $extractor->($text); | |
} | |
(although C<extract_tagged> is not currently implemented that way). | |
Using C<gen_extract_tagged> to create extraction functions for specific tags | |
is a good idea if those functions are going to be called more than once, since | |
their performance is typically twice as good as the more general-purpose | |
C<extract_tagged>. | |
=item C<extract_quotelike> | |
C<extract_quotelike> attempts to recognize, extract, and segment any | |
one of the various Perl quotes and quotelike operators (see | |
L<perlop(3)>). Nested backslashed delimiters, embedded balanced bracket
delimiters (for the quotelike operators), and trailing modifiers are | |
all caught. For example, in: | |
extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #' | |
extract_quotelike ' "You said, \"Use sed\"." ' | |
extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; ' | |
extract_quotelike ' tr/\\\/\\\\/\\\//ds; ' | |
the full Perl quotelike operations are all extracted correctly. | |
Note too that, when using the /x modifier on a regex, any comment | |
containing the current pattern delimiter will cause the regex to be | |
immediately terminated. In other words: | |
'm / | |
(?i) # CASE INSENSITIVE | |
[a-z_] # LEADING ALPHABETIC/UNDERSCORE | |
[a-z0-9]* # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS | |
/x' | |
will be extracted as if it were: | |
'm / | |
(?i) # CASE INSENSITIVE | |
[a-z_] # LEADING ALPHABETIC/' | |
This behaviour is identical to that of the actual compiler. | |
C<extract_quotelike> takes two arguments: the text to be processed and | |
a prefix to be matched at the very beginning of the text. If no prefix | |
is specified, optional whitespace is the default. If no text is given, | |
C<$_> is used. | |
In a list context, an array of 11 elements is returned. The elements are: | |
=over 4 | |
=item [0] | |
the extracted quotelike substring (including trailing modifiers), | |
=item [1] | |
the remainder of the input text, | |
=item [2] | |
the prefix substring (if any), | |
=item [3] | |
the name of the quotelike operator (if any), | |
=item [4] | |
the left delimiter of the first block of the operation, | |
=item [5] | |
the text of the first block of the operation | |
(that is, the contents of | |
a quote, the regex of a match or substitution or the target list of a | |
translation), | |
=item [6] | |
the right delimiter of the first block of the operation, | |
=item [7] | |
the left delimiter of the second block of the operation | |
(that is, if it is a C<s>, C<tr>, or C<y>), | |
=item [8] | |
the text of the second block of the operation | |
(that is, the replacement of a substitution or the translation list | |
of a translation), | |
=item [9] | |
the right delimiter of the second block of the operation (if any), | |
=item [10] | |
the trailing modifiers on the operation (if any). | |
=back | |
For each of the fields marked "(if any)" the default value on success is | |
an empty string. | |
On failure, all of these values (except the remaining text) are C<undef>. | |
In a scalar context, C<extract_quotelike> returns just the complete substring | |
that matched a quotelike operation (or C<undef> on failure). In a scalar or | |
void context, the input text has the same substring (and any specified | |
prefix) removed. | |
Examples: | |
# Remove the first quotelike literal that appears in text | |
$quotelike = extract_quotelike($text,'.*?'); | |
# Replace one or more leading whitespace-separated quotelike | |
# literals in $_ with "<QLL>" | |
do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@; | |
# Isolate the search pattern in a quotelike operation from $text | |
($op,$pat) = (extract_quotelike $text)[3,5]; | |
if ($op =~ /[ms]/) | |
{ | |
print "search pattern: $pat\n"; | |
} | |
else | |
{ | |
print "$op is not a pattern matching operation\n"; | |
} | |
=item C<extract_quotelike> and "here documents"
C<extract_quotelike> can successfully extract "here documents" from an input | |
string, but with an important caveat in list contexts. | |
Unlike other types of quote-like literals, a here document is rarely | |
a contiguous substring. For example, a typical piece of code using | |
a here document might look like this:
<<'EOMSG' || die; | |
This is the message. | |
EOMSG | |
exit; | |
Given this as an input string in a scalar context, C<extract_quotelike> | |
would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG", | |
leaving the string " || die;\nexit;" in the original variable. In other words, | |
the two separate pieces of the here document are successfully extracted and | |
concatenated. | |
In a list context, C<extract_quotelike> would return the list | |
=over 4 | |
=item [0] | |
"<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document, | |
including fore and aft delimiters), | |
=item [1] | |
" || die;\nexit;" (i.e. the remainder of the input text, concatenated), | |
=item [2] | |
"" (i.e. the prefix substring -- trivial in this case), | |
=item [3] | |
"<<" (i.e. the "name" of the quotelike operator) | |
=item [4] | |
"'EOMSG'" (i.e. the left delimiter of the here document, including any quotes), | |
=item [5] | |
"This is the message.\n" (i.e. the text of the here document), | |
=item [6] | |
"EOMSG" (i.e. the right delimiter of the here document), | |
=item [7..10] | |
"" (a here document has no second left delimiter, second text, second right | |
delimiter, or trailing modifiers). | |
=back | |
However, the matching position of the input variable would be set to | |
"exit;" (i.e. I<after> the closing delimiter of the here document), | |
which would cause the earlier " || die;\nexit;" to be skipped in any | |
sequence of code fragment extractions. | |
To avoid this problem, when it encounters a here document whilst | |
extracting from a modifiable string, C<extract_quotelike> silently | |
rearranges the string to an equivalent piece of Perl: | |
<<'EOMSG' | |
This is the message. | |
EOMSG | |
|| die; | |
exit; | |
in which the here document I<is> contiguous. It still leaves the | |
matching position after the here document, but now the rest of the line | |
on which the here document starts is not skipped. | |
To prevent C<extract_quotelike> from mucking about with the input in this way
(this is the only case where a list-context C<extract_quotelike> does so), | |
you can pass the input variable as an interpolated literal: | |
$quotelike = extract_quotelike("$var"); | |
=item C<extract_codeblock> | |
C<extract_codeblock> attempts to recognize and extract a balanced | |
bracket delimited substring that may contain unbalanced brackets | |
inside Perl quotes or quotelike operations. That is, C<extract_codeblock> | |
is like a combination of C<"extract_bracketed"> and | |
C<"extract_quotelike">. | |
C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>: | |
a text to process, a set of delimiter brackets to look for, and a prefix to | |
match first. It also takes an optional fourth parameter, which allows the | |
outermost delimiter brackets to be specified separately (see below). | |
Omitting the first argument (input text) means process C<$_> instead. | |
Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used. | |
Omitting the third argument (prefix argument) implies optional whitespace at the start. | |
Omitting the fourth argument (outermost delimiter brackets) indicates that the | |
value of the second argument is to be used for the outermost delimiters. | |
Once the prefix and the outermost opening delimiter bracket have been | |
recognized, code blocks are extracted by stepping through the input text and | |
trying the following alternatives in sequence: | |
=over 4 | |
=item 1. | |
Try and match a closing delimiter bracket. If the bracket was the same | |
species as the last opening bracket, return the substring to that | |
point. If the bracket was mismatched, return an error. | |
=item 2. | |
Try to match a quote or quotelike operator. If found, call | |
C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return | |
the error it returned. Otherwise go back to step 1. | |
=item 3. | |
Try to match an opening delimiter bracket. If found, call | |
C<extract_codeblock> recursively to eat the embedded block. If the | |
recursive call fails, return an error. Otherwise, go back to step 1. | |
=item 4. | |
Unconditionally match a bareword or any other single character, and | |
then go back to step 1. | |
=back | |
Examples: | |
# Find a while loop in the text | |
if ($text =~ s/.*?while\s*\{/{/) | |
{ | |
$loop = "while " . extract_codeblock($text); | |
} | |
# Remove the first round-bracketed list (which may include | |
# round- or curly-bracketed code blocks or quotelike operators) | |
extract_codeblock $text, "(){}", '[^(]*'; | |
The ability to specify a different outermost delimiter bracket is useful | |
in some circumstances. For example, in the Parse::RecDescent module, | |
parser actions which are to be performed only on a successful parse | |
are specified using a C<E<lt>defer:...E<gt>> directive. For example: | |
sentence: subject verb object | |
<defer: {$::theVerb = $item{verb}} > | |
Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code | |
within the C<E<lt>defer:...E<gt>> directive, but there's a problem. | |
A deferred action like this: | |
<defer: {if ($count>10) {$count--}} > | |
will be incorrectly parsed as: | |
<defer: {if ($count> | |
because the "greater than" operator is interpreted as a closing delimiter.
But, by extracting the directive using | |
S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>> | |
the '>' character is only treated as a delimiter at the outermost
level of the code block, so the directive is parsed correctly. | |
=item C<extract_multiple> | |
The C<extract_multiple> subroutine takes a string to be processed and a | |
list of extractors (subroutines or regular expressions) to apply to that string. | |
In an array context C<extract_multiple> returns an array of substrings | |
of the original string, as extracted by the specified extractors. | |
In a scalar context, C<extract_multiple> returns the first | |
substring successfully extracted from the original string. In both | |
scalar and void contexts the original string has the first successfully | |
extracted substring removed from it. In all contexts | |
C<extract_multiple> starts at the current C<pos> of the string, and | |
sets that C<pos> appropriately after it matches. | |
Hence, the aim of a call to C<extract_multiple> in a list context | |
is to split the processed string into as many non-overlapping fields as | |
possible, by repeatedly applying each of the specified extractors | |
to the remainder of the string. Thus C<extract_multiple> is | |
a generalized form of Perl's C<split> subroutine. | |
The subroutine takes up to four optional arguments: | |
=over 4 | |
=item 1. | |
A string to be processed (C<$_> if the string is omitted or C<undef>) | |
=item 2. | |
A reference to a list of subroutine references and/or qr// objects and/or | |
literal strings and/or hash references, specifying the extractors | |
to be used to split the string. If this argument is omitted (or | |
C<undef>) the list: | |
[ | |
sub { extract_variable($_[0], '') }, | |
sub { extract_quotelike($_[0],'') }, | |
sub { extract_codeblock($_[0],'{}','') }, | |
] | |
is used. | |
=item 3. | |
A number specifying the maximum number of fields to return. If this
argument is omitted (or C<undef>), split continues as long as possible. | |
If the third argument is I<N>, then extraction continues until I<N> fields | |
have been successfully extracted, or until the string has been completely | |
processed. | |
Note that in scalar and void contexts the value of this argument is | |
automatically reset to 1 (under C<-w>, a warning is issued if the argument | |
has to be reset). | |
=item 4. | |
A value indicating whether unmatched substrings (see below) within the | |
text should be skipped or returned as fields. If the value is true, | |
such substrings are skipped. Otherwise, they are returned. | |
=back | |
The extraction process works by applying each extractor in | |
sequence to the text string. | |
If the extractor is a subroutine it is called in a list context and is | |
expected to return a list of a single element, namely the extracted | |
text. It may optionally also return two further arguments: a string | |
representing the text left after extraction (like $' for a pattern | |
match), and a string representing any prefix skipped before the | |
extraction (like $` in a pattern match). Note that this is designed | |
to facilitate the use of other Text::Balanced subroutines with | |
C<extract_multiple>. Note too that the value returned by an extractor | |
subroutine need not bear any relationship to the corresponding substring | |
of the original text (see examples below). | |
If the extractor is a precompiled regular expression or a string, | |
it is matched against the text in a scalar context with a leading | |
'\G' and the gc modifiers enabled. The extracted value is either | |
$1 if that variable is defined after the match, or else the | |
complete match (i.e. $&). | |
If the extractor is a hash reference, it must contain exactly one element. | |
The value of that element is one of the | |
above extractor types (subroutine reference, regular expression, or string). | |
The key of that element is the name of a class into which the successful | |
return value of the extractor will be blessed. | |
If an extractor returns a defined value, that value is immediately | |
treated as the next extracted field and pushed onto the list of fields. | |
If the extractor was specified in a hash reference, the field is also | |
blessed into the appropriate class.
If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is | |
assumed to have failed to extract. | |
If none of the extractor subroutines succeeds, then one | |
character is extracted from the start of the text and the extraction | |
subroutines reapplied. Characters which are thus removed are accumulated and | |
eventually become the next field (unless the fourth argument is true, in which | |
case they are discarded). | |
For example, the following extracts substrings that are valid Perl variables: | |
@fields = extract_multiple($text, | |
[ sub { extract_variable($_[0]) } ], | |
undef, 1); | |
This example separates a text into fields which are quote delimited, | |
curly bracketed, and anything else. The delimited and bracketed | |
parts are also blessed to identify them (the "anything else" is unblessed): | |
@fields = extract_multiple($text, | |
[ | |
{ Delim => sub { extract_delimited($_[0],q{'"}) } }, | |
{ Brack => sub { extract_bracketed($_[0],'{}') } }, | |
]); | |
This call extracts the next single substring that is a valid Perl quotelike | |
operator (and removes it from $text): | |
$quotelike = extract_multiple($text, | |
[ | |
sub { extract_quotelike($_[0]) }, | |
], undef, 1); | |
Finally, here is yet another way to do comma-separated value parsing: | |
@fields = extract_multiple($csv_text, | |
[ | |
sub { extract_delimited($_[0],q{'"}) }, | |
qr/([^,]+)(.*)/, | |
], | |
undef,1); | |
The list in the second argument means: | |
I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">. | |
The undef third argument means: | |
I<"...as many times as possible...">, | |
and the true value in the fourth argument means | |
I<"...discarding anything else that appears (i.e. the commas)">. | |
If you wanted the commas preserved as separate fields (i.e. like split | |
does if your split pattern has capturing parentheses), you would | |
just make the last parameter undefined (or remove it). | |
=item C<gen_delimited_pat> | |
The C<gen_delimited_pat> subroutine takes a single (string) argument and | |
builds a Friedl-style optimized regex that matches a string delimited | |
by any one of the characters in the single argument. For example: | |
gen_delimited_pat(q{'"}) | |
returns the regex: | |
(?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\') | |
Note that the specified delimiters are automatically quotemeta'd. | |
A typical use of C<gen_delimited_pat> would be to build special purpose tags | |
for C<extract_tagged>. For example, to properly ignore "empty" XML elements | |
(which might contain quoted strings): | |
my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>'; | |
extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} ); | |
C<gen_delimited_pat> may also be called with an optional second argument, | |
which specifies the "escape" character(s) to be used for each delimiter. | |
For example, to match a Pascal-style string (where ' is the delimiter | |
and '' is a literal ' within the string): | |
gen_delimited_pat(q{'},q{'}); | |
Different escape characters can be specified for different delimiters. | |
For example, to specify that '/' is the escape for single quotes | |
and '%' is the escape for double quotes: | |
gen_delimited_pat(q{'"},q{/%}); | |
If more delimiters than escape chars are specified, the last escape char | |
is used for the remaining delimiters. | |
If no escape char is specified for a given delimiter, '\' is used. | |
=item C<delimited_pat> | |
Note that C<gen_delimited_pat> was previously called C<delimited_pat>. | |
That name may still be used, but is now deprecated. | |
=back | |
=head1 DIAGNOSTICS | |
In a list context, all the functions return C<(undef,$original_text)> | |
on failure. In a scalar context, failure is indicated by returning C<undef> | |
(in this case the input text is not modified in any way). | |
In addition, on failure in I<any> context, the C<$@> variable is set. | |
Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed | |
below. | |
Accessing C<$@-E<gt>{pos}> returns the offset into the original string at | |
which the error was detected (although not necessarily where it occurred!). | |
Printing C<$@> directly produces the error message, with the offset appended. | |
On success, the C<$@> variable is guaranteed to be C<undef>. | |
The available diagnostics are: | |
=over 4 | |
=item C<Did not find a suitable bracket: "%s"> | |
The delimiter provided to C<extract_bracketed> was not one of | |
C<'()[]E<lt>E<gt>{}'>. | |
=item C<Did not find prefix: /%s/> | |
A non-optional prefix was specified but wasn't found at the start of the text. | |
=item C<Did not find opening bracket after prefix: "%s"> | |
C<extract_bracketed> or C<extract_codeblock> was expecting a | |
particular kind of bracket at the start of the text, and didn't find it. | |
=item C<No quotelike operator found after prefix: "%s"> | |
C<extract_quotelike> didn't find one of the quotelike operators C<q>, | |
C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring | |
it was extracting. | |
=item C<Unmatched closing bracket: "%c"> | |
C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered | |
a closing bracket where none was expected. | |
=item C<Unmatched opening bracket(s): "%s"> | |
C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran | |
out of characters in the text before closing one or more levels of nested | |
brackets. | |
=item C<Unmatched embedded quote (%s)> | |
C<extract_bracketed> attempted to match an embedded quoted substring, but | |
failed to find a closing quote to match it. | |
=item C<Did not find closing delimiter to match '%s'> | |
C<extract_quotelike> was unable to find a closing delimiter to match the | |
one that opened the quote-like operation. | |
=item C<Mismatched closing bracket: expected "%c" but found "%s"> | |
C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found | |
a valid bracket delimiter, but it was the wrong species. This usually | |
indicates a nesting error, but may indicate incorrect quoting or escaping. | |
=item C<No block delimiter found after quotelike "%s"> | |
C<extract_quotelike> or C<extract_codeblock> found one of the | |
quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> | |
without a suitable block after it. | |
=item C<Did not find leading dereferencer> | |
C<extract_variable> was expecting one of '$', '@', or '%' at the start of | |
a variable, but didn't find any of them. | |
=item C<Bad identifier after dereferencer> | |
C<extract_variable> found a '$', '@', or '%' indicating a variable, but that | |
character was not followed by a legal Perl identifier. | |
=item C<Did not find expected opening bracket at %s> | |
C<extract_codeblock> failed to find any of the outermost opening brackets | |
that were specified. | |
=item C<Improperly nested codeblock at %s> | |
A nested code block was found that started with a delimiter that was specified | |
as being only to be used as an outermost bracket. | |
=item C<Missing second block for quotelike "%s"> | |
C<extract_codeblock> or C<extract_quotelike> found one of the | |
quotelike operators C<s>, C<tr> or C<y> followed by only one block. | |
=item C<No match found for opening bracket> | |
C<extract_codeblock> failed to find a closing bracket to match the outermost | |
opening bracket. | |
=item C<Did not find opening tag: /%s/> | |
C<extract_tagged> did not find a suitable opening tag (after any specified | |
prefix was removed). | |
=item C<Unable to construct closing tag to match: /%s/> | |
C<extract_tagged> matched the specified opening tag and tried to | |
modify the matched text to produce a matching closing tag (because | |
none was specified). It failed to generate the closing tag, almost | |
certainly because the opening tag did not start with a | |
bracket of some kind. | |
=item C<Found invalid nested tag: %s> | |
C<extract_tagged> found a nested tag that appeared in the "reject" list | |
(and the failure mode was not "MAX" or "PARA"). | |
=item C<Found unbalanced nested tag: %s> | |
C<extract_tagged> found a nested opening tag that was not matched by a | |
corresponding nested closing tag (and the failure mode was not "MAX" or "PARA"). | |
=item C<Did not find closing tag> | |
C<extract_tagged> reached the end of the text without finding a closing tag | |
to match the original opening tag (and the failure mode was not | |
"MAX" or "PARA"). | |
=back | |
=head1 EXPORTS | |
The following symbols are, or can be, exported by this module: | |
=over 4 | |
=item Default Exports | |
I<None>. | |
=item Optional Exports | |
C<extract_delimited>, | |
C<extract_bracketed>, | |
C<extract_quotelike>, | |
C<extract_codeblock>, | |
C<extract_variable>, | |
C<extract_tagged>, | |
C<extract_multiple>, | |
C<gen_delimited_pat>, | |
C<gen_extract_tagged>, | |
C<delimited_pat>. | |
=item Export Tags | |
=over 4 | |
=item C<:ALL> | |
C<extract_delimited>, | |
C<extract_bracketed>, | |
C<extract_quotelike>, | |
C<extract_codeblock>, | |
C<extract_variable>, | |
C<extract_tagged>, | |
C<extract_multiple>, | |
C<gen_delimited_pat>, | |
C<gen_extract_tagged>, | |
C<delimited_pat>. | |
=back | |
=back | |
=head1 KNOWN BUGS | |
See L<https://rt.cpan.org/Dist/Display.html?Status=Active&Queue=Text-Balanced>. | |
=head1 FEEDBACK | |
Patches, bug reports, suggestions or any other feedback is welcome. | |
Patches can be sent as GitHub pull requests at | |
L<https://github.com/steve-m-hay/Text-Balanced/pulls>. | |
Bug reports and suggestions can be made on the CPAN Request Tracker at | |
L<https://rt.cpan.org/Public/Bug/Report.html?Queue=Text-Balanced>. | |
Currently active requests on the CPAN Request Tracker can be viewed at | |
L<https://rt.cpan.org/Public/Dist/Display.html?Status=Active;Queue=Text-Balanced>. | |
Please test this distribution. See CPAN Testers Reports at | |
L<https://www.cpantesters.org/> for details of how to get involved. | |
Previous test results on CPAN Testers Reports can be viewed at | |
L<https://www.cpantesters.org/distro/T/Text-Balanced.html>. | |
Please rate this distribution on CPAN Ratings at | |
L<https://cpanratings.perl.org/rate/?distribution=Text-Balanced>. | |
=head1 AVAILABILITY | |
The latest version of this module is available from CPAN (see | |
L<perlmodlib/"CPAN"> for details) at | |
L<https://metacpan.org/release/Text-Balanced> or | |
L<https://www.cpan.org/authors/id/S/SH/SHAY/> or | |
L<https://www.cpan.org/modules/by-module/Text/>. | |
The latest source code is available from GitHub at | |
L<https://github.com/steve-m-hay/Text-Balanced>. | |
=head1 INSTALLATION | |
See the F<INSTALL> file. | |
=head1 AUTHOR | |
Damian Conway E<lt>L<damian@conway.org|mailto:damian@conway.org>E<gt>. | |
Steve Hay E<lt>L<shay@cpan.org|mailto:shay@cpan.org>E<gt> is now maintaining | |
Text::Balanced as of version 2.03. | |
=head1 COPYRIGHT | |
Copyright (C) 1997-2001 Damian Conway. All rights reserved. | |
Copyright (C) 2009 Adam Kennedy. | |
Copyright (C) 2015, 2020 Steve Hay. All rights reserved. | |
=head1 LICENCE | |
This module is free software; you can redistribute it and/or modify it under the | |
same terms as Perl itself, i.e. under the terms of either the GNU General Public | |
License or the Artistic License, as specified in the F<LICENCE> file. | |
=head1 VERSION | |
Version 2.04 | |
=head1 DATE | |
11 Dec 2020 | |
=head1 HISTORY | |
See the F<Changes> file. | |
=cut | |