|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
use warnings;
use strict;
use utf8;

# Read and write UTF-8 so the Unicode-aware rules below see characters,
# not raw bytes.
binmode(STDIN,  ":utf8");
binmode(STDOUT, ":utf8");

# Settings filled in from the command line.  They are file-scoped lexicals
# that the detokenize* subroutines read directly.
my $language       = "en";   # -l <code>: language whose rules are applied
my $QUIET          = 0;      # -q: do not print the version/language banner
my $HELP           = 0;      # -h: print usage and exit
my $UPPERCASE_SENT = 0;      # -u: uppercase the first letter of each line
my $PENN           = 0;      # -penn: input uses Penn Treebank tokenization

# Hand-rolled option parsing.  Arguments that match no known option are
# silently skipped, as before.
while (@ARGV) {
    my $arg = shift @ARGV;
    if ($arg =~ /^-b$/) {
        $| = 1;    # unbuffered output
    }
    elsif ($arg =~ /^-l$/) {
        $language = shift @ARGV;
        # "-l" as the last argument used to leave $language undefined.
        if (!defined $language) {
            print STDERR "Error: -l option requires a language code.\n";
            exit 1;
        }
    }
    elsif ($arg =~ /^-q$/) {
        $QUIET = 1;
    }
    elsif ($arg =~ /^-h$/) {
        $HELP = 1;
    }
    elsif ($arg =~ /^-u$/) {
        $UPPERCASE_SENT = 1;
    }
    elsif ($arg =~ /^-penn$/) {
        $PENN = 1;
    }
}

if ($HELP) {
    print "Usage ./detokenizer.perl (-l [en|fr|it|cs|...]) < tokenizedfile > detokenizedfile\n";
    print "Options:\n";
    print " -u ... uppercase the first char in the final sentence.\n";
    print " -q ... don't report detokenizer revision.\n";
    print " -b ... disable Perl buffering.\n";
    print " -penn ... assume input is tokenized as per tokenizer.perl's -penn option.\n";
    exit;
}

# Languages without dedicated rules still get the generic handling.
if ($language !~ /^(cs|en|fr|it|fi)$/) {
    print STDERR "Warning: No built-in rules for language $language.\n";
}

if ($PENN && $language ne "en") {
    print STDERR "Error: -penn option only supported for English text.\n";
    # Exit non-zero so that calling pipelines can detect the failure.
    exit 1;
}

if (!$QUIET) {
    print STDERR "Detokenizer Version ".'$Revision: 4134 $'."\n";
    print STDERR "Language: $language\n";
}

# Main loop: one input line in, one output line out.
while (my $line = <STDIN>) {
    if ($line =~ /^<.+>$/ || $line =~ /^\s*$/) {
        # Lines that are a single <...> tag, and blank lines, pass through
        # untouched.
        print $line;
    }
    elsif ($PENN) {
        print detokenize_penn($line);
    }
    else {
        print detokenize($line);
    }
}
|
|
|
|
|
# Return the first argument unchanged, followed by the uppercased second
# argument.  Used as the replacement expression when capitalising the first
# letter of a sentence while keeping any leading punctuation as it was.
sub ucsecondarg {
    my ($lead, $letter) = @_;
    my $upper = uc($letter);
    return $lead . $upper;
}
|
|
|
# Turn escaped special characters back into their literal form.
#
# Parameters:
#   $text - string that may contain entity-style escapes
# Returns:
#   a copy of $text with every recognised escape replaced by its character.
#
# Recognised escapes are an ampersand, one of the names listed in
# %char_for, and a semicolon.  Several names map to the same character
# (e.g. "bar" and "#124" both yield a pipe).
#
# The previous version had lost its entity patterns: most substitutions
# replaced a character with itself and therefore did nothing.
sub deescape {
    my ($text) = @_;

    # Entity name (without the leading ampersand and trailing semicolon)
    # mapped to the literal character it stands for.
    my %char_for = (
        'bar'  => '|',
        '#124' => '|',
        'lt'   => '<',
        'gt'   => '>',
        'bra'  => '[',
        'ket'  => ']',
        'quot' => '"',
        'apos' => "'",
        '#91'  => '[',
        '#93'  => ']',
        'amp'  => '&',
    );

    # A single left-to-right pass never rescans its own output, so a
    # doubly escaped entity (ampersand entity followed by "lt;") is decoded
    # exactly one level, the same as running the ampersand rule last.
    $text =~ s/&(bar|#124|lt|gt|bra|ket|quot|apos|#91|#93|amp);/$char_for{$1}/g;

    return $text;
}
|
|
|
# Rebuild a naturally spaced line from one line of space-separated tokens.
#
# Parameters:
#   $text - one tokenized line (a trailing newline is removed first)
# Returns:
#   the detokenized line, ending in a newline.
#
# Reads the file-scoped settings $language (selects language-specific
# rules) and $UPPERCASE_SENT (capitalise the first letter).
#
# The loop walks the tokens left to right.  $prependSpace holds what will be
# inserted before the NEXT token that asks for it: " " normally, "" after a
# token that must stick to whatever follows.  The branches form one elsif
# chain, so their order decides which rule wins.
sub detokenize {
    my ($text) = @_;
    chomp($text);
    $text = " $text ";
    # " @-@ " marks a hyphen that was split out of a compound; rejoin it.
    $text =~ s/ \@\-\@ /-/g;
    $text = deescape($text);

    my @words = split(/ /, $text);
    $text = "";
    # Number of quote tokens seen so far, per (normalised) quote string.
    # Even count: the next quote opens; odd count: it closes.
    my %quoteCount = ("\'" => 0, "\"" => 0);
    my $prependSpace = " ";

    # C-style loop on purpose: one branch advances $i to consume two tokens.
    for (my $i = 0; $i < scalar(@words); $i++) {
        if (startsWithCJKChar($words[$i])) {
            if (($i > 0 && endsWithCJKChar($words[$i-1])) && ($language ne "ko")) {
                # CJK token directly after a CJK token: no space between
                # them, unless the language is "ko".
                $text = $text.$words[$i];
            }
            else {
                $text = $text.$prependSpace.$words[$i];
            }
            $prependSpace = " ";
        }
        elsif ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) {
            # Currency symbols, opening brackets, inverted ? and !:
            # attach to the token on the right.
            $text = $text.$prependSpace.$words[$i];
            $prependSpace = "";
        }
        elsif ($words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/) {
            if (($language eq "fr") && ($words[$i] =~ /^[\?\!\:\;\\\%]$/)) {
                # French puts a space before these single marks.
                $text .= " ";
            }
            # Closing punctuation: attach to the token on the left.
            $text = $text.$words[$i];
            $prependSpace = " ";
        }
        elsif (($language eq "en") && ($i > 0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) {
            # English: apostrophe + letter after an alphanumeric token
            # (e.g. 's, 're) attaches to the left.
            $text = $text.$words[$i];
            $prependSpace = " ";
        }
        elsif (($language eq "cs") && ($i > 1) && ($words[$i-2] =~ /^[0-9]+$/) && ($words[$i-1] =~ /^[.,]$/) && ($words[$i] =~ /^[0-9]+$/)) {
            # Czech: digits after "digits . " or "digits , " attach to the
            # left, rebuilding the number.
            $text = $text.$words[$i];
            $prependSpace = " ";
        }
        elsif ((($language eq "fr") || ($language eq "it")) && ($i <= (scalar(@words)-2)) && ($words[$i] =~ /[\p{IsAlpha}][\']$/) && ($words[$i+1] =~ /^[\p{IsAlpha}]/)) {
            # French/Italian: token ending in letter + apostrophe followed
            # by a token starting with a letter; no space after it.
            $text = $text.$prependSpace.$words[$i];
            $prependSpace = "";
        }
        elsif (($language eq "cs") && ($i < (scalar(@words)-3))
               && ($words[$i] =~ /[\p{IsAlpha}]$/)
               && ($words[$i+1] =~ /^[-–]$/)
               && ($words[$i+2] =~ /^li$|^mail.*/i)
              ) {
            # Czech: word, dash, then "li" or "mail..." -- emit word and
            # dash together and let the following token attach directly.
            # NOTE(review): the bound "< scalar(@words)-3" means this never
            # fires when the "li"/"mail" token is the last token -- confirm
            # that is intended.
            $text = $text.$prependSpace.$words[$i].$words[$i+1];
            $i++;    # the dash has been consumed
            $prependSpace = "";
        }
        elsif ($words[$i] =~ /^[\'\"„“”`]+$/) {
            # Quote tokens.  The closing curly quote is now part of this
            # class; previously it was normalised below but could never
            # reach this branch, so it was emitted with a space before it.
            my $normalized_quo = $words[$i];
            # All curly double quotes share one counter with the ASCII one.
            $normalized_quo = '"' if $words[$i] =~ /^[„“”]+$/;
            $quoteCount{$normalized_quo} = 0
                if !defined $quoteCount{$normalized_quo};

            # Czech quotes are directional: the low quote always opens and
            # the high one always closes, so force the parity.
            if ($language eq "cs" && $words[$i] eq "„") {
                $quoteCount{$normalized_quo} = 0;
            }
            if ($language eq "cs" && $words[$i] eq "“") {
                $quoteCount{$normalized_quo} = 1;
            }

            if (($quoteCount{$normalized_quo} % 2) == 0) {
                if (($language eq "en") && ($words[$i] eq "'") && ($i > 0) && ($words[$i-1] =~ /[s]$/)) {
                    # English: a lone apostrophe after a word ending in "s"
                    # is attached to the left (plural possessive) and is
                    # not counted as a quote.
                    $text = $text.$words[$i];
                    $prependSpace = " ";
                }
                else {
                    # Opening quote: attach to the token on the right.
                    $text = $text.$prependSpace.$words[$i];
                    $prependSpace = "";
                    $quoteCount{$normalized_quo}++;
                }
            }
            else {
                # Closing quote: attach to the token on the left.
                $text = $text.$words[$i];
                $prependSpace = " ";
                $quoteCount{$normalized_quo}++;
            }
        }
        elsif (($language eq "fi") && ($i > 0) && ($words[$i-1] =~ /:$/) && ($words[$i] =~ /^(N|n|A|a|Ä|ä|ssa|Ssa|ssä|Ssä|sta|stä|Sta|Stä|hun|Hun|hyn|Hyn|han|Han|hän|Hän|hön|Hön|un|Un|yn|Yn|an|An|än|Än|ön|Ön|seen|Seen|lla|Lla|llä|Llä|lta|Lta|ltä|Ltä|lle|Lle|ksi|Ksi|kse|Kse|tta|Tta|ine|Ine)(ni|si|mme|nne|nsa)?(ko|kö|han|hän|pa|pä|kaan|kään|kin)?$/)) {
            # Finnish: one of the listed endings after a token ending in a
            # colon is lowercased and attached to the left.  The $i > 0
            # guard keeps $words[$i-1] from wrapping round to the last
            # token.
            $text = $text. lc $words[$i];
            $prependSpace = " ";
        }
        else {
            # Ordinary token.
            $text = $text.$prependSpace.$words[$i];
            $prependSpace = " ";
        }
    }

    # Collapse runs of spaces and trim spaces at the edges / around
    # newlines.
    $text =~ s/ +/ /g;
    $text =~ s/\n /\n/g;
    $text =~ s/ \n/\n/g;
    $text =~ s/^ //g;
    $text =~ s/ $//g;

    # Guarantee a trailing newline.
    $text .= "\n" unless $text =~ /\n$/;

    # Optionally uppercase the first letter, skipping leading punctuation
    # and whitespace.
    $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;

    return $text;
}
|
|
|
# Detokenize one line that was tokenized in Penn Treebank style.
#
# Parameters:
#   $text - one tokenized line (a trailing newline is removed first)
# Returns:
#   the detokenized line, ending in a newline.
#
# Reads the file-scoped setting $UPPERCASE_SENT.
sub detokenize_penn {
    my ($text) = @_;

    chomp($text);
    $text = " " . $text . " ";

    # Rejoin hyphens and slashes that were split out with @-@ / @/@ markers.
    $text =~ s/ \@\-\@ /-/g;
    $text =~ s/ \@\/\@ /\//g;
    $text = deescape($text);

    # Glue split contractions back together.
    $text =~ s/ n't /n't /g;
    $text =~ s/ N'T /N'T /g;
    $text =~ s/ ([Cc])an not / $1annot /g;
    $text =~ s/ ([Dd])' ye / $1'ye /g;
    $text =~ s/ ([Gg])im me / $1imme /g;
    $text =~ s/ ([Gg])on na / $1onna /g;
    $text =~ s/ ([Gg])ot ta / $1otta /g;
    $text =~ s/ ([Ll])em me / $1emme /g;
    $text =~ s/ '([Tt]) is / '$1is /g;
    $text =~ s/ '([Tt]) was / '$1was /g;
    $text =~ s/ ([Ww])an na / $1anna /g;

    # Bracket placeholders back to literal brackets.  Kept as separate
    # passes, in this order.
    $text =~ s/-LRB-/\(/g;
    $text =~ s/-RRB-/\)/g;
    $text =~ s/-LSB-/\[/g;
    $text =~ s/-RSB-/\]/g;
    $text =~ s/-LCB-/{/g;
    $text =~ s/-RCB-/}/g;

    # Backtick tokens open a quotation, apostrophe tokens close one; both
    # are rendered with plain ASCII quote characters.
    my %opening_for = ("`"    => "\'", "``"   => "\"");
    my %closing_for = ("\'"   => "\'", "\'\'" => "\"");

    my @tokens = split(/ /, $text);
    my $result = "";
    my $glue   = " ";    # inserted before the next token that wants a space

    for my $pos (0 .. $#tokens) {
        my $tok = $tokens[$pos];

        if ($tok =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) {
            # Currency symbols / opening brackets: stick to the right.
            $result .= $glue . $tok;
            $glue = "";
        }
        elsif ($tok =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/) {
            # Closing punctuation: stick to the left.
            $result .= $tok;
            $glue = " ";
        }
        elsif (($pos > 0) && ($tok =~ /^[\'][\p{IsAlpha}]/) && ($tokens[$pos-1] =~ /[\p{IsAlnum}]$/)) {
            # Apostrophe + letter after an alphanumeric token: stick to the
            # left.
            $result .= $tok;
            $glue = " ";
        }
        elsif (exists $opening_for{$tok}) {
            $result .= $glue . $opening_for{$tok};
            $glue = "";
        }
        elsif (exists $closing_for{$tok}) {
            $result .= $closing_for{$tok};
            $glue = " ";
        }
        else {
            # Ordinary token.
            $result .= $glue . $tok;
            $glue = " ";
        }
    }

    # Collapse runs of spaces and trim spaces at the edges / around
    # newlines.
    $result =~ s/ +/ /g;
    $result =~ s/\n /\n/g;
    $result =~ s/ \n/\n/g;
    $result =~ s/^ //g;
    $result =~ s/ $//g;

    # Guarantee a trailing newline.
    if ($result !~ /\n$/) {
        $result .= "\n";
    }

    # Optionally uppercase the first letter, skipping leading punctuation
    # and whitespace.
    if ($UPPERCASE_SENT) {
        $result =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e;
    }

    return $result;
}
|
|
|
# Report whether the first character of $str is classified as CJK by
# charIsCJK.  An empty string yields 0.
sub startsWithCJKChar {
    my ($str) = @_;
    my $len = length($str);
    if ($len == 0) {
        return 0;
    }
    return charIsCJK(substr($str, 0, 1));
}
|
|
|
# Report whether the last character of $str is classified as CJK by
# charIsCJK.  An empty string yields 0.
sub endsWithCJKChar {
    my ($str) = @_;
    my $len = length($str);
    if ($len == 0) {
        return 0;
    }
    return charIsCJK(substr($str, $len - 1, 1));
}
|
|
|
|
|
|
|
# Return 1 if the given character's code point lies in one of the ranges
# treated as CJK here, otherwise 0.
#
# Parameters:
#   $char - the character to classify
sub charIsCJK {
    my ($char) = @_;

    my $codepoint = codepoint_dec($char);

    # Inclusive [first, last] code point ranges, written in hex.  Block
    # names in the comments follow the Unicode code charts.
    my @cjk_ranges = (
        [ '1100',  '11FF'  ],    # Hangul Jamo
        [ '2E80',  'A4CF'  ],    # CJK Radicals Supplement .. Yi Radicals
        [ 'A840',  'A87F'  ],    # Phags-pa
        [ 'AC00',  'D7AF'  ],    # Hangul Syllables
        [ 'F900',  'FAFF'  ],    # CJK Compatibility Ideographs
        [ 'FE30',  'FE4F'  ],    # CJK Compatibility Forms
        [ 'FF65',  'FFDC'  ],    # Halfwidth Katakana and Hangul forms
        [ '20000', '2FFFF' ],    # Supplementary Ideographic Plane
    );

    for my $range (@cjk_ranges) {
        my ($first, $last) = @{$range};
        if (between_hexes($codepoint, $first, $last)) {
            return 1;
        }
    }

    return 0;
}
|
|
|
|
|
# Return the numeric code point of a character.
#
# Parameters:
#   $char - the character to look up
# Returns:
#   what unpack('U0U*', $char) yields: in scalar context the code point of
#   the first character.  Returns nothing (undef in scalar context) when
#   $char is undefined or empty.
#
# The argument is checked for definedness and length rather than truth, so
# the character "0" is converted like any other instead of being treated as
# "no input".
sub codepoint_dec {
    my ($char) = @_;
    return unless defined $char && length $char;
    return unpack('U0U*', $char);
}
|
|
|
# Test whether $num lies in the inclusive range given by two hex strings.
#
# Parameters:
#   $num   - number to test
#   $left  - lower bound as a hexadecimal string (e.g. '2E80')
#   $right - upper bound as a hexadecimal string
# Returns:
#   true when hex($left) <= $num <= hex($right), false otherwise.
sub between_hexes {
    my ($num, $left, $right) = @_;
    my $low  = hex($left);
    my $high = hex($right);
    return $num >= $low && $num <= $high;
}
|
|