|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
use strict;
use warnings;

# Autoflush the selected output handle (STDOUT) so each detokenized line is
# written as soon as it is produced, e.g. when this script sits in a pipe.
$| = 1;

# Decode input and encode output as UTF-8 so the Unicode properties used by
# detokenize (\p{IsAlpha}, \p{IsSc}, ...) operate on characters, not bytes.
binmode(STDIN,  ":utf8");
binmode(STDOUT, ":utf8");

# Command-line state.
my $language = "en";    # language code, set with -l (read by detokenize)
my $QUIET    = 0;       # -q: suppress the banner on STDERR
my $HELP     = 0;       # -h: print usage and exit

# Option parsing. Arguments that are not recognised are ignored.
while (@ARGV) {
    my $arg = shift @ARGV;

    if ($arg eq "-l") {
        # -l needs a value; without this check a trailing -l would leave
        # $language undefined for the rest of the run.
        die "Option -l requires a language code (e.g. -l en)\n" unless @ARGV;
        $language = shift @ARGV;
    } elsif ($arg eq "-q") {
        $QUIET = 1;
    } elsif ($arg eq "-h") {
        $HELP = 1;
    }
}

if ($HELP) {
    print "Usage ./detokenizer.perl (-l [en|de|...]) < tokenizedfile > detokenizedfile\n";
    exit;
}

# Banner goes to STDERR so it never mixes with the detokenized output.
if (!$QUIET) {
    print STDERR "Detokenizer Version 1.0\n";
    print STDERR "Language: $language\n";
}
|
|
|
# Main loop: stream STDIN to STDOUT one line at a time.
# Lines that consist solely of a <...> span, and lines that are empty or
# whitespace-only, are copied through untouched; everything else is passed
# through detokenize.
while (my $line = <STDIN>) {
    my $is_markup = ($line =~ /^<.+>$/);
    my $is_blank  = ($line =~ /^\s*$/);

    print(($is_markup || $is_blank) ? $line : detokenize($line));
}
|
|
|
# detokenize($text [, $lang])
#
# Turns one space-separated, tokenized line back into ordinary text by
# removing the spaces that tokenization put around punctuation.
#
#   $text - the tokenized line; one trailing newline, if present, is removed.
#   $lang - optional language code. Defaults to the file-level $language.
#           "en" enables English contraction/possessive joining, "fr" enables
#           French elision joining.
#
# Returns the detokenized line, always terminated by a single "\n".
sub detokenize {
    my ($text, $lang) = @_;
    $lang = $language unless defined $lang;
    $lang = ""        unless defined $lang;

    chomp($text);

    # Pad with one space on each side and split on single spaces. The leading
    # pad produces an empty first token; split discards trailing empty fields,
    # so the last element of @words is the last real token. Runs of spaces in
    # the input yield empty tokens, which fall through to the default branch.
    my @words = split(/ /, " $text ");

    my $out          = "";
    my %quoteCount   = ("'" => 0, '"' => 0);   # quotes seen so far, per quote token
    my $prependSpace = " ";                    # separator to emit before the next token

    for my $i (0 .. $#words) {
        my $tok = $words[$i];

        if ($tok =~ /^[\p{IsSc}\(\[\{\x{BF}\x{A1}]+$/) {
            # Currency symbols, opening brackets, inverted ? and !: keep the
            # space before, attach to whatever follows. The inverted marks
            # are written as code points (U+00BF, U+00A1) so the class does
            # not depend on the encoding this source file is saved in.
            $out .= $prependSpace . $tok;
            $prependSpace = "";
        } elsif ($tok =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/) {
            # Closing punctuation: attach to the preceding token.
            $out .= $tok;
            $prependSpace = " ";
        } elsif (($lang eq "en") && ($i > 0) && ($tok =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) {
            # English contraction or possessive ('s, 're, ...): attach left.
            $out .= $tok;
            $prependSpace = " ";
        } elsif (($lang eq "fr") && ($i < $#words) && ($tok =~ /[\p{IsAlpha}][\']$/) && ($words[$i+1] =~ /^[\p{IsAlpha}]/)) {
            # French elision (l', d', ...): attach to the following word.
            # The bound is $#words because a following token exists for
            # every index below the last one.
            $out .= $prependSpace . $tok;
            $prependSpace = "";
        } elsif ($tok =~ /^[\'\"]+$/) {
            # Quote token: an even count so far means this one opens a quoted
            # span, an odd count means it closes one. Tokens made of several
            # quote characters get their own counter, starting at 0.
            my $seen = $quoteCount{$tok} || 0;

            if (($seen % 2) == 0) {
                if (($lang eq "en") && ($tok eq "'") && ($i > 0) && ($words[$i-1] =~ /[s]$/)) {
                    # Apostrophe after a word ending in "s" (plural
                    # possessive): attach left and do not count it as a quote.
                    $out .= $tok;
                    $prependSpace = " ";
                } else {
                    # Opening quote: attach to the following token.
                    $out .= $prependSpace . $tok;
                    $prependSpace = "";
                    $quoteCount{$tok} = $seen + 1;
                }
            } else {
                # Closing quote: attach to the preceding token.
                $out .= $tok;
                $prependSpace = " ";
                $quoteCount{$tok} = $seen + 1;
            }
        } else {
            # Ordinary token (including empty tokens from repeated spaces).
            $out .= $prependSpace . $tok;
            $prependSpace = " ";
        }
    }

    # Collapse repeated spaces and trim spaces at line boundaries.
    $out =~ s/ +/ /g;
    $out =~ s/\n /\n/g;
    $out =~ s/ \n/\n/g;
    $out =~ s/^ //g;
    $out =~ s/ $//g;

    # Guarantee exactly one terminating newline.
    $out .= "\n" unless $out =~ /\n$/;

    return $out;
}
|
|
|
|