Spaces:
Running
Running
#!/usr/bin/env perl | |
# | |
# This file is part of moses. Its use is licensed under the GNU Lesser General | |
# Public License version 2.1 or, at your option, any later version. | |
use warnings; | |
use strict; | |
my $language = "en"; | |
my $PENN = 0; | |
while (@ARGV) { | |
$_ = shift; | |
/^-b$/ && ($| = 1, next); # not buffered (flush each line) | |
/^-l$/ && ($language = shift, next); | |
/^[^\-]/ && ($language = $_, next); | |
/^-penn$/ && ($PENN = 1, next); | |
} | |
while(<STDIN>) { | |
s/\r//g; | |
# remove extra spaces | |
s/\(/ \(/g; | |
s/\)/\) /g; s/ +/ /g; | |
s/\) ([\.\!\:\?\;\,])/\)$1/g; | |
s/\( /\(/g; | |
s/ \)/\)/g; | |
s/(\d) \%/$1\%/g; | |
s/ :/:/g; | |
s/ ;/;/g; | |
# normalize unicode punctuation | |
if ($PENN == 0) { | |
s/\`/\'/g; | |
s/\'\'/ \" /g; | |
} | |
s/„/\"/g; | |
s/“/\"/g; | |
s/”/\"/g; | |
s/–/-/g; | |
s/—/ - /g; s/ +/ /g; | |
s/´/\'/g; | |
s/([a-z])‘([a-z])/$1\'$2/gi; | |
s/([a-z])’([a-z])/$1\'$2/gi; | |
s/‘/\'/g; | |
s/‚/\'/g; | |
s/’/\"/g; | |
s/''/\"/g; | |
s/´´/\"/g; | |
s/…/.../g; | |
# French quotes | |
s/ « / \"/g; | |
s/« /\"/g; | |
s/«/\"/g; | |
s/ » /\" /g; | |
s/ »/\"/g; | |
s/»/\"/g; | |
# handle pseudo-spaces | |
s/ \%/\%/g; | |
s/nº /nº /g; | |
s/ :/:/g; | |
s/ ºC/ ºC/g; | |
s/ cm/ cm/g; | |
s/ \?/\?/g; | |
s/ \!/\!/g; | |
s/ ;/;/g; | |
s/, /, /g; s/ +/ /g; | |
# English "quotation," followed by comma, style | |
if ($language eq "en") { | |
s/\"([,\.]+)/$1\"/g; | |
} | |
# Czech is confused | |
elsif ($language eq "cs" || $language eq "cz") { | |
} | |
# German/Spanish/French "quotation", followed by comma, style | |
else { | |
s/,\"/\",/g; | |
s/(\.+)\"(\s*[^<])/\"$1$2/g; # don't fix period at end of sentence | |
} | |
if ($language eq "de" || $language eq "es" || $language eq "cz" || $language eq "cs" || $language eq "fr") { | |
s/(\d) (\d)/$1,$2/g; | |
} | |
else { | |
s/(\d) (\d)/$1.$2/g; | |
} | |
print $_; | |
} | |