NMTKD / translation /tools /mosesdecoder /scripts /tokenizer /replace-unicode-punctuation.perl
sakharamg's picture
Uploading all files
158b61b
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
use warnings;
use strict;
# Drain the command line.  The only switch acted upon is -b; every other
# argument is consumed and silently ignored.
while (@ARGV) {
    my $arg = shift @ARGV;
    next unless $arg =~ /^-b$/;
    $| = 1;    # -b: not buffered (flush output after each line)
}
# Input and output are intentionally left as raw bytes (no I/O layers): the
# replacement table below is encoded to UTF-8 octets and matched byte-wise,
# so any byte sequence that is not one of the listed characters is passed
# through untouched.
#binmode(STDIN, ":utf8");
#binmode(STDOUT, ":utf8");

# Unicode punctuation / fullwidth digit => ASCII replacement.
# The characters are written as \x{...} escapes on purpose: if the literal
# fullwidth characters are ever folded to ASCII (e.g. by Unicode
# normalisation of this file) the rules degrade into no-ops, invalid
# patterns such as s/?/.../, or a catch-all such as s/. */. /.
my %ascii_for = (
    "\x{FF0C}" => ',',      # FULLWIDTH COMMA
    "\x{3001}" => ',',      # IDEOGRAPHIC COMMA
    "\x{201D}" => '"',      # RIGHT DOUBLE QUOTATION MARK
    "\x{201C}" => '"',      # LEFT DOUBLE QUOTATION MARK
    "\x{2236}" => ':',      # RATIO
    "\x{FF1A}" => ':',      # FULLWIDTH COLON
    "\x{FF1F}" => '?',      # FULLWIDTH QUESTION MARK
    "\x{300A}" => '"',      # LEFT DOUBLE ANGLE BRACKET
    "\x{300B}" => '"',      # RIGHT DOUBLE ANGLE BRACKET
    "\x{FF09}" => ')',      # FULLWIDTH RIGHT PARENTHESIS
    "\x{FF01}" => '!',      # FULLWIDTH EXCLAMATION MARK
    "\x{FF08}" => '(',      # FULLWIDTH LEFT PARENTHESIS
    "\x{FF1B}" => ';',      # FULLWIDTH SEMICOLON
    "\x{300D}" => '"',      # RIGHT CORNER BRACKET
    "\x{300C}" => '"',      # LEFT CORNER BRACKET
    "\x{FF10}" => '0',      # FULLWIDTH DIGIT ZERO
    "\x{FF11}" => '1',      # FULLWIDTH DIGIT ONE
    "\x{FF12}" => '2',      # FULLWIDTH DIGIT TWO
    "\x{FF13}" => '3',      # FULLWIDTH DIGIT THREE
    "\x{FF14}" => '4',      # FULLWIDTH DIGIT FOUR
    "\x{FF15}" => '5',      # FULLWIDTH DIGIT FIVE
    "\x{FF16}" => '6',      # FULLWIDTH DIGIT SIX
    "\x{FF17}" => '7',      # FULLWIDTH DIGIT SEVEN
    "\x{FF18}" => '8',      # FULLWIDTH DIGIT EIGHT
    "\x{FF19}" => '9',      # FULLWIDTH DIGIT NINE
    "\x{FF5E}" => '~',      # FULLWIDTH TILDE
    "\x{2019}" => "'",      # RIGHT SINGLE QUOTATION MARK
    "\x{2026}" => '...',    # HORIZONTAL ELLIPSIS
    "\x{2501}" => '-',      # BOX DRAWINGS HEAVY HORIZONTAL
    "\x{3008}" => '<',      # LEFT ANGLE BRACKET
    "\x{3009}" => '>',      # RIGHT ANGLE BRACKET
    "\x{3010}" => '[',      # LEFT BLACK LENTICULAR BRACKET
    "\x{3011}" => ']',      # RIGHT BLACK LENTICULAR BRACKET
    "\x{FF05}" => '%',      # FULLWIDTH PERCENT SIGN
);

# Full stops are handled separately: the stop together with any run of
# spaces following it becomes ". " (period plus exactly one space).
my @full_stops = (
    "\x{3002}",             # IDEOGRAPHIC FULL STOP
    "\x{FF0E}",             # FULLWIDTH FULL STOP
);

# Return the UTF-8 octet encoding of a character string.
sub _utf8_bytes {
    my ($chars) = @_;
    utf8::encode($chars);   # in place, on our private copy
    return $chars;
}

# Same table keyed by UTF-8 octets, plus the two precompiled byte patterns.
my %ascii_for_bytes =
    map { ( _utf8_bytes($_), $ascii_for{$_} ) } keys %ascii_for;
my $full_stop_alternatives =
    join '|', map { quotemeta _utf8_bytes($_) } @full_stops;
my $full_stop_re = qr/(?:$full_stop_alternatives) */;
my $char_alternatives =
    join '|', map { quotemeta $_ } keys %ascii_for_bytes;
my $char_re = qr/($char_alternatives)/;

# Replace Unicode punctuation and fullwidth digits in one line of UTF-8
# encoded bytes with their ASCII equivalents.
#   $line   - a byte string (typically one input line, newline included)
#   returns - the converted byte string; ASCII text is returned unchanged
# Every replacement is plain ASCII and no rule matches ASCII, so applying
# the rules in two passes gives the same result as one substitution per
# character applied in sequence.
sub replace_unicode_punctuation {
    my ($line) = @_;
    $line =~ s/$full_stop_re/. /g;
    $line =~ s/$char_re/$ascii_for_bytes{$1}/g;
    return $line;
}

# Filter STDIN to STDOUT line by line.
while (my $line = <STDIN>) {
    print replace_unicode_punctuation($line);
}