File size: 4,285 Bytes
158b61b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 |
#!/usr/bin/perl -w
# Sample Tokenizer
# written by Josh Schroeder, based on code by Philipp Koehn
# This added by Herve Saint-Amand for compatibility with translate.cgi
$|++;
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
use FindBin qw($Bin);
use strict;
#use Time::HiRes;
my $mydir = "$Bin/nonbreaking_prefixes";
my %NONBREAKING_PREFIX = ();
my $language = "en";
my $QUIET = 0;
my $HELP = 0;
#my $start = [ Time::HiRes::gettimeofday( ) ];
while (@ARGV) {
$_ = shift;
/^-l$/ && ($language = shift, next);
/^-q$/ && ($QUIET = 1, next);
/^-h$/ && ($HELP = 1, next);
}
if ($HELP) {
print "Usage ./tokenizer.perl (-l [en|de|...]) < textfile > tokenizedfile\n";
exit;
}
if (!$QUIET) {
print STDERR "Tokenizer v3\n";
print STDERR "Language: $language\n";
}
load_prefixes($language,\%NONBREAKING_PREFIX);
if (scalar(%NONBREAKING_PREFIX) eq 0){
print STDERR "Warning: No known abbreviations for language '$language'\n";
}
while(<STDIN>) {
if (/^<.+>$/ || /^\s*$/) {
#don't try to tokenize XML/HTML tag lines
print $_;
}
else {
print &tokenize($_);
}
}
#my $duration = Time::HiRes::tv_interval( $start );
#print STDERR ("EXECUTION TIME: ".$duration."\n");
sub tokenize {
my($text) = @_;
chomp($text);
$text = " $text ";
# seperate out all "other" special characters
$text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
#multi-dots stay together
$text =~ s/\.([\.]+)/ DOTMULTI$1/g;
while($text =~ /DOTMULTI\./) {
$text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
$text =~ s/DOTMULTI\./DOTDOTMULTI/g;
}
# seperate out "," except if within numbers (5,300)
$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
# separate , pre and post number
$text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
$text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
# turn `into '
$text =~ s/\`/\'/g;
#turn '' into "
$text =~ s/\'\'/ \" /g;
if ($language eq "en") {
#split contractions right
$text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
#special case for "1990's"
$text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
} elsif (($language eq "fr") or ($language eq "it")) {
#split contractions left
$text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
} else {
$text =~ s/\'/ \' /g;
}
#word token method
my @words = split(/\s/,$text);
$text = "";
for (my $i=0;$i<(scalar(@words));$i++) {
my $word = $words[$i];
if ( $word =~ /^(\S+)\.$/) {
my $pre = $1;
if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/))) {
#no change
} elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/))) {
#no change
} else {
$word = $pre." .";
}
}
$text .= $word." ";
}
# clean up extraneous spaces
$text =~ s/ +/ /g;
$text =~ s/^ //g;
$text =~ s/ $//g;
#restore multi-dots
while($text =~ /DOTDOTMULTI/) {
$text =~ s/DOTDOTMULTI/DOTMULTI./g;
}
$text =~ s/DOTMULTI/./g;
#ensure final line break
$text .= "\n" unless $text =~ /\n$/;
return $text;
}
sub load_prefixes {
my ($language, $PREFIX_REF) = @_;
my $prefixfile = "$mydir/nonbreaking_prefix.$language";
#default back to English if we don't have a language-specific prefix file
if (!(-e $prefixfile)) {
$prefixfile = "$mydir/nonbreaking_prefix.en";
print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
}
if (-e "$prefixfile") {
open(PREFIX, "<:utf8", "$prefixfile");
while (<PREFIX>) {
my $item = $_;
chomp($item);
if (($item) && (substr($item,0,1) ne "#")) {
if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) {
$PREFIX_REF->{$1} = 2;
} else {
$PREFIX_REF->{$item} = 1;
}
}
}
close(PREFIX);
}
}
|