|
|
|
|
|
|
|
|
|
|
|
|
|
use FindBin qw($Bin);
use strict;

# Unbuffered output, and UTF-8 layers on both standard streams.
$| = 1;
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");

# Directory (next to this script) holding the per-language prefix files.
my $mydir = "$Bin/nonbreaking_prefixes";

# Prefix table filled by load_prefixes(); values are 1 or 2 (see that sub).
my %NONBREAKING_PREFIX = ();

# Command-line settings and their defaults.
my $language = "en";
my $QUIET = 0;
my $HELP = 0;

# Option parsing: -l <lang>, -q (quiet), -h (help).
# Anything else on the command line is consumed and ignored.
while (@ARGV) {
    my $arg = shift @ARGV;
    if ($arg =~ /^-l$/) {
        $language = shift @ARGV;
    }
    elsif ($arg =~ /^-q$/) {
        $QUIET = 1;
    }
    elsif ($arg =~ /^-h$/) {
        $HELP = 1;
    }
}

if ($HELP) {
    print "Usage ./tokenizer.perl (-l [en|de|...]) < textfile > tokenizedfile\n";
    exit;
}

# Banner goes to STDERR so it never mixes with the tokenized output.
unless ($QUIET) {
    print STDERR "Tokenizer v3\n";
    print STDERR "Language: $language\n";
}

load_prefixes($language, \%NONBREAKING_PREFIX);

# A hash in boolean context is false exactly when it has no entries.
unless (%NONBREAKING_PREFIX) {
    print STDERR "Warning: No known abbreviations for language '$language'\n";
}

# Main filter loop: lines that are a single <...> tag or that contain only
# whitespace are copied through unchanged; everything else is tokenized.
while (my $line = <STDIN>) {
    my $passthrough = ($line =~ /^<.+>$/ || $line =~ /^\s*$/);
    my $output = $passthrough ? $line : tokenize($line);
    print $output;
}
|
|
|
|
|
|
|
|
|
|
|
# Tokenize a single line of text.
#
# Takes one string (a trailing newline is removed first) and returns the
# tokenized line: tokens separated by single spaces, no leading or trailing
# space, terminated by "\n".
#
# Reads the file-level $language (selects the apostrophe rules) and
# %NONBREAKING_PREFIX (decides which word-final periods stay attached).
sub tokenize {
    my ($buf) = @_;

    chomp($buf);
    $buf = " " . $buf . " ";

    # Put spaces around every character that is not alphanumeric, whitespace,
    # or one of . ' ` , - (those are handled by the dedicated rules below).
    $buf =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;

    # Protect runs of two or more periods: the run is rewritten into a
    # placeholder word (DOTMULTI prefixed with one extra "DOT" per additional
    # period) so the period rules below cannot touch it.
    $buf =~ s/\.([\.]+)/ DOTMULTI$1/g;
    while ($buf =~ /DOTMULTI\./) {
        $buf =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
        $buf =~ s/DOTMULTI\./DOTDOTMULTI/g;
    }

    # Separate a comma unless it has a numeric character on both sides.
    $buf =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
    $buf =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
    $buf =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;

    # Backticks become apostrophes, then a doubled apostrophe becomes a
    # stand-alone double quote.
    $buf =~ s/\`/\'/g;
    $buf =~ s/\'\'/ \" /g;

    # Apostrophe handling depends on the language. The order of the rules
    # within each language is significant and is kept exactly.
    my $english = ($language eq "en");
    my $romance = (($language eq "fr") or ($language eq "it"));

    if ($english || $romance) {
        $buf =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;

        if ($english) {
            $buf =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
        }
        else {
            $buf =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
        }

        $buf =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;

        if ($english) {
            # Between letters the apostrophe goes with what follows it.
            $buf =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
            # Number followed by 's: split before the apostrophe.
            $buf =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
        }
        else {
            # Between letters the apostrophe stays with what precedes it.
            $buf =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
        }
    }
    else {
        # Any other language: every apostrophe becomes its own token.
        $buf =~ s/\'/ \' /g;
    }

    # Word-final periods: examine each whitespace-separated word in turn.
    my @tokens = split(/\s/, $buf);
    $buf = "";
    foreach my $idx (0 .. $#tokens) {
        my $token = $tokens[$idx];

        if ($token =~ /^(\S+)\.$/) {
            my $stem     = $1;
            my $class    = $NONBREAKING_PREFIX{$stem};
            my $has_next = ($idx < $#tokens);

            # The period stays attached when any of these holds:
            #   - the stem itself contains a period and an alphabetic character
            #   - the stem is a class-1 prefix
            #   - the following word starts with a lowercase letter
            #   - the stem is a class-2 prefix and the following word starts
            #     with a digit
            my $keep_period =
                   ($stem =~ /\./ && $stem =~ /\p{IsAlpha}/)
                || ($class && $class == 1)
                || ($has_next && ($tokens[$idx + 1] =~ /^[\p{IsLower}]/))
                || (($class && $class == 2)
                    && ($has_next && ($tokens[$idx + 1] =~ /^[0-9]+/)));

            $token = $stem . " ." unless $keep_period;
        }

        $buf .= $token . " ";
    }

    # Collapse runs of spaces and strip one leading / one trailing space.
    $buf =~ s/ +/ /g;
    $buf =~ s/^ //g;
    $buf =~ s/ $//g;

    # Turn the multi-period placeholders back into literal periods: each pass
    # converts one "DOT" prefix into a trailing period.
    1 while $buf =~ s/DOTDOTMULTI/DOTMULTI./g;
    $buf =~ s/DOTMULTI/./g;

    # Guarantee exactly one line terminator.
    $buf .= "\n" unless $buf =~ /\n$/;

    return $buf;
}
|
|
|
# Load the non-breaking prefix list for a language into a hash.
#
# Parameters:
#   $language   - language code; used as the suffix of the prefix file name
#                 ("$mydir/nonbreaking_prefix.$language").
#   $PREFIX_REF - hash reference to fill. Each entry of the file becomes a
#                 key; the value is 2 when the line is tagged with
#                 #NUMERIC_ONLY# and 1 otherwise.
#
# If the file for $language does not exist, a warning is printed to STDERR and
# the English file is used instead.
#
# Dies when neither file exists, or when the selected file cannot be opened.
sub load_prefixes {
    my ($language, $PREFIX_REF) = @_;

    my $prefixfile = "$mydir/nonbreaking_prefix.$language";

    # Fall back to the English list when there is none for this language.
    if (!(-e $prefixfile)) {
        $prefixfile = "$mydir/nonbreaking_prefix.en";
        print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
        die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
    }

    # Existence was established above, but the file may still be unreadable
    # (permissions, a directory, ...). Fail loudly instead of silently
    # continuing with an empty prefix list.
    open(my $prefix_fh, "<:utf8", $prefixfile)
        or die "ERROR: Cannot open abbreviations file $prefixfile: $!\n";

    # A lexical loop variable keeps the caller's $_ untouched.
    while (my $item = <$prefix_fh>) {
        chomp($item);

        # Skip empty lines (a line that is just "0" is also false and is
        # skipped, as before) and comment lines starting with '#'.
        next unless $item;
        next if substr($item, 0, 1) eq "#";

        if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) {
            # Entry tagged #NUMERIC_ONLY#: stored with value 2.
            $PREFIX_REF->{$1} = 2;
        }
        else {
            # Untagged entry: stored with value 1.
            $PREFIX_REF->{$item} = 1;
        }
    }

    close($prefix_fh);

    return;
}
|
|
|
|