File size: 1,313 Bytes
158b61b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
use warnings;
use strict;
# Main script: replace every word of the input by its cluster label.
#
# Positional arguments:
#   $lowercase    - either a true/false flag (lowercase each word in Perl
#                   before the lookup) or, if it contains a '/', a command
#                   that is fed the input on its stdin and whose stdout is
#                   read instead of the raw input
#   $cluster_file - word-to-cluster table, loaded by read_cluster_from_mkcls
#   $in           - input text, whitespace-separated words ('-' = STDIN)
#   $out          - output file ('-' = STDOUT)
#   $tmp          - accepted for call compatibility; not used here
#
# For each input line one output line is written, holding the cluster label
# of every word separated by single spaces. Words missing from the table
# get the label "0".
my ($lowercase,$cluster_file,$in,$out,$tmp) = @ARGV;
die("ERROR: syntax: $0 lowercase cluster_file in out [tmp]\n") unless defined($out);

my $CLUSTER = read_cluster_from_mkcls($cluster_file);

my $in_fh;
# is $lowercase a script?
if ($lowercase =~ /\//) {
    # NOTE: the command line is handed to the shell with $in interpolated
    # unquoted, so $in must not contain shell metacharacters or spaces.
    open($in_fh, '-|', "$lowercase < $in")
        || die("ERROR: could not open input '$lowercase < $in': $!");
    # the external command already did the lowercasing
    $lowercase = 0;
}
elsif ($in eq '-') {
    # keep the two-argument open() convention: '-' means standard input
    $in_fh = \*STDIN;
}
else {
    open($in_fh, '<', $in) || die("ERROR: could not open input '$in': $!");
}
binmode($in_fh, ":utf8");

my $out_fh;
if ($out eq '-') {
    # keep the two-argument open() convention: '>-' means standard output
    $out_fh = \*STDOUT;
}
else {
    open($out_fh, '>', $out) || die("ERROR: could not open output '$out': $!");
}
binmode($out_fh, ":utf8");

while (my $line = <$in_fh>) {
    # Collapse every whitespace run, line terminator included, to one space.
    # This replaces the former chop(), which cut off the last character of a
    # final line that had no trailing newline.
    $line =~ s/\s+/ /g;

    my @clusters;
    foreach my $word (split(' ', $line)) {
        # if lowercase is a flag
        $word = lc($word) if $lowercase;
        push @clusters, defined($$CLUSTER{$word}) ? $$CLUSTER{$word} : "0";
    }
    print {$out_fh} join(" ", @clusters), "\n";
}

# Buffered write errors only surface at close, so the output close is fatal.
close($out_fh) || die("ERROR: could not write output '$out': $!");
# A failing lowercase command is reported, but the output already written
# is kept.
close($in_fh)
    || warn("WARNING: input did not close cleanly: " . ($! || "exit status $?") . "\n");
# Load a word-to-cluster table from a file.
#
# Every line is split on whitespace: the first field is the word, the second
# its cluster label. Lines with fewer than two fields (e.g. blank lines) are
# skipped. The file is read through the :utf8 layer.
#
# Parameters:
#   $file - path of the cluster file
# Returns:
#   reference to a hash mapping word => cluster label
# Dies if the file cannot be opened.
sub read_cluster_from_mkcls {
    my ($file) = @_;
    my %CLUSTER;
    open(my $cluster_fh, '<:utf8', $file)
        || die("ERROR: could not open cluster file '$file': $!");
    while (my $line = <$cluster_fh>) {
        # split(' ') discards leading/trailing whitespace including the line
        # terminator, so no chop/chomp is needed. The former chop() cut the
        # last character of the label when the final line had no newline.
        my ($word, $cluster) = split(' ', $line);
        next unless defined($cluster);
        $CLUSTER{$word} = $cluster;
    }
    close($cluster_fh);
    return \%CLUSTER;
}
# Placeholder with no implementation: ignores its arguments and returns
# nothing (empty list in list context, undef in scalar context). Nothing in
# the visible part of this file calls it.
sub add_cluster_to_string {
    return;
}
|