File size: 1,313 Bytes
158b61b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/usr/bin/env perl
#
# This file is part of moses.  Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

use warnings;
use strict;

# Command-line arguments, in order:
#   lowercase     either a true/false flag (lowercase every token before the
#                 lookup), or - if it contains a '/' - an external command that
#                 is run as "<command> < in" and whose output is read instead
#                 of the raw input file
#   cluster_file  word-to-cluster table loaded by read_cluster_from_mkcls
#   in            input text, one sentence per line
#   out           output file; receives one cluster id per input token
#   tmp           accepted for command-line compatibility; not used below
my ($lowercase,$cluster_file,$in,$out,$tmp) = @ARGV;
die("ERROR: syntax: lowercase cluster_file in out [tmp]") unless @ARGV >= 4;

my $CLUSTER = read_cluster_from_mkcls($cluster_file);

# is $lowercase a script?
my $in_fh;
if ($lowercase =~ /\//) {
  # The command string contains a shell redirect, so it is deliberately run
  # through the shell; lowercasing is then the command's job, not ours.
  open($in_fh, '-|', "$lowercase < $in") || die("ERROR: could not open input");
  $lowercase = 0;
}
else {
  # Three-argument open: the file name is taken literally.
  open($in_fh, '<', $in) || die("ERROR: could not open input");
}
binmode($in_fh, ":utf8");

open(my $out_fh, '>', $out) || die("ERROR: could not open output '$out': $!");
binmode($out_fh, ":utf8");

while (my $line = <$in_fh>) {
  # chomp, not chop: a final line without a trailing newline must keep its
  # last character.
  chomp($line);

  # split(' ', ...) skips leading whitespace and collapses whitespace runs,
  # so no separate normalisation of the line is needed.
  my @cluster_ids = map {
    # if lowercase is a flag
    my $key = $lowercase ? lc($_) : $_;
    # words missing from the table are mapped to cluster "0"
    defined($CLUSTER->{$key}) ? $CLUSTER->{$key} : "0";
  } split(' ', $line);

  print {$out_fh} join(" ", @cluster_ids), "\n";
}

# Buffered write errors only surface when the handle is closed.
close($out_fh) || die("ERROR: could not write output '$out': $!");
# For a piped input, close() also reports a non-zero exit of the command.
close($in_fh) || warn("WARNING: problem closing input\n");

# Load a word-to-cluster table from a file.
#
# Each line of $file is expected to hold a word followed by its cluster id,
# separated by whitespace; any further columns are ignored. Lines that do not
# supply both fields (for example blank lines) are skipped. If a word occurs
# more than once, the last occurrence wins.
#
# Parameter: $file - path of the cluster file; it is read through the :utf8 layer.
# Returns:   reference to a hash mapping word => cluster id.
# Dies if the file cannot be opened.
sub read_cluster_from_mkcls {
  my ($file) = @_;
  my %CLUSTER;
  open(my $cluster_fh, '<', $file) || die("ERROR: could not open cluster file '$file'");
  binmode($cluster_fh, ":utf8");
  while (my $line = <$cluster_fh>) {
    # chomp, not chop: a final line without a trailing newline must keep the
    # last character of its cluster id.
    chomp($line);
    my ($word,$cluster) = split(' ', $line);
    # Skip blank or incomplete lines instead of storing an undefined entry.
    next unless defined($cluster);
    $CLUSTER{$word} = $cluster;
  }
  close($cluster_fh);
  return \%CLUSTER;
}

# Empty placeholder: the body contains no statements, and nothing in the
# visible part of this file calls it.
# NOTE(review): presumably a leftover stub - confirm there is no external
# caller before removing it.
sub add_cluster_to_string {
}