|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
use strict; |
|
use warnings; |
|
use File::Basename; |
|
use File::Path; |
|
use POSIX; |
|
use List::Util qw( min sum ); |
|
|
|
my ($ini_file, $targetdir) = @ARGV; |
|
|
|
if (! defined $targetdir) { |
|
die "usage: reduce-topt-count.pl moses.ini targetdir\n" |
|
} |
|
|
|
my %ttables; |
|
my $ini_hdl = my_open($ini_file); |
|
my $outini_hdl = my_save("$targetdir/moses.ini"); |
|
|
|
my $section = ""; |
|
|
|
my %section_handlers = ( |
|
'ttable-file' => read_ttable_file(), |
|
'ttable-limit' => read_ttable_limit(), |
|
'weight-t' => read_weight_t() |
|
); |
|
|
|
|
|
my $timestamp = POSIX::strftime("%m/%d/%Y %H:%M:%S", localtime); |
|
print $outini_hdl <<"END"; |
|
|
|
|
|
|
|
END |
|
|
|
|
|
while (<$ini_hdl>) { |
|
chomp(my $line = $_); |
|
my $do_print = 1; |
|
if ($line =~ m/^\s*#/ || $line =~ m/^\s*$/) { |
|
|
|
} elsif ($line =~ m/^\[(.*)\]/) { |
|
$section = $1; |
|
} else { |
|
if (defined $section_handlers{$section}) { |
|
|
|
|
|
|
|
$do_print = 0; |
|
$section_handlers{$section}->($line, $outini_hdl); |
|
} |
|
} |
|
|
|
if ($do_print) { |
|
print $outini_hdl "$line\n"; |
|
} |
|
} |
|
close $outini_hdl; |
|
|
|
|
|
for my $ttable (keys %ttables) { |
|
filter_table($ttables{$ttable}); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
sub read_ttable_file |
|
{ |
|
my $ttable_id = 0; |
|
return sub { |
|
my ($line, $outhdl) = @_; |
|
if ($line !~ m/^(\d+) ([\d\,\-]+) ([\d\,\-]+) (\d+) (\S+)$/) { |
|
die "Format not recognized: $line"; |
|
} |
|
my ($type, $srcfacts, $tgtfacts, $numscores, $file) = ($1, $2, $3, $4, $5); |
|
if ($type != 0) { |
|
die "Cannot work with ttables of type $type"; |
|
} |
|
$ttables{$ttable_id} = { |
|
file => $file, |
|
scores => $numscores |
|
}; |
|
|
|
print $outhdl |
|
"$type $srcfacts $tgtfacts $numscores $targetdir/", basename($file), "\n"; |
|
$ttable_id++; |
|
} |
|
} |
|
|
|
sub read_ttable_limit |
|
{ |
|
my $ttable_id = 0; |
|
return sub { |
|
my ($line, $outhdl) = @_; |
|
$ttables{$ttable_id}->{limit} = $line; |
|
print $outhdl "$line\n"; |
|
$ttable_id++; |
|
} |
|
} |
|
|
|
sub read_weight_t |
|
{ |
|
my $weight_idx = 0; |
|
my $ttable_id = 0; |
|
return sub { |
|
my ($line, $outhdl) = @_; |
|
if ($ttables{$ttable_id}->{scores} == $weight_idx) { |
|
$weight_idx = 0; |
|
$ttable_id++; |
|
} |
|
push @{ $ttables{$ttable_id}->{weights} }, $line; |
|
print $outhdl "$line\n"; |
|
$weight_idx++; |
|
} |
|
} |
|
|
|
sub filter_table |
|
{ |
|
my $ttable = shift; |
|
my $in = my_open($ttable->{file}); |
|
my $out = my_save($targetdir . "/" . basename($ttable->{file})); |
|
my $limit = $ttable->{limit}; |
|
my @weights = @{ $ttable->{weights} }; |
|
|
|
print STDERR "Filtering ", $ttable->{file}, ", using limit $limit\n"; |
|
my $kept = 0; |
|
my $total = 0; |
|
|
|
my $src_phrase = ""; |
|
my @tgt_phrases; |
|
while (<$in>) { |
|
chomp(my $line = $_); |
|
$total++; |
|
print STDERR '.' if $total % 1000 == 0; |
|
my @cols = split / \|\|\| /, $line; |
|
if ($cols[0] ne $src_phrase) { |
|
my @sorted = sort { $b->{score} <=> $a->{score} } @tgt_phrases; |
|
for my $phrase (@sorted[0 .. min($#sorted, $limit - 1)]) { |
|
$kept++; |
|
print $out $phrase->{str}, "\n"; |
|
} |
|
$src_phrase = $cols[0]; |
|
@tgt_phrases = (); |
|
} |
|
my @scores = split ' ', $cols[2]; |
|
push @tgt_phrases, { |
|
str => $line, |
|
score => sum(map { $weights[$_] * log $scores[$_] } (0 .. $#weights)) |
|
}; |
|
} |
|
printf STDERR "Finished, kept %d%% of phrases\n", $kept / $total * 100; |
|
close $in; |
|
close $out; |
|
} |
|
|
|
sub my_open { |
|
my $f = shift; |
|
die "Not found: $f" if ! -e $f; |
|
|
|
my $opn; |
|
my $hdl; |
|
my $ft = `file $f`; |
|
|
|
if ($f =~ /\.gz$/ || $ft =~ /gzip compressed data/) { |
|
$opn = "zcat $f |"; |
|
} elsif ($f =~ /\.bz2$/ || $ft =~ /bzip2 compressed data/) { |
|
$opn = "bzcat $f |"; |
|
} else { |
|
$opn = "$f"; |
|
} |
|
open $hdl, $opn or die "Can't open '$opn': $!"; |
|
binmode $hdl, ":utf8"; |
|
return $hdl; |
|
} |
|
|
|
sub my_save { |
|
my $f = shift; |
|
if ($f eq "-") { |
|
binmode(STDOUT, ":utf8"); |
|
return *STDOUT; |
|
} |
|
|
|
my $opn; |
|
my $hdl; |
|
|
|
if ($f =~ /\.gz$/) { |
|
$opn = "| gzip -c > '$f'"; |
|
} elsif ($f =~ /\.bz2$/) { |
|
$opn = "| bzip2 > '$f'"; |
|
} else { |
|
$opn = ">$f"; |
|
} |
|
mkpath( dirname($f) ); |
|
open $hdl, $opn or die "Can't write to '$opn': $!"; |
|
binmode $hdl, ":utf8"; |
|
return $hdl; |
|
} |
|
|
|
|