#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
#######################
# Revision history
#
# 28 Apr 2015 first version
use warnings;
use strict;
use Getopt::Long qw(:config pass_through no_ignore_case permute);
# Main program.
#
# Runs fast_align over a parallel corpus.  If the corpus has more lines than
# -max-lines, it is split into parts of at most that many lines, each part is
# aligned separately, and the per-part alignments (and, with -save-model, the
# per-part parameter and log files) are joined into the final output files.
my ($BIN,$IN,$OUT,$MAX_LINES,$SETTINGS,$REVERSE,$SAVE_MODEL,$TMP);
GetOptions('bin=s' => \$BIN,
           'i=s' => \$IN,
           'o=s' => \$OUT,
           'max-lines=i' => \$MAX_LINES,
           'settings=s' => \$SETTINGS,
           'save-model=s' => \$SAVE_MODEL,
           'r' => \$REVERSE,
           'tmp=s' => \$TMP,
          ) or exit(1);

die("ERROR - usage: fast-align-in-parts.perl -bin FAST_ALIGN_BIN -i PARALLEL_CORPUS -max-lines COUNT -settings CONFIG [-r] -tmp TMPDIR [-save-model MODEL] -o ALIGNMENTS")
  unless defined($BIN) && defined($IN) && defined($SETTINGS) && defined($TMP)
      && defined($MAX_LINES) && defined($OUT)
      && $MAX_LINES > 0;
die("ERROR - input file does not exist: $IN") unless -e $IN;
die("ERROR - fast_align binary does not exist: $BIN") unless -e $BIN;

# -save-model takes a value; an empty/false value or the word 'no' disables it
$SAVE_MODEL = defined($SAVE_MODEL) && $SAVE_MODEL && $SAVE_MODEL ne 'no';

chomp(my $line_count = `cat $IN | wc -l`);

# not more than the maximal number of lines -> just run it regularly
# (this includes a corpus of exactly MAX_LINES lines, which fits one part)
if ($line_count <= $MAX_LINES) {
  my $cmd = "$BIN -i $IN $SETTINGS";
  $cmd .= " -r" if defined($REVERSE);
  $cmd .= " -p $OUT.parameters 2> $OUT.log" if $SAVE_MODEL;
  $cmd .= " >$OUT";
  safesystem($cmd) or die;
  exit(0);
}

safesystem("mkdir -p $TMP") or die;

# Leftover parts from an earlier, aborted run would be picked up by the
# "prepared-*" / "aligned-??" globs below and end up in the joined output,
# so clear them before splitting.
safesystem("rm -f $TMP/prepared-* $TMP/aligned-*") or die;

# split input
safesystem("split -a 2 -l $MAX_LINES $IN $TMP/prepared-") or die;

# process
my @INPUT_FILES = `ls $TMP/prepared-*`;
chomp(@INPUT_FILES);
foreach my $input_file (@INPUT_FILES) {
  # create output file name (parts carry the two-character suffix from split)
  die("ERROR") unless $input_file =~ /prepared-(..)$/;
  my $part = $1;
  my $output_file = "$TMP/aligned-$part";

  # process part
  my $cmd = "$BIN -i $input_file $SETTINGS";
  $cmd .= " -r" if defined($REVERSE);
  $cmd .= " -p $output_file.parameters 2> $output_file.log" if $SAVE_MODEL;
  $cmd .= " >$output_file";
  safesystem($cmd) or die;
  die("ERROR: no output produced from command $cmd") unless -e $output_file;

  # check line count: every input line must have produced one output line
  chomp(my $input_line_count = `cat $input_file | wc -l`);
  chomp(my $output_line_count = `cat $output_file | wc -l`);
  die("ERROR: mismatched number of lines in part $part\n\t$input_line_count\t$input_file\n\t$output_line_count\t$output_file\n")
    unless $input_line_count == $output_line_count;
}

# join output (the shell expands aligned-?? in sorted order, i.e. part order)
safesystem("cat $TMP/aligned-?? > $OUT") or die;

# join model and log
if ($SAVE_MODEL) {
  my $part_count = scalar @INPUT_FILES;
  join_model($part_count);
  join_log($part_count);
}

# clean up; best effort, a failure here does not invalidate the output
safesystem("rm $TMP/* ; rmdir $TMP");
# Merge the per-part parameter files $TMP/aligned-*.parameters into
# $OUT.parameters.
#
# Each input line is read as "F E SCORE" (whitespace separated).  Lines are
# sorted so that all lines for the same (F,E) pair are adjacent; for each pair
# one line "F E LOG" is written, where LOG = log( sum(exp(SCORE)) / $count ).
# A pair that is missing from some parts therefore contributes nothing for
# those parts.
#
# Parameters:
#   $count - number the summed exp(SCORE) values are divided by (the caller
#            passes the number of parts)
# Dies if the sort pipe or the output file cannot be opened or closed cleanly.
sub join_model {
  my ($count) = @_;

  my $sort_cmd = "cat $TMP/aligned-*.parameters | LC_ALL=C sort -T $TMP -S 10%";
  open(my $concat, '-|', $sort_cmd)
    or die("ERROR: cannot run '$sort_cmd': $!");
  open(my $joined, '>', "$OUT.parameters")
    or die("ERROR: cannot write $OUT.parameters: $!");

  my ($last_f,$last_e,$merged_score);
  while (my $line = <$concat>) {
    my ($f,$e,$score) = split(' ', $line);
    # blank or truncated lines carry no score; do not count them as score 0
    next unless defined($score);

    if (!defined($last_f) || $f ne $last_f || $e ne $last_e) {
      # a new pair starts: flush the previous one
      # NOTE(review): log() is fatal for 0, so this dies if every exp() of a
      # pair underflowed to 0 -- confirm such scores cannot occur
      printf {$joined} "%s %s %f\n",$last_f,$last_e,log($merged_score)
        if defined($last_f);
      ($last_f,$last_e,$merged_score) = ($f,$e,0);
    }
    $merged_score += exp($score)/$count;
  }
  # flush the final pair; nothing to write if there was no input at all
  printf {$joined} "%s %s %f\n",$last_f,$last_e,log($merged_score)
    if defined($last_f);

  # close() on a pipe reports a failed command via its return value and $?
  close($concat)
    or die("ERROR: '$sort_cmd' failed (status $?): $!");
  close($joined)
    or die("ERROR: cannot close $OUT.parameters: $!");
}
# Write one merged entry "F E LOG" to the JOINED file handle, where
# LOG = log( sum(exp(score)) / $count ) over all scores passed in.
# The caller must have JOINED open for writing.
#
# Parameters:
#   $count  - number the summed exp(score) values are divided by
#   $f, $e  - the two tokens the entry belongs to
#   @SCORE  - scores to merge
sub merge_entry {
  my ($count,$f,$e,@SCORE) = @_;

  # accumulate in the same order as the scores were given
  my $total = 0;
  $total += exp($_)/$count for @SCORE;

  my $merged = log($total);
  print JOINED join(' ', $f, $e, $merged) . "\n";
}
# Summarise the per-part log files $TMP/aligned-*.log into $OUT.log.
#
# Sums the values of all "expected target length = source length * X" lines
# and of every third "final tension: X" line, divides both sums by $count and
# writes one line for each to $OUT.log.
#
# Parameters:
#   $count - divisor for both sums (the caller passes the number of parts)
# Dies if the log files cannot be read or $OUT.log cannot be written.
sub join_log {
  my ($count) = @_;

  my $cat_cmd = "cat $TMP/aligned-*.log";
  open(my $concat, '-|', $cat_cmd)
    or die("ERROR: cannot run '$cat_cmd': $!");

  my ($length,$tension,$tension_count) = (0,0,0);
  while (my $line = <$concat>) {
    if ($line =~ /expected target length = source length \* ([\d\.]+)/) {
      $length += $1;
    }
    # only every third "final tension" line is used
    # NOTE(review): presumably each run reports the tension three times and
    # the third is the final value -- confirm against fast_align's log format
    if ($line =~ /final tension: ([\d\.]+)/ and (++$tension_count % 3 == 0)) {
      $tension += $1;
    }
  }
  # close() on a pipe reports a failed command via its return value and $?
  close($concat)
    or die("ERROR: '$cat_cmd' failed (status $?): $!");

  $length /= $count;
  $tension /= $count;

  open(my $joined, '>', "$OUT.log")
    or die("ERROR: cannot write $OUT.log: $!");
  print {$joined} "expected target length = source length * $length\n";
  print {$joined} " final tension: $tension\n";
  close($joined)
    or die("ERROR: cannot close $OUT.log: $!");
}
# Run an external command and report on its outcome.
#
# The arguments are handed to system() unchanged and the command is echoed to
# STDERR before it runs.
#
# Returns true if the command exited with status 0, false otherwise (the
# non-zero exit code is printed to STDERR).
# Terminates the whole script with exit status 1 if the command could not be
# started or was killed by a signal.
sub safesystem {
  print STDERR "Executing: @_\n";
  system(@_);
  if ($? == -1) {
    # system() itself failed; $! holds the reason
    print STDERR "Failed to execute: @_\n $!\n";
    exit(1);
  }
  elsif ($? & 127) {
    # low 7 bits: signal number; bit 128: core dump flag.
    # The command goes in as a %s argument, not into the format string, so a
    # '%' inside the command cannot be mistaken for a conversion.
    printf STDERR "Execution of: %s\n died with signal %d, %s coredump\n",
      "@_", ($? & 127), ($? & 128) ? 'with' : 'without';
    exit(1);
  }
  else {
    my $exitcode = $? >> 8;
    print STDERR "Exit code: $exitcode\n" if $exitcode;
    return ! $exitcode;
  }
}
# end of fast-align-in-parts.perl