|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
use warnings; |
|
use strict; |
|
use Getopt::Long qw(:config pass_through no_ignore_case permute); |
|
|
|
# Driver: run fast_align on a parallel corpus, splitting the corpus into parts
# of at most -max-lines lines when it is too large to align in one go, and
# concatenating the per-part alignments into the -o file afterwards.

# Command-line options. $SETTINGS is inserted verbatim into the fast_align
# command line, so it may hold several shell words.
my ($BIN,$IN,$OUT,$MAX_LINES,$SETTINGS,$REVERSE,$SAVE_MODEL,$TMP);

GetOptions('bin=s' => \$BIN,
           'i=s' => \$IN,
           'o=s' => \$OUT,
           'max-lines=i' => \$MAX_LINES,
           'settings=s' => \$SETTINGS,
           'save-model=s' => \$SAVE_MODEL,
           'r' => \$REVERSE,
           'tmp=s' => \$TMP,
          ) or exit(1);

die("ERROR - usage: fast-align-in-parts.perl -bin FAST_ALIGN_BIN -i PARALLEL_CORPUS -max-lines COUNT -settings CONFIG [-r] -tmp TMPDIR [-save-model MODEL] -o ALIGNMENTS")
  unless defined($BIN) && defined($IN) && defined($SETTINGS) && defined($TMP)
      && defined($MAX_LINES) && defined($OUT)
      && $MAX_LINES > 0;
die("ERROR - input file does not exist: $IN") unless -e $IN;
die("ERROR - fast_align binary does not exist: $BIN") unless -e $BIN;

# Reduce -save-model to a boolean: unset, empty, "0" and "no" all mean off.
$SAVE_MODEL = defined($SAVE_MODEL) && $SAVE_MODEL && $SAVE_MODEL ne 'no';

chomp(my $line_count = `cat $IN | wc -l`);
die("ERROR - could not count lines in $IN") unless $line_count =~ /^\s*\d+\s*$/;

# Corpus is smaller than one part: align it directly and stop.
if ($MAX_LINES > $line_count) {
  my $cmd = "$BIN -i $IN $SETTINGS >$OUT";
  $cmd .= " -r" if defined($REVERSE);
  $cmd .= " -p $OUT.parameters 2> $OUT.log" if $SAVE_MODEL;
  safesystem($cmd) or die("ERROR: command failed: $cmd");
  exit(0);
}

my $cmd = "mkdir -p $TMP";
safesystem($cmd) or die("ERROR: command failed: $cmd");

# Split the corpus into parts named prepared-aa, prepared-ab, ...
$cmd = "split -a 2 -l $MAX_LINES $IN $TMP/prepared-";
safesystem($cmd) or die("ERROR: command failed: $cmd");

# Collect the parts in name order, which is the order "cat aligned-??"
# below puts the outputs back together in.
my @INPUT_FILES = sort glob("$TMP/prepared-*");
die("ERROR: no parts found in $TMP") unless @INPUT_FILES;

foreach my $input_file (@INPUT_FILES) {
  die("ERROR: unexpected part file name: $input_file")
    unless $input_file =~ /prepared-(..)$/;
  # Copy the capture now; later statements must not rely on $1 surviving.
  my $part = $1;
  my $output_file = "$TMP/aligned-$part";

  # Align this part.
  my $align_cmd = "$BIN -i $input_file $SETTINGS";
  $align_cmd .= " -r" if defined($REVERSE);
  $align_cmd .= " -p $output_file.parameters 2> $output_file.log" if $SAVE_MODEL;
  $align_cmd .= " >$output_file";
  safesystem($align_cmd) or die("ERROR: command failed: $align_cmd");
  die("ERROR: no output produced from command $align_cmd") unless -e $output_file;

  # Every input line must have produced exactly one alignment line.
  chomp(my $input_line_count = `cat $input_file | wc -l`);
  chomp(my $output_line_count = `cat $output_file | wc -l`);
  die("ERROR: mismatched number of lines in part $part\n\t$input_line_count\t$input_file\n\t$output_line_count\t$output_file\n") unless $input_line_count == $output_line_count;
}

# Concatenate the per-part alignments.
$cmd = "cat $TMP/aligned-?? > $OUT";
safesystem($cmd) or die("ERROR: command failed: $cmd");

# Merge the per-part models and logs when a model was requested.
join_model(scalar @INPUT_FILES) if $SAVE_MODEL;
join_log(scalar @INPUT_FILES) if $SAVE_MODEL;

# Best-effort cleanup; a failure here does not affect the result.
$cmd = "rm $TMP/* ; rmdir $TMP";
safesystem($cmd);
|
|
|
# Merge the per-part parameter files into "$OUT.parameters".
#
# All $TMP/aligned-*.parameters files are piped through a C-locale sort so
# that lines sharing the same first two fields become adjacent. For each
# distinct pair one line "f e score" is written, where score is
#   log( sum(exp(score_i)) / $count )
# printed with "%f". A pair missing from some part simply contributes nothing
# for that part, since the divisor is always $count.
#
# The sum is accumulated relative to the largest score seen for the pair
# (log-sum-exp), so very negative scores cannot underflow to log(0).
# Lines with fewer than three fields are skipped, and empty input produces an
# empty output file instead of a fatal log(0).
#
# Parameters:
#   $count - divisor for the average; callers pass the number of parts
#
# Dies if $count is not positive, or if a handle cannot be opened or closed.
sub join_model {
  my ($count) = @_;
  die("ERROR: join_model needs a positive part count")
    unless defined($count) && $count > 0;

  # Single-string '-|' form still goes through the shell, which is needed
  # for the glob and the pipeline.
  open(my $concat, '-|', "cat $TMP/aligned-*.parameters | LC_ALL=C sort -T $TMP -S 10%")
    or die("ERROR: cannot start sort for $TMP/aligned-*.parameters: $!");
  open(my $joined, '>', "$OUT.parameters")
    or die("ERROR: cannot write $OUT.parameters: $!");

  my ($last_f, $last_e, $max_score, $scaled_sum);

  # Emit the pair accumulated so far, if there is one.
  my $flush = sub {
    return unless defined($last_f);
    printf $joined "%s %s %f\n", $last_f, $last_e,
      $max_score + log($scaled_sum / $count);
  };

  while (my $line = <$concat>) {
    my ($f, $e, $score) = split(' ', $line);
    next unless defined($score);
    if (!defined($last_f) || $f ne $last_f || $e ne $last_e) {
      $flush->();
      ($last_f, $last_e) = ($f, $e);
      ($max_score, $scaled_sum) = ($score, 0);
    }
    # Keep $scaled_sum == sum(exp(score_i - $max_score)); it is always >= 1
    # once the first score of a pair has been added.
    if ($score > $max_score) {
      $scaled_sum *= exp($max_score - $score);
      $max_score = $score;
    }
    $scaled_sum += exp($score - $max_score);
  }
  $flush->();

  close($concat)
    or die("ERROR: sorting $TMP/aligned-*.parameters failed (status $?): $!");
  close($joined)
    or die("ERROR: cannot finish writing $OUT.parameters: $!");
}
|
|
|
# Write one merged entry to the JOINED filehandle.
#
# Arguments: a divisor, two tokens, then any number of natural-log scores.
# The scores are exponentiated, each divided by the divisor, summed, and the
# log of that sum is printed after the two tokens as "f e score".
# The caller must have JOINED open for writing.
# NOTE(review): nothing in the visible source calls this sub.
sub merge_entry {
  my $count = shift;
  my $f = shift;
  my $e = shift;

  # The remaining arguments are the log scores.
  my $mean = 0;
  for my $log_value (@_) {
    $mean += exp($log_value) / $count;
  }

  my $merged = log($mean);
  print JOINED "$f $e $merged\n";
}
|
|
|
# Summarise the per-part logs into "$OUT.log".
#
# Scans every $TMP/aligned-*.log file for two kinds of lines:
#   "expected target length = source length * X" - every X is summed
#   "final tension: X"                           - every third match is summed
# Both sums are divided by $count and written as a two-line log.
# NOTE(review): taking only every third tension line presumably matches how
# often fast_align reports it per run - confirm against real logs.
#
# Parameters:
#   $count - divisor for the averages; callers pass the number of parts
#
# Dies if $count is not positive, or if a handle cannot be opened or closed.
sub join_log {
  my ($count) = @_;
  die("ERROR: join_log needs a positive part count")
    unless defined($count) && $count > 0;

  # Single-string '-|' form goes through the shell, which expands the glob.
  open(my $concat, '-|', "cat $TMP/aligned-*.log")
    or die("ERROR: cannot read $TMP/aligned-*.log: $!");

  my ($length,$tension,$tension_count) = (0,0,0);
  while (my $line = <$concat>) {
    $length += $1 if $line =~ /expected target length = source length \* ([\d\.]+)/;
    # The counter only advances on matching lines, exactly as before.
    $tension += $1 if $line =~ /final tension: ([\d\.]+)/ and (++$tension_count % 3 == 0);
  }
  close($concat)
    or die("ERROR: reading $TMP/aligned-*.log failed (status $?): $!");

  $length /= $count;
  $tension /= $count;

  open(my $joined, '>', "$OUT.log")
    or die("ERROR: cannot write $OUT.log: $!");
  print $joined "expected target length = source length * $length\n";
  print $joined " final tension: $tension\n";
  close($joined)
    or die("ERROR: cannot finish writing $OUT.log: $!");
}
|
|
|
# Run a command with system() and report the outcome on STDERR.
#
# Arguments are handed to system() unchanged, so both the single-string
# (shell) form and the list form work.
#
# Returns true when the command exited with status 0 and false when it exited
# with a non-zero status. Terminates the whole script with exit code 1 when
# the command could not be started or was killed by a signal.
sub safesystem {
  print STDERR "Executing: @_\n";
  system(@_);

  # Capture both right away so later statements cannot disturb them.
  my ($status, $os_error) = ($?, $!);

  if ($status == -1) {
    print STDERR "Failed to execute: @_\n $os_error\n";
    exit(1);
  }

  if ($status & 127) {
    # The command is passed as a %s argument, not interpolated into the
    # format, so a '%' inside it cannot corrupt the message.
    printf STDERR "Execution of: %s\n died with signal %d, %s coredump\n",
      "@_", ($status & 127), ($status & 128) ? 'with' : 'without';
    exit(1);
  }

  my $exitcode = $status >> 8;
  print STDERR "Exit code: $exitcode\n" if $exitcode;
  return ! $exitcode;
}
|
|
|
|