|
|
|
|
|
|
|
|
|
|
|
use warnings; |
|
use strict; |
|
use IPC::Open3; |
|
use File::Temp qw/tempdir/; |
|
use File::Path qw/rmtree/; |
|
use Getopt::Long "GetOptions"; |
|
use Symbol; |
|
|
|
# All standard streams carry UTF-8 text.
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

# Defaults; both can be overridden with --srilm and --tempdir.
my $SRILM = "/home/pkoehn/moses/srilm/bin/i686-m64";
my $TEMPDIR = "/tmp";

# Values filled in from the command line.
my ($TUNING,$LM,$NAME,$GROUP,$WEIGHTS,$CONTINUE);

# Parse the command line; on any option error print the usage line and stop.
# The usage line lists every option that is actually accepted below.
die("interpolate-lm.perl --tuning set --name out-lm --lm lm0,lm1,lm2,lm3 [--srilm srilm-dir --tempdir tempdir --group \"0,1 2,3\" --weights w0,w1,w2,w3 --continue]")
    unless GetOptions('tuning=s'  => \$TUNING,
                      'name=s'    => \$NAME,
                      'srilm=s'   => \$SRILM,
                      'tempdir=s' => \$TEMPDIR,
                      'continue'  => \$CONTINUE,
                      'group=s'   => \$GROUP,
                      'weights=s' => \$WEIGHTS,
                      'lm=s'      => \$LM);

# Check mandatory options and the SRILM installation.
die("ERROR: please specify output language model name --name") unless defined($NAME);
die("ERROR: please specify tuning set with --tuning") unless defined($TUNING);
die("ERROR: please specify language models with --lm") unless defined($LM);
die("ERROR: can't read $TUNING") unless -e $TUNING;
die("ERROR: did not find srilm dir") unless -e $SRILM;
die("ERROR: cannot run ngram") unless -x $SRILM."/ngram";

# --lm is a comma-separated list of language model files.
my @LM = split(/,/,$LM);
# An empty --lm value passes the defined() check above but names no model.
die("ERROR: please specify language models with --lm") unless @LM;

# --weights, when given, is a comma-separated list with one weight per model.
my @WEIGHT;
@WEIGHT = split(/,/,$WEIGHTS) if defined($WEIGHTS);
die("ERROR: different number of weights and language models: ".scalar(@WEIGHT)." vs. ".scalar(@LM))
    if defined($WEIGHTS) && scalar(@WEIGHT) != scalar(@LM);
|
|
|
|
|
# Determine the n-gram order shared by all language models by scanning the
# header of each file: the last "ngram N" line seen before the "1-grams"
# section gives that model's order. All models must agree.
my $order = 0;
foreach my $lm (@LM) {
    # $lm aliases the element of @LM, so switching to the compressed file
    # here also updates the name used for the later interpolation.
    $lm .= ".gz" if (! -e $lm && -e "$lm.gz");

    # Lexical handles and explicit open modes; the list-form pipe open runs
    # zcat without a shell, so unusual characters in the file name are safe.
    my $lm_fh;
    if ($lm =~ /gz$/) {
        open($lm_fh, "-|", "zcat", $lm) || die("ERROR: could not find language model file '$lm'");
    }
    else {
        open($lm_fh, "<", $lm) || die("ERROR: could not find language model file '$lm'");
    }

    my $lm_order;
    while (my $line = <$lm_fh>) {
        $lm_order = $1 if $line =~ /ngram\s+(\d+)/;
        last if $line =~ /1-grams/;
    }
    # The return value is ignored on purpose: zcat may be cut off mid-stream.
    close($lm_fh);

    # Without this check an unreadable or non-text model would continue with
    # an undefined order and only produce uninitialized-value warnings.
    die("ERROR: could not determine order of language model '$lm'") unless defined($lm_order);

    $order = $lm_order if $order == 0;
    die("ERROR: language models have different order") if $order != $lm_order;
}
print STDERR "language models have order $order.\n";
|
|
|
|
|
# When no --group was given and there are more than 10 language models,
# split the model indices 0..N-1 into consecutive, roughly equal-sized
# groups. $GROUP gets the same format as the --group option: indices joined
# by commas inside a group, groups separated by a single blank.
if (!defined($GROUP) && scalar(@LM) > 10) {
    print STDERR "more than 10, automatically grouping language models.\n";

    my $lm_count    = scalar(@LM);
    my $num_groups  = int($lm_count/10 + 0.99);
    my $size_groups = int($lm_count/$num_groups + 0.99);

    my @group_specs;
    foreach my $group_i (0 .. $num_groups-1) {
        # Index range of this group, clipped to the last existing model.
        my $first_i = $group_i * $size_groups;
        my $last_i  = $first_i + $size_groups - 1;
        $last_i = $lm_count - 1 if $last_i > $lm_count - 1;
        push @group_specs, join(",", $first_i .. $last_i);
    }
    $GROUP = join(" ", @group_specs);

    print STDERR "groups: $GROUP\n";
}
|
|
|
|
|
# Without grouping, interpolate all language models in one step and finish.
if (!defined($GROUP)) {
    interpolate($NAME,\@WEIGHT,@LM);
    exit;
}

# Grouped interpolation: build one intermediate model per group, then
# interpolate the intermediate models together with any ungrouped models.
# Every step below tunes its own weights, so user-supplied ones are unused.
print STDERR "WARNING: --weights is ignored when language models are grouped\n" if @WEIGHT;

my %ALREADY;   # indices of language models that belong to some group
my $g = 0;     # running group number, used to derive the group suffix
my @SUB_NAME;  # inputs of the final interpolation

# split(' ') tolerates leading, trailing and repeated blanks, which would
# otherwise yield empty groups.
foreach my $subgroup (split(' ',$GROUP)) {
    my @SUB_LM;
    foreach my $lm_i (split(/,/,$subgroup)) {
        # Reject anything that is not a plain index: a non-numeric id would
        # silently act as 0 and a negative one would count from the end.
        die("ERROR: LM id '$lm_i' in group definition is not a number") unless $lm_i =~ /^\d+$/;
        die("ERROR: LM id $lm_i in group definition out of range") if $lm_i >= scalar(@LM);
        push @SUB_LM,$LM[$lm_i];
        $ALREADY{$lm_i} = 1;
    }
    die("ERROR: empty group in group definition '$GROUP'") unless @SUB_LM;

    # Intermediate models are named <name>.group-a, <name>.group-b, ...
    my $name = $NAME.".group-".chr(97+($g++));
    push @SUB_NAME,$name;
    print STDERR "\n=== BUILDING SUB LM $name from\n\t".join("\n\t",@SUB_LM)."\n===\n\n";
    # With --continue an already existing intermediate model is reused.
    interpolate($name, undef, @SUB_LM) unless $CONTINUE && -e $name;
}

# Language models not mentioned in any group enter the final step directly.
for(my $lm_i=0; $lm_i < scalar(@LM); $lm_i++) {
    next if defined($ALREADY{$lm_i});
    push @SUB_NAME, $LM[$lm_i];
}
print STDERR "\n=== BUILDING FINAL LM ===\n\n";
interpolate($NAME, undef, @SUB_NAME);
|
|
|
|
|
# Interpolate several language models into one model written to $name.
#
# Arguments:
#   $name   - file name of the interpolated language model to write
#   $WEIGHT - array ref of interpolation weights, or undef. It is used only
#             when it holds exactly one weight per model; otherwise the
#             weights are tuned on $TUNING with compute-best-mix.
#   @LM     - language model files to interpolate (at most 10)
#
# Uses the file-level settings $SRILM, $TEMPDIR, $TUNING and $order.
# Dies when more than 10 models are passed or when any step fails.
sub interpolate {
    my ($name,$WEIGHT,@LM) = @_;

    die("cannot interpolate more than 10 language models at once: ",join(",",@LM))
        if scalar(@LM) > 10;

    # CLEANUP makes sure the scratch directory is also removed when a step
    # below dies or exits; the rmtree at the end still frees it right away
    # on success.
    my $tmp = tempdir(DIR=>$TEMPDIR, CLEANUP=>1);
    my @LAMBDA;

    # Use the given weights if there is one per language model ...
    if (defined($WEIGHT) && scalar(@$WEIGHT) == scalar(@LM)) {
        @LAMBDA = @$WEIGHT;
    }
    # ... otherwise tune them on the tuning set.
    else {
        # Per-word perplexity output of every model on the tuning set.
        my $lm_i = 0;
        foreach my $lm (@LM) {
            print STDERR "compute perplexity for $lm\n";
            safesystem("$SRILM/ngram -unk -order $order -lm $lm -ppl $TUNING -debug 2 > $tmp/iplm.$$.$lm_i") or die "Failed to compute perplexity for $lm\n";
            print STDERR `tail -n 2 $tmp/iplm.$$.$lm_i`;
            $lm_i++;
        }

        # compute-best-mix reads all perplexity files and reports the weights.
        print STDERR "computing lambdas...\n";
        my $mix_cmd = "$SRILM/compute-best-mix";
        for(my $j=0;$j<scalar(@LM);$j++) {
            $mix_cmd .= " $tmp/iplm.$$.$j";
        }
        my ($mixout, $mixerr, $mixexitcode) = saferun3($mix_cmd);
        die "Failed to mix models: $mixerr" if $mixexitcode != 0;
        my $mix = $mixout;
        `rm $tmp/iplm.$$.*`;
        $mix =~ /best lambda \(([\d\. e-]+)\)/ || die("ERROR: computing lambdas failed: $mix");
        # split(' ') ignores leading and repeated blanks between the numbers.
        @LAMBDA = split(' ',$1);
        # A wrong count would put undefined weights into the command below.
        die("ERROR: expected ".scalar(@LM)." lambdas but got ".scalar(@LAMBDA).": $mix")
            if scalar(@LAMBDA) != scalar(@LM);
    }

    # Build the ngram call: the first model gets -lm/-lambda, the second
    # -mix-lm without a weight of its own, and every further model i gets
    # -mix-lm<i>/-mix-lambda<i>.
    print STDERR "creating new language model...\n";
    my $i = 0;
    my $cmd = "$SRILM/ngram -unk -order $order -write-lm $name";
    foreach my $lm (@LM) {
        $cmd .= " -lm " if $i==0;
        $cmd .= " -mix-lm " if $i==1;
        $cmd .= " -mix-lm$i " if $i>1;
        $cmd .= $lm;
        $cmd .= " -lambda " if $i==0;
        $cmd .= " -mix-lambda$i " if $i>1;
        $cmd .= $LAMBDA[$i] if $i!=1;
        $i++;
    }
    safesystem($cmd) or die "Failed.";

    rmtree($tmp);
    print STDERR "done.\n";
}
|
|
|
# Run a command with system() and report on STDERR how it ended.
#
# Arguments: the command, in any form accepted by system().
# Returns a true value when the command exited with status 0, and a false
# value for any other exit status (which is also logged).
# Terminates the script with exit(1) when the command could not be started
# or was killed by a signal.
sub safesystem {
    print STDERR "Executing: @_\n";
    system(@_);
    if ($? == -1) {
        print STDERR "Failed to execute: @_\n $!\n";
        exit(1);
    }
    elsif ($? & 127) {
        # The command text is passed as a %s argument rather than being put
        # into the format string, so a '%' in the command is printed as is.
        printf STDERR "Execution of: %s\n died with signal %d, %s coredump\n",
            "@_", ($? & 127), ($? & 128) ? 'with' : 'without';
        exit(1);
    }
    else {
        my $exitcode = $? >> 8;
        print STDERR "Exit code: $exitcode\n" if $exitcode;
        return ! $exitcode;
    }
}
|
|
|
# Run a command with open3() and capture its standard output and standard
# error separately.
#
# Arguments: the command, in any form accepted by IPC::Open3::open3().
# Returns the list (stdout text, stderr text, exit code).
# Terminates the script with exit(1) when waiting for the command reports a
# failure or when the command was killed by a signal.
sub saferun3 {
    print STDERR "Executing: @_\n";
    require IO::Select;

    my $wtr = gensym();
    my $rdr = gensym();
    my $err = gensym();
    my $pid = open3($wtr, $rdr, $err, @_);
    # The command gets no input.
    close($wtr);

    # Read both pipes as data arrives. Draining stdout to its end before
    # touching stderr blocks forever once the command has filled the stderr
    # pipe while still holding stdout open.
    my $gotout = "";
    my $goterr = "";
    my %sink = ( fileno($rdr) => \$gotout, fileno($err) => \$goterr );
    my $pending = IO::Select->new($rdr, $err);
    while ($pending->count) {
        foreach my $fh ($pending->can_read) {
            my $got = sysread($fh, my $chunk, 65536);
            if (!defined $got) {
                next if $!{EINTR};   # interrupted by a signal, just retry
                die("ERROR: reading output of '@_' failed: $!");
            }
            if ($got == 0) {         # end of this stream
                $pending->remove($fh);
                close($fh);
                next;
            }
            ${ $sink{fileno($fh)} } .= $chunk;
        }
    }

    waitpid($pid, 0);
    if ($? == -1) {
        print STDERR "Failed to execute: @_\n $!\n";
        exit(1);
    }
    elsif ($? & 127) {
        # The command text is passed as a %s argument rather than being put
        # into the format string, so a '%' in the command is printed as is.
        printf STDERR "Execution of: %s\n died with signal %d, %s coredump\n",
            "@_", ($? & 127), ($? & 128) ? 'with' : 'without';
        exit(1);
    }
    else {
        my $exitcode = $? >> 8;
        print STDERR "Exit code: $exitcode\n" if $exitcode;
        return ( $gotout, $goterr, $exitcode );
    }
}
|
|