|
|
|
|
|
|
|
|
|
|
|
|
|
use warnings; |
|
use strict; |
|
use Getopt::Long "GetOptions"; |
|
|
|
my $_CORPUS; |
|
my $_OUTPUT = "generation"; |
|
my $_GENERATION_FACTORS; |
|
|
|
die "specify options" unless &GetOptions('corpus=s' => \$_CORPUS, |
|
'output=s' => \$_OUTPUT, |
|
'generation-factors=s' => \$_GENERATION_FACTORS); |
|
|
|
|
|
die "Please use --corpus to specify the factored input corpus\n" unless $_CORPUS; |
|
|
|
if (! defined $_GENERATION_FACTORS) { |
|
die "Please use --generation-factors to set generation factors\n"; |
|
} |
|
|
|
my $___GENERATION_FACTORS = $_GENERATION_FACTORS || "0-0"; |
|
die("format for generation factors is \"0-1\" or \"0-1+0-2\" or \"0-1+0,1-1,2\", you provided $___GENERATION_FACTORS\n") |
|
if $___GENERATION_FACTORS !~ /^\d+(\,\d+)*\-\d+(\,\d+)*(\+\d+(\,\d+)*\-\d+(\,\d+)*)*$/; |
|
|
|
print "output=$_OUTPUT.<factor-map>\n"; |
|
|
|
get_generation_factored(); |
|
print "Done\n"; |
|
exit 0; |
|
|
|
sub get_generation_factored { |
|
print STDERR "(8) learn generation model @ ".`date`; |
|
foreach my $f (split(/\+/,$___GENERATION_FACTORS)) { |
|
my $factor = $f; |
|
my ($factor_e_source,$factor_e) = split(/\-/,$factor); |
|
&get_generation($factor, $factor_e_source, $factor_e); |
|
} |
|
} |
|
|
|
|
|
sub get_generation { |
|
my ($factor, $factor_e_source, $factor_e) = @_; |
|
|
|
print STDERR "(8) [$factor] generate generation table @ ".`date`; |
|
my (%WORD_TRANSLATION,%TOTAL_FOREIGN,%TOTAL_ENGLISH); |
|
|
|
my %INCLUDE_SOURCE; |
|
foreach my $factor (split(/,/,$factor_e_source)) { |
|
|
|
$INCLUDE_SOURCE{$factor} = 1; |
|
} |
|
my %INCLUDE; |
|
foreach my $factor (split(/,/,$factor_e)) { |
|
$INCLUDE{$factor} = 1; |
|
} |
|
|
|
my (%GENERATION,%GENERATION_TOTAL_SOURCE,%GENERATION_TOTAL_TARGET); |
|
open(E,$_CORPUS) or die "Can't read ".$_CORPUS; |
|
while(<E>) { |
|
chomp; |
|
foreach (split) { |
|
my @FACTOR = split(/\|/); |
|
|
|
my ($source,$target); |
|
my $first_factor = 1; |
|
foreach my $factor (split(/,/,$factor_e_source)) { |
|
$source .= "|" unless $first_factor; |
|
$first_factor = 0; |
|
$source .= $FACTOR[$factor]; |
|
} |
|
|
|
$first_factor = 1; |
|
foreach my $factor (split(/,/,$factor_e)) { |
|
$target .= "|" unless $first_factor; |
|
$first_factor = 0; |
|
$target .= $FACTOR[$factor]; |
|
} |
|
$GENERATION{$source}{$target}++; |
|
$GENERATION_TOTAL_SOURCE{$source}++; |
|
$GENERATION_TOTAL_TARGET{$target}++; |
|
} |
|
} |
|
close(E); |
|
|
|
open(GEN,">$_OUTPUT.$factor") or die "Can't write $_OUTPUT.$factor"; |
|
foreach my $source (keys %GENERATION) { |
|
foreach my $target (keys %{$GENERATION{$source}}) { |
|
printf GEN ("%s %s %.7f %.7f\n",$source,$target, |
|
$GENERATION{$source}{$target}/$GENERATION_TOTAL_SOURCE{$source}, |
|
$GENERATION{$source}{$target}/$GENERATION_TOTAL_TARGET{$target}); |
|
} |
|
} |
|
close(GEN); |
|
safesystem("rm -f $_OUTPUT.$factor.gz") or die; |
|
safesystem("gzip $_OUTPUT.$factor") or die; |
|
} |
|
|
|
sub safesystem { |
|
print STDERR "Executing: @_\n"; |
|
system(@_); |
|
if ($? == -1) { |
|
print STDERR "Failed to execute: @_\n $!\n"; |
|
exit(1); |
|
} |
|
elsif ($? & 127) { |
|
printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n", |
|
($? & 127), ($? & 128) ? 'with' : 'without'; |
|
} |
|
else { |
|
my $exitcode = $? >> 8; |
|
print STDERR "Exit code: $exitcode\n" if $exitcode; |
|
return ! $exitcode; |
|
} |
|
} |
|
|
|
|