|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
use warnings; |
|
use strict; |
|
use Getopt::Long; |
|
use CGI; |
|
use JSON; |
|
use HTTP::Request; |
|
use HTTP::Headers; |
|
use LWP::UserAgent; |
|
use Data::Dumper; |
|
|
|
my $shucks = 0; |
|
sub catch_zap { |
|
my $signame = shift; |
|
$shucks++; |
|
print STDERR "Somebody sent me a SIG$signame, will exit.\n"; |
|
} |
|
$SIG{INT} = \&catch_zap; |
|
|
|
|
|
my $srclang = "en"; |
|
my $tgtlang = "cs"; |
|
my $batchlimit = 560; |
|
my $skip_long_sentences = 0; |
|
|
|
my $sleep = 5; |
|
my $inpad = " My_SpE. "; |
|
my $outpad = " ?My_SpE. ?"; |
|
my $verbose = 0; |
|
my $require_fullstop_to_join = 1; |
|
|
|
|
|
binmode(STDIN, ":utf8"); |
|
binmode(STDOUT, ":utf8"); |
|
binmode(STDERR, ":utf8"); |
|
|
|
GetOptions( |
|
"srclang=s" => \$srclang, |
|
"tgtlang=s" => \$tgtlang, |
|
"sleep=s" => \$sleep, |
|
"inpad=s" => \$inpad, |
|
"outpad=s" => \$outpad, |
|
"skip-long-sentences" => \$skip_long_sentences, |
|
"require-fullstop-to-join" => \$require_fullstop_to_join, |
|
) or exit 1; |
|
|
|
my @fnames = @ARGV; |
|
|
|
sub microsleep { |
|
select(undef,undef,undef,$_[0]); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
if (scalar @fnames == 0) { |
|
print STDERR "Suggested usage:\n"; |
|
print STDERR " nohup ./get_many_translations.pl infile1 outfile1\n"; |
|
print STDERR " [infile2.gz outfile2.gz ...] > log &\n"; |
|
print STDERR "Use ctrl-C to interrupt at any time.\n"; |
|
print STDERR "Restart with the same input and output files to continue.\n"; |
|
exit 1; |
|
} |
|
|
|
my $skipped = 0; |
|
while (0 < scalar @fnames) { |
|
last if $shucks; |
|
my $infile = shift @fnames; |
|
my $outfile = shift @fnames; |
|
collect_translations($infile, $outfile); |
|
|
|
if ($outfile =~ /\.(gz|bz2)$/) { |
|
print STDERR "Recompressing $outfile\n"; |
|
*INF = my_open($outfile); |
|
my @lines = <INF>; |
|
close INF; |
|
rename $outfile, $outfile."~tmpbkup" |
|
or die "Failed to backup $outfile before finalizing."; |
|
*OUTF = my_append($outfile); |
|
print OUTF $_ while ($_ = shift @lines); |
|
close OUTF; |
|
unlink $outfile."~tmpbkup"; |
|
} |
|
} |
|
print STDERR "Done. Skipped $skipped sentences.\n"; |
|
|
|
sub collect_translations { |
|
my $infile = shift; |
|
my $outfile = shift; |
|
|
|
while (1) { |
|
last if $shucks; |
|
|
|
my $gotlines = wcl($outfile); |
|
print STDERR "$outfile contains $gotlines lines already, extending.\n"; |
|
|
|
my $nr = 0; |
|
my @inlines = (); |
|
my $droplast = 0; |
|
*INF = my_open($infile); |
|
while (<INF>) { |
|
$nr++; |
|
if (length(join($inpad, @inlines)) > $batchlimit) { |
|
|
|
$droplast = 1; |
|
last; |
|
} |
|
|
|
if ($nr > $gotlines) { |
|
chomp; |
|
push @inlines, $_; |
|
} |
|
|
|
last if 0 < scalar(@inlines) |
|
&& $inlines[-1] !~ /\.\s*$/ && $require_fullstop_to_join; |
|
} |
|
if (length(join($inpad, @inlines)) > $batchlimit) { |
|
|
|
$droplast = 1; |
|
} |
|
close INF; |
|
if (0 == scalar @inlines) { |
|
print STDERR "No more input lines in $infile.\n"; |
|
return; |
|
} |
|
if ($droplast) { |
|
my $skippedtext = pop @inlines; |
|
|
|
|
|
if (0==scalar @inlines) { |
|
$nr--; |
|
if ($skip_long_sentences) { |
|
print STDERR "$infile:$nr:SKIPPING too long sentence: $skippedtext\n"; |
|
$skipped++; |
|
} else { |
|
die "$infile:$nr:Line exceeds Google batch limit!"; |
|
} |
|
} |
|
} |
|
|
|
my $outlines; |
|
|
|
if (0 == scalar @inlines) { |
|
|
|
$outlines = [""]; |
|
} else { |
|
$outlines = translate_batch(\@inlines); |
|
last if !defined $outlines; |
|
} |
|
|
|
*OUTF = my_append($outfile); |
|
foreach my $outline (@$outlines) { |
|
print OUTF $outline."\n"; |
|
} |
|
close OUTF; |
|
} |
|
} |
|
|
|
sub wcl { |
|
my $f = shift; |
|
my $gotlines = 0; |
|
if (-e $f) { |
|
*PEEKF = my_open($f); |
|
$gotlines ++ while (<PEEKF>); |
|
close PEEKF; |
|
} |
|
return $gotlines; |
|
} |
|
|
|
sub translate_batch { |
|
my $inlines = shift; |
|
my @outlines = (); |
|
|
|
my $responsestr = single_query(join($inpad, @$inlines)); |
|
my $response = from_json($responsestr, {utf8=>1}); |
|
|
|
my $translated_text = $response->{"responseData"}->{"translatedText"}; |
|
|
|
|
|
my $finblanks = 0; |
|
while ($translated_text =~ /$outpad$/) { |
|
$finblanks ++; |
|
$translated_text =~ s/$outpad$//; |
|
} |
|
|
|
my @outlines = split /$outpad/, $translated_text; |
|
push @outlines, ( map {""} (1..$finblanks) ); |
|
|
|
if (scalar @$inlines != scalar @outlines) { |
|
print STDERR "Input lines:\n"; |
|
map {print STDERR $_."\n"} @$inlines; |
|
print STDERR "\nOutput text:\n$translated_text\n\n"; |
|
print STDERR "Details:\n".Dumper($response)."\n\n"; |
|
print STDERR "Mismatched number of sentences! Expected ".(scalar @$inlines) |
|
." got ".(scalar @outlines)."\n"; |
|
return undef; |
|
} |
|
|
|
|
|
@outlines = map { s/"/"/g; s/'/'/g; |
|
s/</</g; s/>/>/g; |
|
s/&/&/g; |
|
$_ } @outlines; |
|
|
|
return \@outlines; |
|
} |
|
|
|
sub single_query { |
|
my $intext = shift; |
|
my $querytext = CGI::escape($intext); |
|
print STDERR "Req: $querytext\n" if $verbose; |
|
|
|
|
|
my $headers = HTTP::Headers->new; |
|
$headers->referer('http://ufal.mff.cuni.cz/~bojar/translate-czeng-by-google.html'); |
|
my $request = HTTP::Request->new("GET", |
|
"http://ajax.googleapis.com/ajax/services/language/translate?v=1.0&q=$querytext&langpair=$srclang%7C$tgtlang", |
|
$headers); |
|
my $ua = LWP::UserAgent->new; |
|
print STDERR "Requesting translation...\n" if $verbose; |
|
microsleep($sleep); |
|
my $response = $ua->request($request); |
|
if ($response->is_success) { |
|
my $text = $response->content(); |
|
return $text; |
|
} else { |
|
print STDERR "Req: $querytext\n"; |
|
die "Failed to get translations: ".$response->status_line; |
|
} |
|
} |
|
|
|
sub my_open { |
|
my $f = shift; |
|
die "Not found: $f" if ! -e $f; |
|
|
|
my $opn; |
|
my $hdl; |
|
my $ft = `file $f`; |
|
|
|
if ($f =~ /\.gz$/ || $ft =~ /gzip compressed data/) { |
|
$opn = "zcat $f |"; |
|
} elsif ($f =~ /\.bz2$/ || $ft =~ /bzip2 compressed data/) { |
|
$opn = "bzcat $f |"; |
|
} else { |
|
$opn = "$f"; |
|
} |
|
open $hdl, $opn or die "Can't open '$opn': $!"; |
|
binmode $hdl, ":utf8"; |
|
return $hdl; |
|
} |
|
|
|
sub my_append { |
|
my $f = shift; |
|
|
|
my $opn; |
|
my $hdl; |
|
|
|
if ($f =~ /\.gz$/) { |
|
$opn = "| gzip -c >> $f"; |
|
} elsif ($f =~ /\.bz2$/) { |
|
$opn = "| bzip2 >> $f"; |
|
} else { |
|
$opn = ">> $f"; |
|
} |
|
open $hdl, $opn or die "Can't append '$opn': $!"; |
|
binmode $hdl, ":utf8"; |
|
return $hdl; |
|
} |
|
|