use warnings; |
use strict; |
no warnings 'once'; |
use utf8; |
use Cwd ('abs_path'); |
use File::Spec::Functions; |
use File::Basename ('dirname'); |
use IPC::Run3; |
use Getopt::Long; |
use Test::More; |
# Command-line options: --results-dir is mandatory, --detokenizer is optional.
my ($detokenizer, $results_dir);
GetOptions(
    "detokenizer=s" => \$detokenizer,
    "results-dir=s" => \$results_dir,
) or exit 1;

# Every test writes its artifacts below the results directory, so it must be given.
if (!defined $results_dir) {
    print STDERR "Usage: run-test-detokenizer.t --results-dir <RESULTS-DIRECTORY> [--detokenizer <DETOKENIZER-SCRIPT>]\n";
    exit 1;
}
unless (-d $results_dir && -w $results_dir) {
    die "ERROR: Results directory $results_dir doesn't exist or is not a writable directory. Dying";
}

# Default detokenizer: scripts/tokenizer/detokenizer.perl under the parent of
# the directory that contains this script.
$detokenizer ||= catfile(dirname(dirname(abs_path($0))), "scripts", "tokenizer", "detokenizer.perl");
die "ERROR: Detokenizer script $detokenizer does not exist. Dying" unless -f $detokenizer;
# Registry of test cases; filled by addDetokenizerTest() below and consumed by
# the run loop that follows the definitions.
my @testCases = ();

# Each case gives: a test name, a language code (or undef to run the
# detokenizer without "-l"), the tokenized input, and the expected output.
# The here-doc terminators (TOK / EXP) must sit alone on their own line.

# A simple English test.
addDetokenizerTest("TEST_ENGLISH_EASY", "en",
<<'TOK'
This sentence is really simple , so it should not be hard to detokenize .
This one is no more difficult , but , hey , it is on a new line .
TOK
,
<<'EXP'
This sentence is really simple, so it should not be hard to detokenize.
This one is no more difficult, but, hey, it is on a new line.
EXP
);

# An English test involving double quotes.
addDetokenizerTest("TEST_ENGLISH_DOUBLEQUOTES", "en",
<<'TOK'
This is a somewhat " less simple " test .
TOK
,
<<'EXP'
This is a somewhat "less simple" test.
EXP
);

# A simple French test.
addDetokenizerTest("TEST_FRENCH_EASY", "fr",
<<'TOK'
Voici une phrase simple .
TOK
,
<<'EXP'
Voici une phrase simple.
EXP
);

# A French test involving an apostrophe.
addDetokenizerTest("TEST_FRENCH_APOSTROPHE", "fr",
<<'TOK'
Moi , j' ai une apostrophe .
TOK
,
<<'EXP'
Moi, j'ai une apostrophe.
EXP
);

# A French test with the apostrophe on the second-to-last word.
# NOTE(review): the call header for this case was missing from the file (only
# the here-docs and the closing ");" were present). The name and the "fr"
# language below are reconstructed -- confirm against the intended test list.
# The name must differ from every other case because it is used as the
# per-test output directory name.
addDetokenizerTest("TEST_FRENCH_APOSTROPHE_PENULTIMATE", "fr",
<<'TOK'
de musique rap issus de l' immigration
TOK
,
<<'EXP'
de musique rap issus de l'immigration
EXP
);

# A German test involving non-ASCII characters; no language is passed.
addDetokenizerTest("TEST_GERMAN_NONASCII", undef,
<<'TOK'
Ich hoffe , daß Sie schöne Ferien hatten .
Frau Präsidentin ! Frau Díez González und ich hatten einige Anfragen
TOK
,
<<'EXP'
Ich hoffe, daß Sie schöne Ferien hatten.
Frau Präsidentin! Frau Díez González und ich hatten einige Anfragen
EXP
);

# A simple Chinese test; no language is passed.
addDetokenizerTest("TEST_CHINESE_EASY", undef,
<<'TOK'
这 是 一个 简单 的的 汉语 句子 。
TOK
,
<<'EXP'
这是一个简单的的汉语句子。
EXP
);

# A simple Japanese test; no language is passed.
addDetokenizerTest("TEST_JAPANESE_EASY", undef,
<<'TOK'
どう しょ う か な 。
どこ で 食べ たい 。
TOK
,
<<'EXP'
どうしょうかな。
どこで食べたい。
EXP
);
# Each registered case produces exactly one Test::More result.
plan tests => scalar(@testCases);

# Run the cases in registration order.
for my $registeredCase (@testCases) {
    runDetokenizerTest($registeredCase);
}
# Creates a DetokenizerTestCase and appends it to the file-level @testCases list.
#
# Parameters:
#   $testName      - name of the test; also used as the name of the per-test
#                    output directory, so it should be unique
#   $language      - language code handed to the detokenizer via "-l", or
#                    undef to run the detokenizer without a language option
#   $tokenizedText - tokenized input text fed to the detokenizer
#   $rightAnswer   - detokenized text the detokenizer is expected to produce
#
# Returns the newly created test case object, so the caller can adjust it
# further (for example via setExpectedToFail()).
sub addDetokenizerTest {
    my ($testName, $language, $tokenizedText, $rightAnswer) = @_;

    # Explicit Class->new(...) call: the indirect-object form "new Class(...)"
    # relies on parser heuristics and is fragile when the package is defined
    # later in the file, as it is here.
    my $testCase = DetokenizerTestCase->new($testName, $language, $tokenizedText, $rightAnswer);

    push(@testCases, $testCase);
    return $testCase;
}
# Runs one detokenizer test case.
#
# Creates a directory named after the test below $results_dir, writes the
# tokenized input ("input.txt") and the expected output ("expected.txt") into
# it, runs the detokenizer with the input on stdin, and compares the resulting
# "stdout.txt" against "expected.txt".
#
# Parameters:
#   $testCase - a DetokenizerTestCase object
#
# Emits exactly one Test::More result: a failure if the directory or either
# fixture file cannot be created, otherwise whatever runTest() reports.
# Returns the value of that result.
sub runDetokenizerTest {
    my ($testCase) = @_;

    my $testName = $testCase->getName();
    my $testOutputDir = catfile($results_dir, $testName);
    my $tokenizedFile = catfile($testOutputDir, "input.txt");
    my $expectedFile = catfile($testOutputDir, "expected.txt");

    # mkdir also fails when the directory already exists, e.g. when the same
    # results directory is reused for a second run.
    unless (mkdir($testOutputDir)) {
        return fail($testName.": Failed to create output directory ".$testOutputDir." [".$!."]");
    }

    # Write both fixture files; a fixture that cannot be written is reported
    # as a test failure instead of being silently ignored.
    my @fixtures = (
        [$tokenizedFile, $testCase->getTokenizedText()],
        [$expectedFile, $testCase->getRightAnswer()],
    );
    foreach my $fixture (@fixtures) {
        my ($path, $text) = @{$fixture};
        my $problem = _writeUtf8File($path, $text);
        return fail($testName.": ".$problem) if defined $problem;
    }

    return runTest($testName, $testOutputDir, $tokenizedFile,
        sub {
            # Only pass "-l <language>" when the test case names a language.
            my $language = $testCase->getLanguage();
            return defined($language) ? [$detokenizer, "-l", $language] : [$detokenizer];
        },
        sub {
            verifyIdentical($testName, $expectedFile, catfile($testOutputDir, "stdout.txt"));
        },
        1, $testCase->getFailureExplanation());
}

# Writes $text to the file $path through a ":utf8" output layer.
# Returns undef on success, or a description of what went wrong.
sub _writeUtf8File {
    my ($path, $text) = @_;

    open(my $handle, ">:utf8", $path)
        or return "Can't open ".$path." for writing [".$!."]";
    print {$handle} $text
        or return "Can't write to ".$path." [".$!."]";
    # Buffered write errors only surface when the handle is closed.
    close($handle)
        or return "Can't close ".$path." [".$!."]";

    return;
}
# Runs a command and, if it succeeds, validates its output.
#
# Parameters:
#   $testName                     - name used as a prefix in failure messages
#   $outputDir                    - directory that receives the captured output
#   $stdinFile                    - file supplied to the command as standard input
#   $buildCommandRoutineReference - code reference returning the command as an
#                                   array reference
#   $validationRoutineReference   - code reference invoked after the command
#                                   finished with a zero status
#   $separateStdoutFromStderr     - true: capture into "stdout.txt" and
#                                   "stderr.txt"; false: capture both streams
#                                   into "stdout-and-stderr.txt"
#   $failureExplanation           - if defined, validation runs as a
#                                   Test::More TODO block with this reason
#
# Returns the result of fail() when the command did not finish with status 0,
# otherwise the result of the validation routine.
sub runTest {
    my ($testName, $outputDir, $stdinFile, $buildCommandRoutineReference, $validationRoutineReference, $separateStdoutFromStderr, $failureExplanation) = @_;

    my ($stdoutFile, $stderrFile);
    if ($separateStdoutFromStderr) {
        $stdoutFile = catfile($outputDir, "stdout.txt");
        $stderrFile = catfile($outputDir, "stderr.txt");
    } else {
        $stdoutFile = catfile($outputDir, "stdout-and-stderr.txt");
        $stderrFile = $stdoutFile;
    }

    my $commandRef = $buildCommandRoutineReference->();
    my $waitStatus = runVerbosely($commandRef, $stdinFile, $stdoutFile, $stderrFile);

    # runVerbosely() hands back the raw wait status ($?), not the exit code:
    # the exit code lives in the high byte and a terminating signal number in
    # the low seven bits. Decode it so the message shows the real exit code.
    if ($waitStatus != 0) {
        my $outcome;
        if ($waitStatus == -1) {
            $outcome = "could not be executed";
        } elsif ($waitStatus & 127) {
            $outcome = "was killed by signal ".($waitStatus & 127);
        } else {
            $outcome = "exited with status ".($waitStatus >> 8);
        }
        return fail($testName.": command ".$outcome);
    }

    my $verdict;
    if (defined $failureExplanation) {
        # Known-failing test: run the validation as a TODO test so that a
        # failure is reported without failing the whole run.
        TODO: {
            local $TODO = $failureExplanation;
            $verdict = $validationRoutineReference->();
        }
    } else {
        $verdict = $validationRoutineReference->();
    }
    return $verdict;
}
# Runs a command through run3() after announcing, via note(), the command
# line and every redirection that is defined.
#
# Parameters:
#   $commandRef - array reference holding the command and its arguments
#   $stdinFile  - standard-input source passed to run3(); may be undef
#   $stdoutFile - standard-output destination passed to run3(); may be undef
#   $stderrFile - standard-error destination passed to run3(); may be undef
#
# Returns the value of $? after run3() has returned.
sub runVerbosely {
    my ($commandRef, $stdinFile, $stdoutFile, $stderrFile) = @_;

    note("Executing command:\n @{$commandRef}\n");

    # Announce only the redirections that were actually requested.
    my @redirections = (
        ["standard input coming from: ", $stdinFile],
        ["standard output going to: ", $stdoutFile],
        ["standard error going to: ", $stderrFile],
    );
    foreach my $redirection (@redirections) {
        my ($label, $target) = @{$redirection};
        next unless defined $target;
        note($label.$target);
    }

    run3($commandRef, $stdinFile, $stdoutFile, $stderrFile);
    return $?;
}
# Compares two files line by line and reports the outcome as one Test::More
# result.
#
# Parameters:
#   $testName      - name used as a prefix in the test description
#   $referenceFile - path of the file holding the expected content
#   $outputFile    - path of the file holding the actual content
#
# Returns the result of fail() if either file cannot be opened, otherwise the
# result of is_deeply() on the two lists of lines.
sub verifyIdentical {
    my ($testName, $referenceFile, $outputFile) = @_;

    # Three-argument open takes the file name literally (no mode characters
    # or surrounding whitespace are interpreted), and lexical handles are
    # closed automatically on the early-return paths.
    open(my $referenceHandle, "<", $referenceFile)
        or return fail($testName.": Can't open reference file ".$referenceFile." [".$!."].");
    open(my $outputHandle, "<", $outputFile)
        or return fail($testName.": Can't open output file ".$outputFile." [".$!."].");

    my @referenceFileAsArray = <$referenceHandle>;
    my @outputFileAsArray = <$outputHandle>;
    close($referenceHandle);
    close($outputHandle);

    return is_deeply(\@outputFileAsArray, \@referenceFileAsArray, $testName.": Output file ".$outputFile." matches reference file ".$referenceFile.".");
}
# Plain data holder describing one detokenizer test: its name, language,
# tokenized input, expected output and an optional "expected to fail" reason.
package DetokenizerTestCase;

# Constructor.
#
# Parameters (positional): test name, language, tokenized text, right answer.
# The failure explanation starts out undefined.
# Returns the new blessed object.
sub new {
    my ($class, $name, $language, $tokenizedText, $rightAnswer) = @_;
    my %fields = (
        _name => $name,
        _language => $language,
        _tokenizedText => $tokenizedText,
        _rightAnswer => $rightAnswer,
        _failureExplanation => undef,
    );
    return bless \%fields, $class;
}

# Returns the test name.
sub getName {
    my $self = shift;
    return $self->{_name};
}

# Returns the language given at construction (may be undef).
sub getLanguage {
    my $self = shift;
    return $self->{_language};
}

# Returns the tokenized input text.
sub getTokenizedText {
    my $self = shift;
    return $self->{_tokenizedText};
}

# Returns the expected detokenized text.
sub getRightAnswer {
    my $self = shift;
    return $self->{_rightAnswer};
}

# Marks the test as expected to fail.
#
# Parameters:
#   $failureExplanation - reason for the expected failure; a false value
#                         (undef, empty string, "0") selects a generic reason
#
# Returns the explanation that was stored.
sub setExpectedToFail {
    my $self = shift;
    my $failureExplanation = shift;
    $failureExplanation = "This test is expected to fail." unless $failureExplanation;
    return $self->{_failureExplanation} = $failureExplanation;
}

# Returns the stored failure explanation, or undef if the test has not been
# marked as expected to fail.
sub getFailureExplanation {
    my $self = shift;
    return $self->{_failureExplanation};
}