|
|
|
|
|
|
|
|
|
|
|
use warnings; |
|
use strict; |
|
|
|
no warnings 'once'; |
|
use utf8; |
|
|
|
use Cwd ('abs_path'); |
|
use File::Spec::Functions; |
|
use File::Basename ('dirname'); |
|
use IPC::Run3; |
|
use Getopt::Long; |
|
use Test::More; |
|
|
|
# Command-line handling.
#   --results-dir <dir>    required; must already exist and be writable. One
#                          sub-directory per test case is created inside it.
#   --detokenizer <file>   optional; the detokenizer script under test.
my ($detokenizer, $results_dir);
GetOptions(
    "detokenizer=s" => \$detokenizer,
    "results-dir=s" => \$results_dir,
) or exit 1;

if (!defined $results_dir) {
    print STDERR "Usage: run-test-detokenizer.t --results-dir <RESULTS-DIRECTORY> [--detokenizer <DETOKENIZER-SCRIPT>]\n";
    exit 1;
}

if (!(-d $results_dir) || !(-w $results_dir)) {
    die "ERROR: Results directory ".$results_dir." doesn't exist or is not a writable directory. Dying";
}

# When no detokenizer was given, fall back to scripts/tokenizer/detokenizer.perl
# under the parent of the directory that contains this test script.
if (!$detokenizer) {
    my $scriptDir = dirname(abs_path($0));
    my $parentDir = dirname($scriptDir);
    $detokenizer = catfile($parentDir, "scripts", "tokenizer", "detokenizer.perl");
}

if (!(-f $detokenizer)) {
    die "ERROR: Detokenizer script ".$detokenizer." does not exist. Dying";
}
|
|
|
|
|
# All registered test cases, in the order they will be run.
my @testCases = ();

# Test case registrations.
#
# Each call passes: test name, language code (or undef to run the detokenizer
# without a "-l" option), the tokenized input, and the expected detokenized
# output. Both texts are given as non-interpolating heredocs; the first
# heredoc body belongs to <<'TOK' and the second to <<'EXP'.
#
# The subroutine is called with plain call syntax rather than the "&name(...)"
# form.

addDetokenizerTest("TEST_ENGLISH_EASY", "en", <<'TOK', <<'EXP');
This sentence is really simple , so it should not be hard to detokenize .
This one is no more difficult , but , hey , it is on a new line .
TOK
This sentence is really simple, so it should not be hard to detokenize.
This one is no more difficult, but, hey, it is on a new line.
EXP

addDetokenizerTest("TEST_ENGLISH_DOUBLEQUOTES", "en", <<'TOK', <<'EXP');
This is a somewhat " less simple " test .
TOK
This is a somewhat "less simple" test.
EXP

addDetokenizerTest("TEST_FRENCH_EASY", "fr", <<'TOK', <<'EXP');
Voici une phrase simple .
TOK
Voici une phrase simple.
EXP

addDetokenizerTest("TEST_FRENCH_APOSTROPHE", "fr", <<'TOK', <<'EXP');
Moi , j' ai une apostrophe .
TOK
Moi, j'ai une apostrophe.
EXP

addDetokenizerTest("TEST_FRENCH_APOSTROPHE_PENULTIMATE", "fr", <<'TOK', <<'EXP');
de musique rap issus de l' immigration
TOK
de musique rap issus de l'immigration
EXP

# No language is passed for the remaining cases, so the detokenizer runs
# without a "-l" option.
addDetokenizerTest("TEST_GERMAN_NONASCII", undef, <<'TOK', <<'EXP');
Ich hoffe , daß Sie schöne Ferien hatten .
Frau Präsidentin ! Frau Díez González und ich hatten einige Anfragen
TOK
Ich hoffe, daß Sie schöne Ferien hatten.
Frau Präsidentin! Frau Díez González und ich hatten einige Anfragen
EXP

addDetokenizerTest("TEST_CHINESE_EASY", undef, <<'TOK', <<'EXP');
这 是 一个 简单 的的 汉语 句子 。
TOK
这是一个简单的的汉语句子。
EXP

addDetokenizerTest("TEST_JAPANESE_EASY", undef, <<'TOK', <<'EXP');
どう しょ う か な 。
どこ で 食べ たい 。
TOK
どうしょうかな。
どこで食べたい。
EXP
|
|
|
|
|
|
|
|
|
|
|
|
|
# Each registered test case is planned as exactly one test.
plan(tests => scalar(@testCases));

# Run the cases in registration order.
for my $caseIndex (0 .. $#testCases) {
    runDetokenizerTest($testCases[$caseIndex]);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
# Creates a DetokenizerTestCase and appends it to @testCases.
#
# Parameters:
#   $testName      - name of the test; also used as its output directory name
#   $language      - language code passed to the detokenizer via "-l", or undef
#   $tokenizedText - tokenized text fed to the detokenizer
#   $rightAnswer   - expected detokenized output
#
# Returns the newly created test case object, so that the caller can adjust it
# further (for example with setExpectedToFail).
sub addDetokenizerTest {
    my ($testName, $language, $tokenizedText, $rightAnswer) = @_;

    # Explicit class-method call instead of the indirect-object form
    # "new DetokenizerTestCase(...)", whose parsing is ambiguous.
    my $testCase = DetokenizerTestCase->new($testName, $language, $tokenizedText, $rightAnswer);
    push(@testCases, $testCase);
    return $testCase;
}
|
|
|
# Writes $text to the file $path, encoded as UTF-8.
#
# Returns nothing on success, or a human-readable error message describing
# the step (open, write or close) that failed.
sub _writeUtf8File {
    my ($path, $text) = @_;

    # Three-argument open with a lexical handle: the path is never parsed for
    # mode characters and the handle is closed when it goes out of scope.
    open(my $handle, '>:encoding(UTF-8)', $path)
        or return "Can't open ".$path." for writing [".$!."]";
    print {$handle} $text
        or return "Can't write to ".$path." [".$!."]";
    close($handle)
        or return "Can't close ".$path." [".$!."]";
    return;
}

# Runs a single detokenizer test case and reports exactly one test result.
#
# Creates <results-dir>/<test name>/, writes the tokenized input to
# "input.txt" and the expected output to "expected.txt" there, then runs the
# detokenizer (with "-l <language>" when the case has a language) on the input
# and compares its "stdout.txt" against "expected.txt".
#
# Parameters:
#   $testCase - a DetokenizerTestCase object
sub runDetokenizerTest {
    my ($testCase) = @_;

    my $testOutputDir = catfile($results_dir, $testCase->getName());
    my $tokenizedFile = catfile($testOutputDir, "input.txt");
    my $expectedFile = catfile($testOutputDir, "expected.txt");

    # mkdir also fails when the directory already exists, so a results
    # directory cannot be reused for a second run of the same test.
    unless (mkdir($testOutputDir)) {
        return fail($testCase->getName().": Failed to create output directory ".$testOutputDir." [".$!."]");
    }

    # Write both fixture files; a failure here is reported as the test's
    # single result instead of being silently ignored.
    my @fixtures = (
        [$tokenizedFile, $testCase->getTokenizedText()],
        [$expectedFile, $testCase->getRightAnswer()],
    );
    for my $fixture (@fixtures) {
        my $error = _writeUtf8File(@{$fixture});
        return fail($testCase->getName().": ".$error) if defined $error;
    }

    runTest($testCase->getName(), $testOutputDir, $tokenizedFile, sub {
        # Command builder: add "-l <language>" only when a language is set.
        return defined($testCase->getLanguage()) ? [$detokenizer, "-l", $testCase->getLanguage()] : [$detokenizer];
    }, sub {
        # Validator: the captured standard output must equal the expected file.
        verifyIdentical($testCase->getName(), $expectedFile, catfile($testOutputDir, "stdout.txt"))
    }, 1, $testCase->getFailureExplanation());
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Runs one external command and validates its result.
#
# Parameters:
#   $testName                     - name used in failure messages
#   $outputDir                    - directory receiving the captured output
#   $stdinFile                    - file fed to the command's standard input
#   $buildCommandRoutineReference - code ref returning the command as an
#                                   array ref
#   $validationRoutineReference   - code ref that performs the actual test
#                                   assertion once the command has succeeded
#   $separateStdoutFromStderr     - true: capture to "stdout.txt" and
#                                   "stderr.txt"; false: capture both streams
#                                   to "stdout-and-stderr.txt"
#   $failureExplanation           - when defined, the validation runs inside a
#                                   TODO block with this text as the reason
#
# A non-zero exit status is reported as a failed test and the validation
# routine is not called.
sub runTest {
    my ($testName, $outputDir, $stdinFile, $buildCommandRoutineReference, $validationRoutineReference, $separateStdoutFromStderr, $failureExplanation) = @_;

    my $stdoutFile = catfile($outputDir, $separateStdoutFromStderr ? "stdout.txt" : "stdout-and-stderr.txt");
    my $stderrFile = $separateStdoutFromStderr ? catfile($outputDir, "stderr.txt") : $stdoutFile;

    my $exitStatus = runVerbosely($buildCommandRoutineReference->(), $stdinFile, $stdoutFile, $stderrFile);
    if ($exitStatus != 0) {
        return fail($testName.": command exited with status ".$exitStatus);
    }

    # Ordinary case: no known-failure explanation, validate directly.
    return $validationRoutineReference->() unless defined $failureExplanation;

    # Known failure: Test::More treats results produced while $TODO is set
    # as TODO tests.
    TODO: {
        local $TODO = $failureExplanation;
        $validationRoutineReference->();
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
|
# Logs a command and its stream redirections via note(), then runs it with
# run3.
#
# Parameters:
#   $commandRef  - array ref holding the command and its arguments
#   $stdinFile   - standard input source handed to run3 (may be undef)
#   $stdoutFile  - standard output destination handed to run3 (may be undef)
#   $stderrFile  - standard error destination handed to run3 (may be undef)
#
# Returns the value of $? after run3 has returned.
sub runVerbosely {
    my ($commandRef, $stdinFile, $stdoutFile, $stderrFile) = @_;

    note("Executing command:\n @{$commandRef}\n");

    # Only mention the streams that are actually redirected.
    my @redirections = (
        ["standard input coming from: ", $stdinFile],
        ["standard output going to: ", $stdoutFile],
        ["standard error going to: ", $stderrFile],
    );
    for my $redirection (@redirections) {
        my ($label, $target) = @{$redirection};
        note($label.$target) if defined $target;
    }

    run3($commandRef, $stdinFile, $stdoutFile, $stderrFile);
    return $?;
}
|
|
|
|
|
# Reports one test result: passes when the output file has exactly the same
# lines as the reference file.
#
# Parameters:
#   $testName      - name used as a prefix in the test description
#   $referenceFile - path of the file holding the expected content
#   $outputFile    - path of the file holding the actual content
#
# Returns the result of fail() when either file cannot be opened, otherwise
# the result of is_deeply().
sub verifyIdentical {
    my ($testName, $referenceFile, $outputFile) = @_;

    # Three-argument, read-only open: the file name is taken literally (no
    # whitespace stripping, no mode characters). Lexical handles are closed
    # automatically, including on the early returns below.
    open(my $referenceHandle, '<', $referenceFile)
        or return fail($testName.": Can't open reference file ".$referenceFile." [".$!."].");
    open(my $outputHandle, '<', $outputFile)
        or return fail($testName.": Can't open output file ".$outputFile." [".$!."].");

    my @referenceFileAsArray = <$referenceHandle>;
    my @outputFileAsArray = <$outputHandle>;
    close($referenceHandle);
    close($outputHandle);

    return is_deeply(\@outputFileAsArray, \@referenceFileAsArray, $testName.": Output file ".$outputFile." matches reference file ".$referenceFile.".");
}
|
|
|
|
|
|
|
|
|
|
|
# Value object describing one detokenizer test: its name, optional language,
# tokenized input, expected output and an optional "expected to fail" reason.
package DetokenizerTestCase;

# Constructor.
#
# Parameters (positional):
#   name, language (may be undef), tokenized text, expected detokenized text
#
# The failure explanation starts out undefined, i.e. the test is expected to
# pass until setExpectedToFail is called.
sub new {
    my ($class, $name, $language, $tokenizedText, $rightAnswer) = @_;

    my %fields = (
        _name               => $name,
        _language           => $language,
        _tokenizedText      => $tokenizedText,
        _rightAnswer        => $rightAnswer,
        _failureExplanation => undef,
    );
    return bless \%fields, $class;
}

# Returns the test name.
sub getName {
    my $self = shift;
    return $self->{_name};
}

# Returns the language code, or undef when none was given.
sub getLanguage {
    my $self = shift;
    return $self->{_language};
}

# Returns the tokenized input text.
sub getTokenizedText {
    my $self = shift;
    return $self->{_tokenizedText};
}

# Returns the expected detokenized text.
sub getRightAnswer {
    my $self = shift;
    return $self->{_rightAnswer};
}

# Marks this test as expected to fail.
#
# Parameters:
#   $failureExplanation - reason for the expected failure; a false or missing
#                         value is replaced by a generic explanation
#
# Returns the explanation that was stored.
sub setExpectedToFail {
    my $self = shift;
    my $failureExplanation = shift;
    return $self->{_failureExplanation} = $failureExplanation || "This test is expected to fail.";
}

# Returns the stored failure explanation, or undef when the test has not been
# marked as expected to fail.
sub getFailureExplanation {
    my $self = shift;
    return $self->{_failureExplanation};
}
|
|