File size: 9,681 Bytes
158b61b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 |
#!/usr/bin/env perl
#
# Detokenization tests.
#
use warnings;
use strict;
# This is here to suppress (false) warnings about OLDOUT and OLDERR being used only once. Maybe there is a less brutish way to suppress that, but I don't know it.
no warnings 'once';
use utf8;
use Cwd ('abs_path');
use File::Spec::Functions;
use File::Basename ('dirname');
use IPC::Run3;
use Getopt::Long;
use Test::More;
# Command-line options:
#   --results-dir <dir>    required; must already exist and be writable
#   --detokenizer <file>   optional; path of the detokenizer script under test
my $detokenizer;
my $results_dir;
my $optionsParsed = GetOptions(
    "detokenizer=s" => \$detokenizer,
    "results-dir=s" => \$results_dir,
);
exit 1 unless $optionsParsed;

# The results directory is mandatory: print the usage line and give up without it.
if (!defined $results_dir) {
    print STDERR "Usage: run-test-detokenizer.t --results-dir <RESULTS-DIRECTORY> [--detokenizer <DETOKENIZER-SCRIPT>]\n";
    exit 1;
}
if (!(-d $results_dir) || !(-w $results_dir)) {
    die "ERROR: Results directory ".$results_dir." doesn't exist or is not a writable directory. Dying";
}

# When no detokenizer was given, fall back to scripts/tokenizer/detokenizer.perl,
# resolved relative to the parent of the directory containing this script.
if (!$detokenizer) {
    my $parentOfScriptDir = dirname(dirname(abs_path($0)));
    $detokenizer = catfile($parentOfScriptDir, "scripts", "tokenizer", "detokenizer.perl");
}
if (!(-f $detokenizer)) {
    die "ERROR: Detokenizer script ".$detokenizer." does not exist. Dying";
}

# Test cases in registration order; filled by addDetokenizerTest.
my @testCases = ();
######################################
# Definitions of individual test cases
######################################
# Each call registers one test case: name, language (or undef), then two here-documents.
# The TOK here-document is the tokenized input; the EXP here-document is the expected output.

# Plain English sentences, one per line
addDetokenizerTest("TEST_ENGLISH_EASY", "en", <<'TOK', <<'EXP');
This sentence is really simple , so it should not be hard to detokenize .
This one is no more difficult , but , hey , it is on a new line .
TOK
This sentence is really simple, so it should not be hard to detokenize.
This one is no more difficult, but, hey, it is on a new line.
EXP

# English with a double-quoted phrase
addDetokenizerTest("TEST_ENGLISH_DOUBLEQUOTES", "en", <<'TOK', <<'EXP');
This is a somewhat " less simple " test .
TOK
This is a somewhat "less simple" test.
EXP

# Plain French sentence
addDetokenizerTest("TEST_FRENCH_EASY", "fr", <<'TOK', <<'EXP');
Voici une phrase simple .
TOK
Voici une phrase simple.
EXP

# French with an apostrophe
addDetokenizerTest("TEST_FRENCH_APOSTROPHE", "fr", <<'TOK', <<'EXP');
Moi , j' ai une apostrophe .
TOK
Moi, j'ai une apostrophe.
EXP

# French with an apostrophe on the second-last word
addDetokenizerTest("TEST_FRENCH_APOSTROPHE_PENULTIMATE", "fr", <<'TOK', <<'EXP');
de musique rap issus de l' immigration
TOK
de musique rap issus de l'immigration
EXP

# German with non-ASCII characters.
# Note: no language is specified because the detokenizer errors if you pass in a language
# for which it has no special rules, of which German is an example.
addDetokenizerTest("TEST_GERMAN_NONASCII", undef, <<'TOK', <<'EXP');
Ich hoffe , daß Sie schöne Ferien hatten .
Frau Präsidentin ! Frau Díez González und ich hatten einige Anfragen
TOK
Ich hoffe, daß Sie schöne Ferien hatten.
Frau Präsidentin! Frau Díez González und ich hatten einige Anfragen
EXP

# Plain Chinese sentence
addDetokenizerTest("TEST_CHINESE_EASY", undef, <<'TOK', <<'EXP');
这 是 一个 简单 的的 汉语 句子 。
TOK
这是一个简单的的汉语句子。
EXP

# Plain Japanese sentences
addDetokenizerTest("TEST_JAPANESE_EASY", undef, <<'TOK', <<'EXP');
どう しょ う か な 。
どこ で 食べ たい 。
TOK
どうしょうかな。
どこで食べたい。
EXP
######################################
# Now run those babies ...
######################################
# Plan one test per registered case, then run the cases in registration order.
plan(tests => scalar(@testCases));
for my $registeredCase (@testCases) {
    runDetokenizerTest($registeredCase);
}
############
## Utilities
############
# Registers a detokenizer test case.
#   $testName      - name of the test case
#   $language      - language handed to the test case; may be undef
#   $tokenizedText - tokenized input text
#   $rightAnswer   - expected detokenized text
# Builds a DetokenizerTestCase from the arguments, appends it to @testCases, and returns it.
sub addDetokenizerTest {
    my ($testName, $language, $tokenizedText, $rightAnswer) = @_;
    push @testCases, DetokenizerTestCase->new($testName, $language, $tokenizedText, $rightAnswer);
    return $testCases[-1];
}
# Runs a single detokenizer test case.
#   $testCase - a DetokenizerTestCase
# Creates a directory named after the test case under $results_dir, writes the tokenized text
# to input.txt and the right answer to expected.txt in it, then uses runTest to feed input.txt
# to the detokenizer (with "-l <language>" when the test case has a language) and to compare
# the resulting stdout.txt with expected.txt.
# If the directory or either file cannot be created, the test is reported as failed and the
# detokenizer is not run.
sub runDetokenizerTest {
    my ($testCase) = @_;
    my $testName = $testCase->getName();
    my $testOutputDir = catfile($results_dir, $testName);
    my $tokenizedFile = catfile($testOutputDir, "input.txt");
    my $expectedFile = catfile($testOutputDir, "expected.txt");

    # Fail if we can't make the test output directory
    unless (mkdir($testOutputDir)) {
        return fail($testName.": Failed to create output directory ".$testOutputDir." [".$!."]");
    }

    # Write both fixture files. An I/O error fails the test here instead of letting the
    # detokenizer run against a missing or truncated file.
    my @fixtures = (
        [$tokenizedFile, $testCase->getTokenizedText()],
        [$expectedFile, $testCase->getRightAnswer()],
    );
    foreach my $fixture (@fixtures) {
        my ($path, $text) = @{$fixture};
        my $error = _writeUtf8File($path, $text);
        if (defined $error) {
            return fail($testName.": Failed to write ".$path." [".$error."]");
        }
    }

    return runTest($testName, $testOutputDir, $tokenizedFile, sub {
        return defined($testCase->getLanguage()) ? [$detokenizer, "-l", $testCase->getLanguage()] : [$detokenizer];
    }, sub {
        verifyIdentical($testName, $expectedFile, catfile($testOutputDir, "stdout.txt"))
    }, 1, $testCase->getFailureExplanation());
}

# Writes $text to the file $path, encoded as UTF-8, replacing any existing content.
# Returns undef on success, or the system error message if open, print or close fails.
sub _writeUtf8File {
    my ($path, $text) = @_;
    # Three-argument open: the path is never interpreted as a mode or a command.
    open(my $handle, '>:encoding(UTF-8)', $path) or return "$!";
    unless (print {$handle} $text) {
        my $error = "$!";
        close($handle);
        return $error;
    }
    # Buffered write errors only surface at close, so check it as well.
    close($handle) or return "$!";
    return;
}
# Runs a system command and then validates what it produced.
#   $testName is used in the failure message when the command does not exit with status 0
#   $outputDir is the directory in which the command's output files are created
#   $stdinFile, if defined, is a file to send to the command via STDIN
#   $buildCommandRoutineReference is a reference to a zero-argument subroutine that returns the
#                                 system command to run in the form of an array reference
#   $validationRoutineReference is a reference to a zero-argument subroutine that makes exactly
#                               one call to ok() or similar to validate the contents of the
#                               output directory
#   $separateStdoutFromStderr is an optional boolean argument; if omitted or false, the command's
#                             STDOUT and STDERR are mixed together in an output file called
#                             stdout-and-stderr.txt; otherwise, they are printed to separate
#                             output files called stdout.txt and stderr.txt, respectively
#   $failureExplanation is an explanation of why the test is expected to fail. If the test is
#                       expected to pass, then this should be left undefined. Even in the case
#                       of a test that is expected to fail, the system command is still expected
#                       to exit normally -- only the validation routine is expected to fail.
sub runTest {
    my ($testName, $outputDir, $stdinFile, $buildCommandRoutineReference, $validationRoutineReference, $separateStdoutFromStderr, $failureExplanation) = @_;

    # With separate streams each gets its own file; otherwise both share a single file.
    my $stdoutFile = catfile($outputDir, $separateStdoutFromStderr ? "stdout.txt" : "stdout-and-stderr.txt");
    my $stderrFile = $separateStdoutFromStderr ? catfile($outputDir, "stderr.txt") : $stdoutFile;

    my $command = $buildCommandRoutineReference->();
    my $exitStatus = runVerbosely($command, $stdinFile, $stdoutFile, $stderrFile);
    if ($exitStatus != 0) {
        return fail($testName.": command exited with status ".$exitStatus);
    }

    # A test that is expected to pass is validated directly.
    return $validationRoutineReference->() unless defined $failureExplanation;

    # A test that is expected to fail is validated inside a TODO block, with the explanation as $TODO.
    TODO: {
        local $TODO = $failureExplanation;
        $validationRoutineReference->();
    }
}
# Logs the given command and its stream redirections via note(), then runs it with run3.
#   $commandRef is an array reference holding the command and its arguments
#   $stdinFile, if defined, is a file to send to the command via STDIN
#   $stdoutFile and $stderrFile, if defined, are file paths to which the command's standard
#   output and standard error, respectively, are written. They can be the same file.
# Returns the value of $? once run3 has returned.
sub runVerbosely {
    my ($commandRef, $stdinFile, $stdoutFile, $stderrFile) = @_;

    note("Executing command:\n @{$commandRef}\n");

    # Only streams that are actually redirected get announced.
    my @redirections = (
        ["standard input coming from: ", $stdinFile],
        ["standard output going to: ", $stdoutFile],
        ["standard error going to: ", $stderrFile],
    );
    for my $redirection (@redirections) {
        my ($label, $path) = @{$redirection};
        next unless defined $path;
        note($label.$path);
    }

    run3($commandRef, $stdinFile, $stdoutFile, $stderrFile);
    return $?;
}
# Verify that the given output file is identical, line by line, to the given reference file.
#   $testName      - prefix for the test description and for failure messages
#   $referenceFile - path of the file holding the expected content
#   $outputFile    - path of the file holding the actual content
# Makes exactly one test call: fail() if either file cannot be opened, otherwise is_deeply()
# on the two files' lines. Returns the result of that call.
sub verifyIdentical {
    my ($testName, $referenceFile, $outputFile) = @_;
    # Three-argument open takes the paths literally (no mode characters, pipes or whitespace
    # trimming). Lexical handles are closed automatically on an early return.
    open(my $referenceHandle, '<', $referenceFile) or return fail($testName.": Can't open reference file ".$referenceFile." [".$!."].");
    open(my $outputHandle, '<', $outputFile) or return fail($testName.": Can't open output file ".$outputFile." [".$!."].");
    my @referenceFileAsArray = <$referenceHandle>;
    my @outputFileAsArray = <$outputHandle>;
    close($referenceHandle);
    close($outputHandle);
    return is_deeply(\@outputFileAsArray, \@referenceFileAsArray, $testName.": Output file ".$outputFile." matches reference file ".$referenceFile.".");
}
##%%%%%%%%%%%%%%%%%%%%%%%%%%%##
## DetokenizerTestCase class ##
package DetokenizerTestCase;

# A single detokenizer test case: a name, a language (possibly undef), the tokenized input
# text, the expected output text, and an optional explanation of why the case is expected to fail.

# Constructor. Positional arguments after the class: name, language, tokenized text, right answer.
# A new test case has no failure explanation.
sub new {
    my ($class, $name, $language, $tokenizedText, $rightAnswer) = @_;
    my %fields = (
        _name               => $name,
        _language           => $language,
        _tokenizedText      => $tokenizedText,
        _rightAnswer        => $rightAnswer,
        _failureExplanation => undef,
    );
    return bless \%fields, $class;
}

# Returns the name given to the constructor.
sub getName {
    return $_[0]->{_name};
}

# Returns the language given to the constructor (may be undef).
sub getLanguage {
    return $_[0]->{_language};
}

# Returns the tokenized text given to the constructor.
sub getTokenizedText {
    return $_[0]->{_tokenizedText};
}

# Returns the right answer given to the constructor.
sub getRightAnswer {
    return $_[0]->{_rightAnswer};
}

# Call this routine to indicate that this test case is expected to fail.
# (The detokenizer script is still expected to exit normally, but the output is not expected to
# match the right answer because of a bug or unimplemented use case.)
# A generic explanation is stored when $failureExplanation is omitted or false.
sub setExpectedToFail {
    my $self = shift;
    my $failureExplanation = shift;
    $failureExplanation = "This test is expected to fail." unless $failureExplanation;
    return $self->{_failureExplanation} = $failureExplanation;
}

# Returns a string explaining why this test is expected to fail, or undef if this test is
# expected to pass.
sub getFailureExplanation {
    return $_[0]->{_failureExplanation};
}
|