|
|
|
|
|
|
|
|
|
|
|
|
|
use warnings; |
|
use strict; |
|
use Getopt::Long; |
|
# Command-line options and their defaults.
my $help;                       # -help: print the usage line and exit
my $lc = 0;                     # -lowercase / -lc: lowercase both sides before filtering
my $ignore_ratio = 0;           # -ignore-ratio: disable the length-ratio filter
my $ignore_xml = 0;             # -ignore-xml: do not count XML-style tags as words
my $enc = "utf8";               # -encoding: I/O encoding applied to all corpus handles

# -max-word-length / -mwl: pairs containing a longer token are dropped.
my $max_word_length = 1000;

# -ratio: maximum allowed ratio between the two sides' word counts.
my $ratio = 9;

# The word-length limit is used in arithmetic and as a regex repeat count, so
# it is parsed as an integer ("=i"); a non-numeric value is rejected here
# rather than silently turning into a limit that filters out every pair.
GetOptions(
    "help"                  => \$help,
    "lowercase|lc"          => \$lc,
    "encoding=s"            => \$enc,
    "ratio=f"               => \$ratio,
    "ignore-ratio"          => \$ignore_ratio,
    "ignore-xml"            => \$ignore_xml,
    "max-word-length|mwl=i" => \$max_word_length
) or exit(1);
|
|
|
# Six positional arguments are required; a seventh is optional.
# An explicit -help request prints the usage line on stdout and succeeds;
# a call with too few arguments is an error, so the usage line goes to stderr
# and the exit status is non-zero.
if (scalar(@ARGV) < 6 || $help) {
    my $usage = "syntax: clean-corpus-n.perl [-ratio n] corpus l1 l2 clean-corpus min max [lines retained file]\n";
    if ($help) {
        print $usage;
        exit 0;
    }
    print STDERR $usage;
    exit 1;
}

my $corpus = $ARGV[0];    # input stem: inputs are "$corpus.$l1" and "$corpus.$l2"
my $l1 = $ARGV[1];        # suffix of the first side
my $l2 = $ARGV[2];        # suffix of the second side
my $out = $ARGV[3];       # output stem: outputs are "$out.$l1" and "$out.$l2"
my $min = $ARGV[4];       # minimum word count per side (inclusive)
my $max = $ARGV[5];       # maximum word count per side (inclusive)

# Optional file that receives the input line number of every retained pair.
my $linesRetainedFile = "";
if (scalar(@ARGV) > 6) {
    $linesRetainedFile = $ARGV[6];
    # Three-argument open: the file name is never interpreted as a mode or command.
    open(LINES_RETAINED, '>', $linesRetainedFile)
        or die "Can't write $linesRetainedFile: $!";
}

print STDERR "clean-corpus.perl: processing $corpus.$l1 & .$l2 to $out, cutoff $min-$max, ratio $ratio\n";
|
|
|
# Open one side of the input corpus for reading on the given handle.
#   $handle - glob reference of the filehandle to open (e.g. \*F)
#   $path   - path of the uncompressed file
# Uses $path when it exists, otherwise "$path.gz" decompressed through gunzip.
# Dies when neither exists or when the open fails.
sub _open_corpus_input {
    my ($handle, $path) = @_;
    if (-e $path) {
        open($handle, '<', $path) or die "Can't open '$path': $!";
    }
    elsif (-e "$path.gz") {
        # List-form pipe open runs gunzip without a shell, so spaces or shell
        # metacharacters in the file name cannot alter the command.
        open($handle, '-|', 'gunzip', '-c', "$path.gz")
            or die "Can't open 'gunzip -c $path.gz |': $!";
    }
    else {
        die "Error: $path does not exist";
    }
    return;
}

my $l1input = "$corpus.$l1";
_open_corpus_input(\*F, $l1input);

my $l2input = "$corpus.$l2";
_open_corpus_input(\*E, $l2input);

# Cleaned output, one file per side.
open(FO, '>', "$out.$l1") or die "Can't write $out.$l1: $!";
open(EO, '>', "$out.$l2") or die "Can't write $out.$l2: $!";

# Apply the requested encoding layer to every corpus handle.
my $binmode;
if ($enc eq "utf8") {
    $binmode = ":utf8";
} else {
    $binmode = ":encoding($enc)";
}
binmode(F, $binmode);
binmode(E, $binmode);
binmode(FO, $binmode);
binmode(EO, $binmode);
|
|
|
# Main filter: read both sides in lock step, normalise whitespace, and write
# only the pairs that pass the length, ratio and token-length checks.
my $innr = 0;         # sentence pairs read
my $outnr = 0;        # sentence pairs written
my $factored_flag;    # true when the first pair contains "|" on either side

# A token is a run of characters that are neither whitespace nor "|". A run one
# character longer than the limit marks the pair as unusable. The limit does
# not change, so the pattern is compiled once.
my $max_word_length_plus_one = $max_word_length + 1;
my $overlong_token = qr/[^\s\|]{$max_word_length_plus_one}/;

while (my $f = <F>) {
    $innr++;
    print STDERR "." if $innr % 10000 == 0;
    print STDERR "($innr)" if $innr % 100000 == 0;
    my $e = <E>;
    die "$corpus.$l2 is too short!" if !defined $e;
    chomp($e);
    chomp($f);

    # The first pair decides whether "|" is kept for the whole run.
    if ($innr == 1) {
        $factored_flag = ($e =~ /\|/ || $f =~ /\|/);
    }

    if ($lc) {
        $e = lc($e);
        $f = lc($f);
    }

    # Drop "|" (unless kept, see above), collapse whitespace runs to a single
    # space and trim both ends. The loop variable aliases $e and $f.
    for my $side ($e, $f) {
        $side =~ s/\|//g unless $factored_flag;
        $side =~ s/\s+/ /g;
        $side =~ s/^ //;
        $side =~ s/ $//;
    }
    next if $f eq '';
    next if $e eq '';

    my $ec = word_count($e);
    my $fc = word_count($f);
    next if $ec > $max;
    next if $fc > $max;
    next if $ec < $min;
    next if $fc < $min;

    if (!$ignore_ratio) {
        if ($ec == 0 || $fc == 0) {
            # A zero count can get here when min is 0 (e.g. -ignore-xml on a
            # tag-only line); dividing would abort the run. One empty side
            # against a non-empty one is an unbounded ratio and is dropped;
            # two empty sides are balanced and kept.
            next if $ec != $fc;
        } else {
            next if $ec/$fc > $ratio;
            next if $fc/$ec > $ratio;
        }
    }

    next if $e =~ $overlong_token;
    next if $f =~ $overlong_token;

    # An empty factor ("|" preceded by a space or another "|") is fatal.
    die "There is a blank factor in $corpus.$l1 on line $innr: $f"
        if $f =~ /[ \|]\|/;
    die "There is a blank factor in $corpus.$l2 on line $innr: $e"
        if $e =~ /[ \|]\|/;

    $outnr++;
    print FO $f."\n";
    print EO $e."\n";

    if ($linesRetainedFile ne "") {
        print LINES_RETAINED $innr."\n";
    }
}

# Buffered write errors only surface at close, so check it on every output.
if ($linesRetainedFile ne "") {
    close(LINES_RETAINED) or die "Can't write $linesRetainedFile: $!";
}

print STDERR "\n";

# The first side is exhausted; anything left on the second is a mismatch.
my $leftover = <E>;
die "$corpus.$l2 is too long!" if defined $leftover;

close(FO) or die "Can't write $out.$l1: $!";
close(EO) or die "Can't write $out.$l2: $!";

print STDERR "Input sentences: $innr Output sentences: $outnr\n";
|
|
|
# Count the space-separated words in a line.
#   $line - the text to count; callers in this file pass whitespace-normalised
#           text (single spaces, no leading or trailing space)
# Returns the number of words. When $ignore_xml is set, XML-style tags are
# replaced by a space first, so they do not count as words.
sub word_count {
    my ($line) = @_;
    if ($ignore_xml) {
        # A tag is "<", a non-space, optionally more text ending in a
        # non-space, then ">". The optional group lets one-character tags such
        # as "<b>" match too; "<" followed by a space is never treated as a tag.
        $line =~ s/<\S(?:[^>]*\S)?>/ /g;
        # Removing tags can leave runs of spaces or spaces at the edges.
        $line =~ s/\s+/ /g;
        $line =~ s/^ //;
        $line =~ s/ $//;
    }
    my @tokens = split(/ /, $line);
    return scalar @tokens;
}
|
|