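# combine_factors.pl corpusfile1 corpusfile2 ...
#
# Combines several token-aligned corpus files into one factored corpus:
# each token of corpusfile1 is written to standard output followed by the
# tokens found at the same sentence and word position in the remaining
# files, joined with "|".
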
use strict;
use warnings;
use Getopt::Long;
use IO::File;
use File::Basename;

# The standard streams are read and written as UTF-8 text.
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

# Every command-line argument names one corpus file. The first file supplies
# the base tokens (and any inline markup); each further file supplies one
# extra factor per token.
my @addfactors = @ARGV;
die "usage: combine_factors.pl corpusfile1 corpusfile2 ..."
    if 0 == scalar @addfactors;

# Open all inputs up front so that a missing file is reported before any
# output is produced.
my @streams = map {
    my $fn = $_;
    my $stream;
    if ($fn eq '-') {
        # "-" has always meant standard input here (two-argument open
        # semantics); keep accepting it.
        $stream = \*STDIN;
    }
    elsif ($fn =~ /\.gz$/) {
        # List-form pipe open: the file name is handed to zcat as a single
        # argument and never passes through a shell.
        open($stream, '-|', 'zcat', $fn)
            or die "Can't run zcat on '$fn': $!";
    }
    else {
        open($stream, '<', $fn) or die "Can't open '$fn': $!";
    }
    binmode($stream, ":utf8");
    $stream;
} @addfactors;

my $nr = 0;
my $firststream = shift @streams;
# Drop the first name as well so that @addfactors stays index-aligned with
# the remaining @streams.
my $firstname = shift @addfactors;

while (defined(my $sentence = readline($firststream))) {
    $nr++;
    # Progress indicator on STDERR.
    print STDERR "." if $nr % 10000 == 0;
    print STDERR "($nr)" if $nr % 100000 == 0;

    my ($intokens, $MARKUP) = split_xml($sentence);
    my $ntokens = scalar @$intokens;

    # Read the matching line of every factor file and make sure it has
    # exactly as many words as the base sentence.
    my @lines_of_extratoks;
    foreach my $factor (0 .. $#streams) {
        my $line = readline($streams[$factor]);
        die "Additional factor file $addfactors[$factor] contains too few sentences!"
            if !defined $line;
        chomp($line);
        # Normalise whitespace so that splitting on a single space is exact.
        $line =~ s/\s+/ /g; $line =~ s/^ //; $line =~ s/ $//;
        my @toks = split / /, $line;
        my $ntoks = scalar @toks;
        die "Incompatible number of words in factor $factor on line $nr. ($ntoks != $ntokens)"
            if $ntoks != $ntokens;
        $lines_of_extratoks[$factor] = \@toks;
    }

    # Emit every token with its additional factors appended.
    # NOTE(review): markup collected in front of a token is written directly
    # after the previous token, and the token separator follows the markup's
    # own trailing space (e.g. "a|x<b>  c|y"). Output is kept exactly as it
    # was; confirm that this spacing is intended.
    for my $i (0 .. $#$intokens) {
        print $$MARKUP[$i];

        my @outtoken = ($$intokens[$i]);
        foreach my $factor (0 .. $#streams) {
            my $f = $lines_of_extratoks[$factor]->[$i];
            die "Missed factor value for word " . ($i + 1) . " on line $nr in $addfactors[$factor]"
                if !defined $f || $f eq "";
            push @outtoken, $f;
        }
        print " " if $i != 0;
        print join("|", @outtoken);
    }
    # Markup that follows the last token of the sentence.
    print $$MARKUP[$#$MARKUP];
    print "\n";
}
close $firststream;

# A factor file that still has lines left is not aligned with the base
# corpus; report it instead of ignoring it silently.
foreach my $factor (0 .. $#streams) {
    warn "Additional factor file $addfactors[$factor] contains more sentences than $firstname ($nr)\n"
        if defined(readline($streams[$factor]));
    close $streams[$factor];
}
print STDERR "Done.\n";

# split_xml($line)
#
# Splits one line of text into its word tokens and the XML-like tags found
# between them.
#
# Parameters:
#   $line - the text to split; an undefined value is treated as an empty line.
#
# Returns a two-element list (\@WORD, \@MARKUP):
#   \@WORD   - the tokens in order of appearance.
#   \@MARKUP - always one element longer than @WORD. $MARKUP[$i] holds the
#              tags seen before $WORD[$i], each followed by one space (""
#              when there are none). The last element holds the tags after
#              the final word, without the trailing space.
#
# Dies with "ERROR: huh? ..." if the remaining text matches none of the
# token patterns.
sub split_xml {
    my ($line) = @_;
    $line = "" if !defined $line;

    my (@WORD, @MARKUP);
    my $i = 0;    # index of the next word; always equals scalar(@WORD)
    $MARKUP[0] = "";

    while ($line =~ /\S/) {

        # Something shaped like a tag: "<", a non-space character, anything
        # up to the next ">".
        if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
            my $potential_xml = $1;
            my $line_next = $2;

            # Exception: a tag attached (no whitespace) to a word that ends
            # in "|" is a factor value of that word, not markup.
            if ($line =~ /^\S/ && scalar(@WORD) > 0 && $WORD[$i-1] =~ /\|$/) {
                $WORD[$i-1] .= $potential_xml;
                # Keep the rest of the same token with the word as well: the
                # following "|" separators and any factor text attached to
                # them. Without this, "x|<unk>|NN" would be cut into the two
                # words "x|<unk>|" and "NN".
                if ($line_next =~ /^(\|+[^\s<>]*)(.*)$/) {
                    $WORD[$i-1] .= $1;
                    $line_next = $2;
                }
            }
            else {
                # Ordinary markup belongs to the word that comes next.
                $MARKUP[$i] .= $potential_xml." ";
            }
            $line = $line_next;
        }

        # A run of characters containing neither whitespace nor angle
        # brackets is a word.
        elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) {
            $WORD[$i++] = $1;
            $MARKUP[$i] = "";
            $line = $2;
        }

        # Anything else that is not whitespace (for instance a stray "<" or
        # ">" that does not form a tag) is taken as a word too.
        elsif ($line =~ /^\s*(\S+)(.*)$/) {
            $WORD[$i++] = $1;
            $MARKUP[$i] = "";
            $line = $2;
        }
        else {
            die("ERROR: huh? $line\n");
        }
    }

    # Remove the trailing space from the markup after the last word (no
    # effect when that markup is empty).
    chop($MARKUP[$#MARKUP]);
    return (\@WORD, \@MARKUP);
}