sakharamg
/

NMTKD

Model card Files Files and versions Community

NMTKD / translation /tools /mosesdecoder /scripts /training /combine_factors_syntax.pl

sakharamg

Uploading all files

158b61b about 2 years ago

raw

history blame contribute delete

3.35 kB

	#!/usr/bin/env perl
	#
	# This file is part of moses. Its use is licensed under the GNU Lesser General
	# Public License version 2.1 or, at your option, any later version.

	# $Id$
	# given a list of files, combines them to a single corpus (sent to stdout)

	use strict;
	use warnings;
	use Getopt::Long;
	use IO::File;
	use File::Basename;

	binmode(STDIN, ":utf8");
	binmode(STDOUT, ":utf8");
	binmode(STDERR, ":utf8");

	# Every command-line argument names one input file. The first file supplies
	# the tokens (and any inline XML markup, see split_xml); each further file
	# supplies one additional factor per token of the corresponding line.
	my @addfactors = @ARGV;
	die "usage: combine_factors.pl corpusfile1 corpusfile2 ..."
	    if 0 == scalar @addfactors;

	# Open all inputs up front, in argument order.
	my @streams = map {
	    my $fn = $_;
	    my $stream = IO::File->new;
	    if ($fn =~ /\.gz$/) {
	        # List-form pipe open: the file name is handed to zcat as a single
	        # argument and never goes through a shell, so names containing
	        # spaces or shell metacharacters are read correctly.
	        open($stream, '-|', 'zcat', $fn)
	            or die "Can't open 'zcat $fn |': $!";
	    }
	    else {
	        # Single-argument IO::File open is kept on purpose: it preserves
	        # the existing handling of special arguments such as "-" or
	        # "command |" that current invocations may rely on.
	        $stream->open($fn) or die "Can't open '$fn': $!";
	    }
	    binmode($stream, ":utf8");
	    $stream;
	} @addfactors;

	my $nr = 0;
	my $firststream = shift @streams;
	shift @addfactors; # just to keep the lengths sync'ed

	while (defined(my $first_line = readline($firststream))) {
	    $nr++;
	    # progress indicator on STDERR
	    print STDERR "." if $nr % 10000 == 0;
	    print STDERR "($nr)" if $nr % 100000 == 0;
	    my ($intokens, $MARKUP) = split_xml($first_line);

	    # load lines of corresponding streams and ensure equal number of words
	    my @lines_of_extratoks;
	    foreach my $factor (0..$#streams) {
	        my $line = readline($streams[$factor]);
	        die "Additional factor file $addfactors[$factor] contains too few sentences!"
	            if !defined $line;
	        chomp($line);
	        # normalise whitespace so that split yields exactly the tokens
	        $line =~ s/\s+/ /g; $line =~ s/^ //; $line =~ s/ $//;
	        my @toks = split / /, $line;
	        die "Incompatible number of words in factor $factor on line $nr. ($#toks != $#$intokens)"
	            if $#toks != $#$intokens;
	        $lines_of_extratoks[$factor] = \@toks;
	    }

	    # for every token, print the factors in the order as user wished
	    for my $i (0 .. $#$intokens) {
	        # markup collected in front of this token is emitted first
	        print $$MARKUP[$i];

	        my @outtoken = ($$intokens[$i]); # add the first one
	        foreach my $factor (0..$#streams) {
	            my $f = $lines_of_extratoks[$factor]->[$i];
	            die "Missed factor value for word " . ($i + 1)
	                . " on line $nr in $addfactors[$factor]"
	                if !defined $f || $f eq "";
	            push @outtoken, $f;
	        }
	        print " " if $i != 0;
	        print join("|", @outtoken);
	    }
	    # markup that followed the last token of the line
	    print $$MARKUP[$#$MARKUP];
	    print "\n";
	}
	close $firststream;
	close $_ for @streams;
	print STDERR "Done.\n";

	# store away xml markup
	#
	# Splits one corpus line into its word tokens and the inline XML tags that
	# surround them.
	#
	# Argument:
	#   $line - the text of one sentence; a trailing newline is discarded.
	# Returns a two-element list (\@WORD, \@MARKUP):
	#   @WORD   - the whitespace-separated tokens in order, XML tags removed.
	#   @MARKUP - one entry per token plus one. $MARKUP[$i] holds the tags seen
	#             directly before $WORD[$i], each followed by a single space;
	#             the last entry holds the tags after the final token, with its
	#             trailing space removed.
	# A tag that immediately follows a token ending in "|" is taken to be a
	# factor value and is appended to that token rather than stored as markup.
	# Dies if the remaining text matches none of the patterns.
	sub split_xml {
	    my ($line) = @_;
	    my (@WORD, @MARKUP);
	    my $i = 0;
	    $MARKUP[0] = "";
	    while ($line =~ /\S/) {
	        # XML tag
	        if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
	            my $potential_xml = $1;
	            my $line_next = $2;
	            # exception for factor that is an XML tag
	            if ($line =~ /^\S/ && scalar(@WORD) > 0 && $WORD[$i-1] =~ /\|$/) {
	                $WORD[$i-1] .= $potential_xml;
	                # keep any factor separators that follow the tag attached
	                if ($line_next =~ /^(\|+)(.*)$/) {
	                    $WORD[$i-1] .= $1;
	                    $line_next = $2;
	                }
	            }
	            else {
	                $MARKUP[$i] .= $potential_xml . " ";
	            }
	            $line = $line_next;
	        }
	        # non-XML text
	        elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) {
	            $WORD[$i++] = $1;
	            $MARKUP[$i] = "";
	            $line = $2;
	        }
	        # '<' or '>' occurs in word, but it's not an XML tag
	        elsif ($line =~ /^\s*(\S+)(.*)$/) {
	            $WORD[$i++] = $1;
	            $MARKUP[$i] = "";
	            $line = $2;
	        }
	        else {
	            die("ERROR: huh? $line\n");
	        }
	    }
	    # drop the space appended after the last trailing tag (no-op when empty)
	    chop($MARKUP[$#MARKUP]);
	    return (\@WORD, \@MARKUP);
	}