#!/usr/bin/env perl
#
# This file is part of moses.  Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
#
# Extract plain-text reference translations from an SGML reference file,
# emitting the documents in the order in which they appear in an SGML source
# file.
#
# Usage: reference-from-sgm.perl ref src out
#   ref  SGML file containing the reference segments (<seg ...>...</seg>)
#   src  SGML file whose docid="..." attributes define the document order
#   out  output file name; if `ref` contains more than one reference set, one
#        file per set is written, named "<out><i>" (or "<out>.ref<i>" when
#        <out> already ends in ".<digits>"), with i counting from 0 over the
#        reference set identifiers in sorted order
#
# NOTE(review): the markup inside the regular expressions and the readline
# operators were missing from the reviewed copy of this script.  The
# <refset refid="...">, <doc sysid="..." docid="..."> and <seg ...> handling
# below was reconstructed from the surviving variable names, patterns and
# error messages -- confirm against the upstream moses script.

use warnings;
use strict;

# Run as a program only when executed directly, so that the subroutines can
# also be loaded (e.g. by a test) without triggering the command line check.
main(@ARGV) unless caller;

# Entry point: validates the three command line arguments and runs the
# read-order / read-references / write-output pipeline.
# Dies with a usage message unless exactly three arguments are given.
sub main {
    my @args = @_;
    die("ERROR syntax: reference-from-sgm.perl ref src out")
        unless scalar @args == 3;
    my ($ref, $src, $txt) = @args;

    my @order       = read_document_order($src);
    my $segments_of = read_reference_segments($ref);
    write_reference_files($txt, \@order, $segments_of);
    return;
}

# Returns the list of document ids found in $src, one per line that carries a
# docid="..." attribute, in file order.  Dies if $src cannot be opened.
sub read_document_order {
    my ($src) = @_;

    # Three-argument open: the file name can no longer inject an open mode.
    open(my $in, '<', $src) or die("ERROR not found: $src");
    my @order;
    while (my $line = <$in>) {
        next unless $line =~ /docid="([^\"]+)"/;
        push @order, $1;
    }
    close($in);
    return @order;
}

# Parses the reference SGML file $ref and returns a hash reference of the form
#   { system_id => { doc_id => [ segment_text, ... ] } }.
# The system id is the refid of the enclosing <refset> when there is one,
# otherwise the sysid of the <doc> tag.  Dies if the file cannot be opened, if
# a <doc> tag lacks the attributes needed to identify it, or if a segment
# appears before any document has been opened.
sub read_reference_segments {
    my ($ref) = @_;

    my %segments_of;
    my $system_from_refset = 0;
    my ($doc, $system);

    open(my $in, '<', $ref) or die "Cannot open: $!";
    while (my $line = <$in>) {
        # A <refset refid="..."> names the reference set for every document
        # that follows and takes precedence over per-document sysid values.
        if ($line =~ /<refset\b/i && $line =~ /refid="([^\"]+)"/i) {
            $system             = $1;
            $system_from_refset = 1;
        }

        if ($line =~ /<doc\b/i) {
            unless ($system_from_refset) {
                $line =~ /sysid="([^\"]+)"/i
                    or die("ERROR no sysid or refset refid for: $line");
                $system = $1;
            }
            $line =~ /docid="([^\"]+)"/i
                or die("ERROR no docid in: $line");
            $doc = $1;
        }

        # A segment may span several physical lines: keep appending lines
        # until the closing tag shows up.  Line breaks become single spaces so
        # that the one-line pattern below can match across them; stop at end
        # of file instead of appending undef forever.
        while ($line =~ /<seg[^>]+>/i && $line !~ /<seg[^>]+>.*<\/seg>/i) {
            my $next_line = <$in>;
            last unless defined $next_line;
            chomp($line);
            $line .= " " . $next_line;
        }

        # Non-greedy capture so that the trailing \s* really strips the
        # whitespace in front of </seg>.
        if ($line =~ /<seg[^>]+>\s*(.+?)\s*<\/seg>/i) {
            my $text = $1;
            die("ERROR segment outside of any document: $line")
                unless defined $system && defined $doc;
            push @{ $segments_of{$system}{$doc} }, $text;
        }
    }
    close($in);

    return \%segments_of;
}

# Writes one plain-text file per reference set in $segments_of (structure as
# returned by read_reference_segments), one segment per line, with documents
# in the order given by the array reference $order.  $txt is the base output
# name; see the usage comment at the top for the multi-reference naming rule.
# Dies if a document from $order is missing in a reference set or if an
# output file cannot be written.
sub write_reference_files {
    my ($txt, $order, $segments_of) = @_;

    # Sorted so that the numbering of the output files is reproducible; plain
    # hash order differs from run to run.
    my @systems = sort keys %{$segments_of};

    my $i = 0;
    foreach my $system (@systems) {
        my $outfile = $txt;
        if (scalar @systems > 1) {
            if ($outfile =~ /\.\d+$/) {
                $outfile .= ".ref$i";
            }
            else {
                $outfile .= $i;
            }
        }

        open(my $out, '>', $outfile)
            or die("ERROR cannot write $outfile: $!");
        foreach my $doc (@{$order}) {
            die("can't find '$doc' for ref '$system'")
                unless defined $segments_of->{$system}{$doc};
            foreach my $line (@{ $segments_of->{$system}{$doc} }) {
                print {$out} $line . "\n";
            }
        }
        # Buffered write errors only surface when the handle is closed.
        close($out) or die("ERROR cannot close $outfile: $!");
        $i++;
    }
    return;
}

# True value so that the file can also be loaded with require/do.
1;