|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include "ug_mm_ttrack.h" |
|
#include "ug_deptree.h" |
|
#include "tpt_tokenindex.h" |
|
#include "tpt_pickler.h" |
|
#include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h" |
|
#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h" |
|
|
|
#include <iostream> |
|
#include <string> |
|
#include <sstream> |
|
|
|
#include <boost/program_options.hpp> |
|
#include <boost/scoped_ptr.hpp> |
|
|
|
#include "util/exception.hh" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
using namespace std; |
|
using namespace ugdiss; |
|
using namespace sapt; |
|
|
|
ofstream t1out,t2out,mam; |
|
int len1=0,len2=0; |
|
size_t lineCtr=0,sid=0; |
|
bool conll=false; |
|
bool skip=false; |
|
bool debug=false; |
|
TokenIndex V1; |
|
|
|
string mtt1name,mtt2name,o1name,o2name,mamname,cfgFile; |
|
string dataFormat,A3filename; |
|
void |
|
interpret_args(int ac, char* av[]) |
|
{ |
|
namespace po=boost::program_options; |
|
po::variables_map vm; |
|
po::options_description o("Options"); |
|
po::options_description h("Hidden Options"); |
|
po::positional_options_description a; |
|
|
|
o.add_options() |
|
("help,h", "print this message") |
|
("cfg,f", po::value<string>(&cfgFile),"config file") |
|
("a3", po::value<string>(&A3filename), "name of A3 file (for sanity checks)") |
|
("o1", po::value<string>(&o1name), "name of output file for track 1") |
|
("o2", po::value<string>(&o2name), "name of output file for track 2") |
|
("skip", "skip sentence pairs without word alignment (requires --o1 and --o2)") |
|
("debug,d", "debug mode") |
|
("t1", po::value<string>(&mtt1name), "file name of L1 mapped token track") |
|
("t2", po::value<string>(&mtt2name), "file name of L2 mapped token track") |
|
("format,F", po::value<string>(&dataFormat)->default_value("plain"), "data format (plain or conll)") |
|
; |
|
|
|
h.add_options() |
|
("mamname", po::value<string>(&mamname), "name of output file for mam") |
|
; |
|
a.add("mamname",1); |
|
|
|
get_options(ac,av,h.add(o),a,vm,"cfg"); |
|
|
|
skip = vm.count("skip"); |
|
debug = vm.count("debug"); |
|
if (vm.count("help") || mamname.empty()) |
|
{ |
|
cout << "usage:\n" |
|
<< "\t\n" |
|
<< "\t ... | " << av[0] |
|
<< " <.mam file> \n" << endl; |
|
cout << o << endl; |
|
cout << "If an A3 file is given (as produced by (m)giza), symal2mam performs\n" |
|
<< "a sanity check to make sure that sentence lengths match." << endl; |
|
exit(0); |
|
} |
|
conll = dataFormat == "conll"; |
|
if (!conll and dataFormat != "plain") |
|
{ |
|
cerr << "format must be 'conll' or 'plain'" << endl; |
|
exit(1); |
|
} |
|
if (skip && (o1name.empty() || o2name.empty())) |
|
{ |
|
cerr << "--skip requires --o1 and --o2" << endl; |
|
exit(1); |
|
} |
|
} |
|
|
|
/**
 * Copy the raw bytes of one sentence from a token track to a stream.
 *
 * The byte range is delimited by the pointers returned from
 * T.sntStart(sid) and T.sntEnd(sid).
 *
 * @param T    track providing sntStart() / sntEnd()
 * @param sid  index of the sentence to copy
 * @param dest stream that receives the bytes
 */
template<typename track_t>
void
copySentence(track_t const& T, size_t sid, ostream& dest)
{
  char const* const firstByte = reinterpret_cast<char const*>(T.sntStart(sid));
  char const* const pastLast  = reinterpret_cast<char const*>(T.sntEnd(sid));
  dest.write(firstByte, pastLast - firstByte);
}
|
|
|
size_t |
|
procSymalLine(string const& line, ostream& out) |
|
{ |
|
ushort a,b; char dash; |
|
istringstream buf(line); |
|
while (buf>>a>>dash>>b) |
|
{ |
|
if (debug && ((len1 && a >= len1) || (len2 && b >= len2))) |
|
{ |
|
cerr << a << "-" << b << " " << len1 << "/" << len2 << endl; |
|
} |
|
assert(len1 == 0 || a<len1); |
|
assert(len2 == 0 || b<len2); |
|
tpt::binwrite(out,a); |
|
tpt::binwrite(out,b); |
|
} |
|
return out.tellp(); |
|
} |
|
|
|
void finiMAM(ofstream& out, vector<id_type>& idx, id_type numTok) |
|
{ |
|
id_type offset = sizeof(filepos_type)+2*sizeof(id_type); |
|
filepos_type idxStart = out.tellp(); |
|
for (vector<id_type>::iterator i = idx.begin(); i != idx.end(); ++i) |
|
tpt::numwrite(out,*i-offset); |
|
out.seekp(0); |
|
tpt::numwrite(out,idxStart); |
|
tpt::numwrite(out,id_type(idx.size()-1)); |
|
tpt::numwrite(out,numTok); |
|
out.close(); |
|
} |
|
|
|
void |
|
finalize(ofstream& out, vector<id_type> const& idx, id_type tokenCount) |
|
{ |
|
id_type idxSize = idx.size(); |
|
filepos_type idxStart = out.tellp(); |
|
for (size_t i = 0; i < idx.size(); ++i) |
|
tpt::numwrite(out,idx[i]); |
|
out.seekp(0); |
|
tpt::numwrite(out,idxStart); |
|
tpt::numwrite(out,idxSize-1); |
|
tpt::numwrite(out,tokenCount); |
|
out.close(); |
|
} |
|
|
|
bool getCheckValues(istream& in, int& check1, int& check2) |
|
{ |
|
if (A3filename.empty()) return true; |
|
string line; string w; |
|
getline(in,line); |
|
size_t p1 = line.find("source length ") + 14; |
|
if (p1 >= line.size()) return false; |
|
size_t p2 = line.find("target length ",p1); |
|
if (p2 >= line.size()) return false; |
|
|
|
|
|
check1 = atoi(line.substr(p1,p2-p1).c_str()); |
|
p1 = p2+14; |
|
p2 = line.find("alignment ",p1); |
|
if (p2 >= line.size()) return false; |
|
check2 = atoi(line.substr(p1,p2-p1).c_str()); |
|
getline(in,line); |
|
getline(in,line); |
|
return true; |
|
} |
|
|
|
void |
|
go() |
|
{ |
|
size_t ctr=0; |
|
vector<id_type> idxm; |
|
idxm.reserve(10000000); |
|
idxm.push_back(mam.tellp()); |
|
string line; |
|
while(getline(cin,line)) |
|
{ |
|
idxm.push_back(procSymalLine(line,mam)); |
|
if (debug && ++ctr%100000==0) |
|
cerr << ctr/1000 << "K lines processed" << endl; |
|
} |
|
finiMAM(mam,idxm,0); |
|
cout << idxm.size() << endl; |
|
} |
|
|
|
template<typename TKN> |
|
void |
|
go(string t1name, string t2name, string A3filename) |
|
{ |
|
typedef mmTtrack<TKN> track_t; |
|
track_t T1(t1name),T2(t2name); |
|
boost::iostreams::filtering_istream A3file; |
|
open_input_stream(A3filename, A3file); |
|
|
|
string line; int check1=-1,check2=-1; |
|
vector<id_type> idx1(1,0),idx2(1,0),idxm(1, mam.tellp()); |
|
size_t tokenCount1=0,tokenCount2=0; |
|
size_t skipCtr=0,lineCtr=0; |
|
if (!getCheckValues(A3file, check1, check2)) |
|
UTIL_THROW(util::Exception, "Mismatch in input files!"); |
|
|
|
for (sid = 0; sid < T1.size(); ++sid) |
|
{ |
|
len1 = T1.sntLen(sid); |
|
len2 = T2.sntLen(sid); |
|
if (debug) |
|
cerr << "[" << lineCtr << "] " |
|
<< len1 << " (" << check1 << ") / " |
|
<< len2 << " (" << check2 << ")" << endl; |
|
if ((check1 >=0 && check1!=len1) || |
|
(check2 >=0 && check2!=len2)) |
|
{ |
|
if (skip) |
|
{ |
|
cerr << "[" << ++skipCtr << "] skipping " |
|
<< check1 << "/" << check2 << " vs. " |
|
<< len1 << "/" << len2 |
|
<< " at line " << lineCtr << endl; |
|
} |
|
else |
|
{ |
|
idxm.push_back(mam.tellp()); |
|
} |
|
if (len1 > 100 || len2 > 100) |
|
{ |
|
getline(cin,line); |
|
getCheckValues(A3file,check1,check2); |
|
lineCtr++; |
|
} |
|
continue; |
|
} |
|
if (skip) |
|
{ |
|
idx1.push_back(tokenCount1 += len1); |
|
copySentence(T1,sid,t1out); |
|
idx2.push_back(tokenCount2 += len2); |
|
copySentence(T2,sid,t2out); |
|
} |
|
|
|
if (!getline(cin,line)) |
|
UTIL_THROW(util::Exception, "Too few lines in symal input!"); |
|
|
|
lineCtr++; |
|
idxm.push_back(procSymalLine(line,mam)); |
|
if (debug) cerr << "[" << lineCtr << "] " |
|
<< check1 << " (" << len1 <<") " |
|
<< check2 << " (" << len2 <<") " |
|
<< line << endl; |
|
getCheckValues(A3file,check1,check2); |
|
} |
|
if (skip) |
|
{ |
|
finalize(t1out,idx1,tokenCount1); |
|
finalize(t2out,idx2,tokenCount2); |
|
} |
|
finiMAM(mam,idxm,0); |
|
cout << idxm.size() << endl; |
|
} |
|
|
|
void |
|
initialize(ofstream& out, string const& fname) |
|
{ |
|
out.open(fname.c_str()); |
|
tpt::numwrite(out,filepos_type(0)); |
|
tpt::numwrite(out,id_type(0)); |
|
tpt::numwrite(out,id_type(0)); |
|
} |
|
|
|
int main(int argc, char* argv[]) |
|
{ |
|
interpret_args(argc,argv); |
|
if (skip) |
|
{ |
|
initialize(t1out,o1name); |
|
initialize(t2out,o2name); |
|
} |
|
initialize(mam,mamname); |
|
if (A3filename.size() == 0) |
|
go(); |
|
else if (conll) |
|
go<Conll_Record>(mtt1name,mtt2name,A3filename); |
|
else |
|
go<id_type>(mtt1name,mtt2name,A3filename); |
|
} |
|
|