|
|
|
|
|
|
|
|
|
|
|
|
|
#include <boost/algorithm/string/predicate.hpp> |
|
#include <boost/program_options.hpp> |
|
#include <boost/program_options/options_description.hpp> |
|
#include <boost/program_options/parsers.hpp> |
|
#include <boost/program_options/variables_map.hpp> |
|
#include <boost/iostreams/device/mapped_file.hpp> |
|
|
|
#include <iostream> |
|
#include <fstream> |
|
#include <sstream> |
|
#include <iomanip> |
|
#include <vector> |
|
#include <string> |
|
|
|
#include <sys/types.h> |
|
#include <sys/wait.h> |
|
|
|
#include "ug_conll_record.h" |
|
#include "tpt_tokenindex.h" |
|
#include "ug_mm_ttrack.h" |
|
#include "tpt_pickler.h" |
|
#include "ug_deptree.h" |
|
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h" |
|
#include "moses/TranslationModel/UG/mm/ug_im_tsa.h" |
|
|
|
using namespace std; |
|
using namespace sapt; |
|
using namespace Moses; |
|
using namespace boost; |
|
using namespace boost::algorithm; |
|
namespace po=boost::program_options; |
|
|
|
// Command-line switches: build suffix arrays (sfa), prefix arrays (pfa),
// and/or dependency-chain arrays (dca); set in interpret_args().
int with_pfas;
int with_dcas;
int with_sfas;

bool incremental = false; // allow (and record) new vocabulary items
bool is_conll = false;    // input is CoNLL-formatted rather than plain text
bool quiet = false;       // suppress progress reporting on stderr

string vocabBase;         // base name of the input vocabulary files
string baseName;          // base name of the output file(s)
string tmpFile, mttFile;  // scratch track file and its final name

string UNK;               // label used for unknown tokens (default "UNK")

// Vocabularies: surface forms, lemmas, POS tags, dependency relations.
// Only SF is used for plain-text input; all four are used in CoNLL mode.
TokenIndex SF;
TokenIndex LM;
TokenIndex PS;
TokenIndex DT;

void interpret_args(int ac, char* av[]);
|
|
|
// Clamp an id into [0,limit): ids at or beyond limit map to 1 (the UNK id).
inline uchar rangeCheck(int p, int limit) { return (p >= limit) ? 1 : p; }
|
|
|
// Look up word w in vocabulary T.
// Aborts (via assert) if w maps to the UNK id (1) without actually being
// the UNK label, i.e. an out-of-vocabulary word was seen while incremental
// mode (-i) is off.
id_type
get_id(TokenIndex const& T, string const& w)
{
  id_type ret = T[w];
  if (ret == 1 && w != UNK) // id 1 is reserved for the UNK token
    {
      // fixed typo: "Unkown" -> "Unknown"
      cerr << "Warning! Unknown vocabulary item '" << w << "', but "
           << "incremental mode (-i) is not set." << endl;
      assert(0);
    }
  return ret;
}
|
|
|
void |
|
open_vocab(TokenIndex& T, string fname) |
|
{ |
|
if (!access(fname.c_str(), F_OK)) |
|
{ |
|
T.open(fname,UNK); |
|
assert(T[UNK] == 1); |
|
} |
|
else T.setUnkLabel(UNK); |
|
if (incremental) T.setDynamic(true); |
|
assert(T["NULL"] == 0); |
|
assert(T[UNK] == 1); |
|
} |
|
|
|
void
// Initialize a (word, count) vector with one zeroed entry per item in T.
ini_cnt_vec(TokenIndex const& T, vector<pair<string,size_t> > & v)
{
  v.resize(T.totalVocabSize());
  for (size_t i = 0; i < T.totalVocabSize(); ++i)
    {
      v[i].first = T[i];   // surface string of id i
      v[i].second = 0;     // count starts at zero
    }
}
|
|
|
void |
|
write_tokenindex(string fname, TokenIndex& T, vector<id_type> const& n2o) |
|
{ |
|
if (!quiet) cerr << "Writing " << fname << endl; |
|
vector<id_type> o2n(n2o.size()); |
|
for (id_type i = 0; i < n2o.size(); ++i) o2n[n2o[i]] = i; |
|
vector<pair<string,uint32_t> > v(n2o.size()); |
|
for (id_type i = 0; i < n2o.size(); ++i) |
|
{ |
|
v[i].first = T[n2o[i]]; |
|
v[i].second = i; |
|
} |
|
T.close(); |
|
sort(v.begin(),v.end()); |
|
write_tokenindex_to_disk(v, fname, UNK); |
|
} |
|
|
|
void init(int argc, char* argv[]) |
|
{ |
|
interpret_args(argc,argv); |
|
if (is_conll) |
|
{ |
|
open_vocab(SF, vocabBase+".tdx.sfo"); |
|
open_vocab(LM, vocabBase+".tdx.lem"); |
|
open_vocab(PS, vocabBase+".tdx.pos"); |
|
open_vocab(DT, vocabBase+".tdx.drl"); |
|
} |
|
else open_vocab(SF, vocabBase+".tdx"); |
|
} |
|
|
|
// Fill a Conll_Record from one tokenized input line.
//   3 columns : surface, POS, lemma (factored plain text; no dependency info)
//   >=8 cols  : CoNLL layout -- id, surface, lemma, cpos, pos, _, governor, deprel
// Lines with other column counts are left untouched by the caller's guard.
void fill_rec(Conll_Record& rec, vector<string> const& w)
{
  if (w.size() == 3)
    {
      // look up the POS tag once; it fills both majpos and minpos
      // (the original performed the same get_id(PS,...) lookup twice)
      id_type const pos = get_id(PS, w[1]);
      rec.sform  = get_id(SF, w[0]);
      rec.lemma  = get_id(LM, w[2] == "<UNKNOWN>" ? w[0] : w[2]);
      rec.majpos = rangeCheck(pos, 256);
      rec.minpos = rangeCheck(pos, 256);
      rec.dtype  = 0;
      rec.parent = -1;
    }
  else if (w.size() >= 8)
    {
      int id  = atoi(w[0].c_str());
      int gov = atoi(w[6].c_str());
      rec.sform  = get_id(SF, w[1]);
      rec.lemma  = get_id(LM, w[2]);
      rec.majpos = rangeCheck(get_id(PS, w[3]), 256);
      rec.minpos = rangeCheck(get_id(PS, w[4]), 256);
      rec.dtype  = get_id(DT, w[7]);
      rec.parent = gov ? gov - id : 0; // relative offset to governor; 0 = root
    }
}
|
|
|
// Progress output on stderr: a dot every 10K sentences, and a fresh line
// with the running count every 100K sentences.
void log_progress(size_t ctr)
{
  bool const at_100k = (ctr % 100000 == 0);
  bool const at_10k  = (ctr % 10000 == 0);
  if (at_100k)
    {
      if (ctr != 0) std::cerr << std::endl; // finish the previous dot line
      std::cerr << std::setw(12) << ctr / 1000 << "K sentences processed ";
    }
  else if (at_10k)
    {
      std::cerr << ".";
    }
}
|
|
|
|
|
size_t |
|
process_plain_input(ostream& out, vector<id_type> & s_index) |
|
{ |
|
id_type totalWords = 0; |
|
string line,w; |
|
while (getline(cin,line)) |
|
{ |
|
istringstream buf(line); |
|
if (!quiet) log_progress(s_index.size()); |
|
s_index.push_back(totalWords); |
|
while (buf>>w) |
|
{ |
|
tpt::numwrite(out,get_id(SF,w)); |
|
++totalWords; |
|
} |
|
} |
|
s_index.push_back(totalWords); |
|
return totalWords; |
|
} |
|
|
|
// Read CoNLL-style tagged input from stdin: one token per line; blank
// lines or "SID=..." header lines separate sentences; a lone "<P>" token
// separates paragraphs. Each token line is packed into a Conll_Record and
// written to out. Sentence / paragraph start offsets (in words) are
// appended to s_index / p_index; a final sentinel holding the corpus size
// is appended to s_index. Returns the total word count.
size_t
process_tagged_input(ostream& out,
                     vector<id_type> & s_index,
                     vector<id_type> & p_index)
{
  string line;
  Conll_Record rec;
  bool new_sent = true;   // next token line starts a new sentence
  bool new_par = true;    // next token line starts a new paragraph
  id_type totalWords = 0;

  while (getline(cin,line))
    {
      vector<string> w; string f; istringstream buf(line);
      while (buf>>f) w.push_back(f);

      // separator lines only raise the flags; the offsets are recorded
      // when the next real token line arrives
      if (w.size() == 0 || starts_with(w[0], "SID="))
        new_sent = true;

      else if (w.size() == 1 && w[0] == "<P>")
        new_par = new_sent = true;

      // skip anything that is not a token line (incl. the separators above)
      if (w.size() < 3) continue;
      if (!quiet && new_sent) log_progress(s_index.size());
      if (new_sent) { s_index.push_back(totalWords); new_sent = false; }
      if (new_par) { p_index.push_back(totalWords); new_par = false; }
      fill_rec(rec,w);
      // records are written as raw fixed-size binary
      out.write(reinterpret_cast<char const*>(&rec),sizeof(rec));
      ++totalWords;
    }
  s_index.push_back(totalWords); // sentinel: total corpus size
  return totalWords;
}
|
|
|
// Convert the corpus on stdin into the numeric track file tmpFile.
// File layout: [index offset][index size][word count][token data][index].
// The three header fields are first written as placeholders, then the
// stream is rewound (seekp(0)) and they are rewritten with real values
// once the data and index have been emitted.
size_t
numberize()
{
  ofstream out(tmpFile.c_str());
  filepos_type startIdx=0;
  id_type idxSize=0,totalWords=0;
  // placeholder header; overwritten below
  tpt::numwrite(out,startIdx);
  tpt::numwrite(out,idxSize);
  tpt::numwrite(out,totalWords);

  vector<id_type> s_index, p_index;

  if(is_conll)
    totalWords = process_tagged_input(out,s_index,p_index);
  else
    totalWords = process_plain_input(out,s_index);

  // use the paragraph index as chunking if paragraphs were actually seen
  // (p_index.back() != 0 means at least one "<P>" occurred after word 0)
  vector<id_type> const* index = &s_index;
  if (p_index.size() && p_index.back())
    {
      p_index.push_back(totalWords); // sentinel, mirroring s_index
      index = &p_index;
    }

  if (!quiet)
    cerr << endl << "Writing index ... (" << index->size() << " chunks) ";

  startIdx = out.tellp(); // index starts right after the token data
  for (size_t i = 0; i < index->size(); i++)
    tpt::numwrite(out,(*index)[i]);
  // rewind and fill in the real header; idxSize-1 because the index
  // contains one sentinel entry beyond the chunk starts
  out.seekp(0);
  idxSize = index->size();
  tpt::numwrite(out, startIdx);
  tpt::numwrite(out, idxSize - 1);
  tpt::numwrite(out, totalWords);
  out.close();
  if (!quiet) cerr << "done" << endl;
  return totalWords;
}
|
|
|
// new-id -> old-id permutations produced by remap(), one per vocabulary
// (surface, lemma, POS, dependency relation)
vector<id_type> smap,lmap,pmap,dmap;
|
|
|
void |
|
invert(vector<id_type> const& from, vector<id_type> & to) |
|
{ |
|
to.resize(from.size()); |
|
for (size_t i = 0 ; i < to.size(); ++i) |
|
to[from[i]] = i; |
|
} |
|
|
|
|
|
|
|
void |
|
conservative_sort(TokenIndex const & V, |
|
vector<size_t> const & cnt, |
|
vector<id_type> & xmap) |
|
{ |
|
xmap.resize(V.totalVocabSize()); |
|
for (size_t i = 0; i < xmap.size(); ++i) xmap[i] = i; |
|
VectorIndexSorter<size_t,greater<size_t>, id_type> sorter(cnt); |
|
sort(xmap.begin()+max(id_type(2),V.knownVocabSize()), xmap.end(), sorter); |
|
} |
|
|
|
|
|
|
|
// Reassign token ids by frequency (most frequent new ids first, without
// disturbing ids already fixed on disk -- see conservative_sort) and
// rewrite the numeric track in tmpFile IN PLACE through a writable
// memory map. Fills the global new->old maps (smap/lmap/pmap/dmap).
void remap()
{
  if (!quiet) cerr << "Remapping ids ... ";
  filepos_type idxOffset;
  id_type totalWords, idxSize;
  boost::iostreams::mapped_file mtt(tmpFile); // opened read/write
  char const* p = mtt.data();
  // read the header written by numberize()
  p = tpt::numread(p,idxOffset);
  p = tpt::numread(p,idxSize);
  p = tpt::numread(p,totalWords);
  if (is_conll)
    {
      // count occurrences of each id per factor
      vector<size_t> sf(SF.totalVocabSize(), 0);
      vector<size_t> lm(LM.totalVocabSize(), 0);
      vector<size_t> ps(PS.totalVocabSize(), 0);
      vector<size_t> dt(DT.totalVocabSize(), 0);
      // the data region is treated as an array of fixed-size records;
      // const_cast is needed because mapped_file::data() is const here,
      // but the map itself is writable
      Conll_Record* w = reinterpret_cast<Conll_Record*>(const_cast<char*>(p));
      for (size_t i = 0; i < totalWords; ++i)
        {
          ++sf.at(w[i].sform);
          ++lm.at(w[i].lemma);
          ++ps.at(w[i].majpos);
          ++ps.at(w[i].minpos); // maj and min share the PS vocabulary
          ++dt.at(w[i].dtype);
        }
      // derive new->old maps, then invert to old->new for rewriting
      conservative_sort(SF,sf,smap);
      conservative_sort(LM,lm,lmap);
      conservative_sort(PS,ps,pmap);
      conservative_sort(DT,dt,dmap);
      vector<id_type> smap_i(smap.size()); invert(smap,smap_i);
      vector<id_type> lmap_i(lmap.size()); invert(lmap,lmap_i);
      vector<id_type> pmap_i(pmap.size()); invert(pmap,pmap_i);
      vector<id_type> dmap_i(dmap.size()); invert(dmap,dmap_i);
      // rewrite every record with the new ids
      for (size_t i = 0; i < totalWords; ++i)
        {
          w[i].sform = smap_i[w[i].sform];
          w[i].lemma = lmap_i[w[i].lemma];
          w[i].majpos = pmap_i[w[i].majpos];
          w[i].minpos = pmap_i[w[i].minpos];
          w[i].dtype = dmap_i[w[i].dtype];
        }
    }
  else
    {
      // plain text: a single id stream, single vocabulary
      vector<size_t> sf(SF.totalVocabSize(), 0);
      id_type* w = reinterpret_cast<id_type*>(const_cast<char*>(p));
      for (size_t i = 0; i < totalWords; ++i) ++sf.at(w[i]);
      conservative_sort(SF,sf,smap);
      vector<id_type> smap_i(smap.size()); invert(smap,smap_i);
      for (size_t i = 0; i < totalWords; ++i) w[i] = smap_i[w[i]];
    }
  mtt.close(); // flush the remapped data back to disk
  if (!quiet) cerr << "done." << endl;
}
|
|
|
void save_vocabs() |
|
{ |
|
string vbase = baseName; |
|
if (is_conll) |
|
{ |
|
if (SF.totalVocabSize() > SF.knownVocabSize()) |
|
write_tokenindex(vbase+".tdx.sfo",SF,smap); |
|
if (LM.totalVocabSize() > LM.knownVocabSize()) |
|
write_tokenindex(vbase+".tdx.lem",LM,lmap); |
|
if (PS.totalVocabSize() > PS.knownVocabSize()) |
|
write_tokenindex(vbase+".tdx.pos",PS,pmap); |
|
if (DT.totalVocabSize() > DT.knownVocabSize()) |
|
write_tokenindex(vbase+".tdx.drl",DT,dmap); |
|
} |
|
else if (SF.totalVocabSize() > SF.knownVocabSize()) |
|
write_tokenindex(vbase+".tdx",SF,smap); |
|
} |
|
|
|
template<typename Token> |
|
void |
|
build_mmTSA(string infile, string outfile) |
|
{ |
|
|
|
|
|
boost::shared_ptr<mmTtrack<Token> > T(new mmTtrack<Token>(infile)); |
|
bdBitset filter; |
|
filter.resize(T->size(),true); |
|
imTSA<Token> S(T,&filter,(quiet?NULL:&cerr)); |
|
S.save_as_mm_tsa(outfile); |
|
|
|
} |
|
|
|
bool |
|
build_plaintext_tsas() |
|
{ |
|
typedef L2R_Token<SimpleWordId> L2R; |
|
typedef R2L_Token<SimpleWordId> R2L; |
|
|
|
if (with_sfas) build_mmTSA<L2R>(tmpFile, baseName + ".sfa"); |
|
if (with_pfas) build_mmTSA<R2L>(tmpFile, baseName + ".pfa"); |
|
|
|
return true; |
|
} |
|
|
|
void build_conll_tsas() |
|
{ |
|
string bn = baseName; |
|
string mtt = tmpFile; |
|
size_t c = 3 * (with_sfas + with_pfas + with_dcas); |
|
if (with_sfas) |
|
{ |
|
build_mmTSA<L2R_Token<Conll_Sform> >(mtt,bn+".sfa-sform"); |
|
build_mmTSA<L2R_Token<Conll_Lemma> >(mtt,bn+".sfa-lemma"); |
|
build_mmTSA<L2R_Token<Conll_MinPos> >(mtt,bn+".sfa-minpos"); |
|
} |
|
|
|
if (with_pfas) |
|
{ |
|
build_mmTSA<R2L_Token<Conll_Sform> >(mtt,bn+".pfa-sform"); |
|
build_mmTSA<R2L_Token<Conll_Lemma> >(mtt,bn+".pfa-lemma"); |
|
build_mmTSA<R2L_Token<Conll_MinPos> >(mtt,bn+".pfa-minpos"); |
|
} |
|
|
|
if (with_dcas) |
|
{ |
|
build_mmTSA<ConllBottomUpToken<Conll_Sform> >(mtt,bn+".dca-sform"); |
|
build_mmTSA<ConllBottomUpToken<Conll_Lemma> >(mtt,bn+".dca-lemma"); |
|
build_mmTSA<ConllBottomUpToken<Conll_MinPos> >(mtt,bn+".dca-minpos"); |
|
} |
|
|
|
} |
|
|
|
|
|
int main(int argc, char* argv[]) |
|
{ |
|
init(argc,argv); |
|
numberize(); |
|
if (SF.totalVocabSize() > SF.knownVocabSize() || |
|
LM.totalVocabSize() > LM.knownVocabSize() || |
|
PS.totalVocabSize() > PS.knownVocabSize() || |
|
DT.totalVocabSize() > DT.knownVocabSize()) |
|
{ |
|
remap(); |
|
save_vocabs(); |
|
} |
|
if (is_conll) build_conll_tsas(); |
|
else build_plaintext_tsas(); |
|
if (!quiet) cerr << endl; |
|
rename(tmpFile.c_str(),mttFile.c_str()); |
|
} |
|
|
|
// Parse command-line options into the global configuration variables.
// Prints usage and exits(0) when --help is given or no output base name
// is supplied. Also derives mttFile / tmpFile from the output name.
void
interpret_args(int ac, char* av[])
{
  po::variables_map vm;
  po::options_description o("Options");
  o.add_options()

    ("help,h", "print this message")

    ("quiet,q", po::bool_switch(&quiet),
     "don't print progress information")

    ("incremental,i", po::bool_switch(&incremental),
     "incremental mode; rewrites vocab files!")

    ("vocab-base,v", po::value<string>(&vocabBase),
     "base name of various vocabularies")

    ("output,o", po::value<string>(&baseName),
     "base file name of the resulting file(s)")

    ("sfa,s", po::value<int>(&with_sfas)->default_value(1),
     "also build suffix arrays")

    ("pfa,p", po::value<int>(&with_pfas)
     ->default_value(0)->implicit_value(1),
     "also build prefix arrays")

    ("dca,d", po::value<int>(&with_dcas)
     ->default_value(0)->implicit_value(1),
     "also build dependency chain arrays")

    ("conll,c", po::bool_switch(&is_conll),
     "corpus is in CoNLL format (default: plain text)")

    ("unk,u", po::value<string>(&UNK)->default_value("UNK"),
     "label for unknown tokens")

    ;

  // hidden options (currently none); o is folded into h so the parser
  // accepts all options while --help prints only the visible set o
  po::options_description h("Hidden Options");
  h.add_options()
    ;
  h.add(o);
  po::positional_options_description a;
  a.add("output",1); // first positional argument is the output base name

  po::store(po::command_line_parser(ac,av)
            .options(h)
            .positional(a)
            .run(),vm);
  po::notify(vm);
  if (vm.count("help") || !vm.count("output"))
    {
      cout << "\nusage:\n\t cat <corpus> | " << av[0]
           << " [options] <output .mtt file>" << endl;
      cout << o << endl;
      exit(0);
    }
  // derived file names: .mtt for CoNLL tracks, .mct for plain text; the
  // track is written to "<name>_" first and renamed into place when done
  mttFile = baseName + (is_conll ? ".mtt" : ".mct");
  tmpFile = mttFile + "_";
}
|
|