|
|
|
#include <iostream> |
|
#include <fstream> |
|
#include <cassert> |
|
#include <vector> |
|
#include <string> |
|
#include "moses/Util.h" |
|
#include "Alignments.h" |
|
|
|
using namespace std; |
|
using namespace Moses; |
|
|
|
/**
 * Return a copy of `str` with every leading and trailing character that
 * occurs in `dropChars` removed.
 *
 * An empty string, or a string consisting only of characters from
 * `dropChars`, yields an empty result. Characters in the interior of the
 * string are never touched.
 */
inline const std::string TrimInternal(const std::string& str, const std::string dropChars = " \t\n\r")
{
  const std::string::size_type lastKept = str.find_last_not_of(dropChars);
  if (lastKept == std::string::npos) {
    // Nothing survives the trim.
    return std::string();
  }

  // A kept character exists, so the forward search is guaranteed to succeed.
  const std::string::size_type firstKept = str.find_first_not_of(dropChars);
  return str.substr(firstKept, lastKept - firstKept + 1);
}
|
|
|
/**
 * Bundle of the strings produced by createXML() for one record.
 */
class CreateXMLRetValues
{
public:
  // Input words interleaved with <xml translation="..."> x </xml> spans.
  std::string frame;
  // Rule side built from the input tokens, with "[X][X]" placeholders.
  std::string ruleS;
  // Rule side built from the target tokens, with "[X][X]" placeholders.
  std::string ruleT;
  // Space-separated "s-t" position pairs linking ruleS to ruleT.
  std::string ruleAlignment;
  // The pairs of ruleAlignment with both sides swapped ("t-s").
  std::string ruleAlignmentInv;
};
|
|
|
CreateXMLRetValues createXML(int ruleCount, const string &source, const string &input, const string &target, const string &align, const string &path ); |
|
|
|
void create_xml(const string &inPath) |
|
{ |
|
ifstream inStrme(inPath.c_str()); |
|
ofstream rule((inPath + ".extract").c_str()); |
|
ofstream ruleInv((inPath + ".extract.inv").c_str()); |
|
|
|
|
|
|
|
string source, target, align, path; |
|
string *input = NULL; |
|
int count; |
|
|
|
int lineCount = 1; |
|
int ruleCount = 1; |
|
string inLine; |
|
|
|
int step = 0; |
|
while (!inStrme.eof()) { |
|
getline(inStrme, inLine); |
|
|
|
switch (step) { |
|
case 0: |
|
|
|
Scan<int>(inLine); |
|
++step; |
|
break; |
|
case 1: |
|
|
|
Scan<float>(inLine); |
|
++step; |
|
break; |
|
case 2: |
|
source = inLine; |
|
++step; |
|
break; |
|
case 3: |
|
if (input == NULL) { |
|
input = new string(inLine); |
|
} else { |
|
assert(inLine == *input); |
|
} |
|
++step; |
|
break; |
|
case 4: |
|
target = inLine; |
|
++step; |
|
break; |
|
case 5: |
|
align = inLine; |
|
++step; |
|
break; |
|
case 6: |
|
path = inLine + "X"; |
|
++step; |
|
break; |
|
case 7: |
|
count = Scan<int>(inLine); |
|
CreateXMLRetValues ret = createXML(ruleCount, source, *input, target, align, path); |
|
|
|
|
|
rule << ret.ruleS << " [X] ||| " << ret.ruleT << " [X] ||| " << ret.ruleAlignment |
|
<< " ||| " << count << endl; |
|
ruleInv << ret.ruleT << " [X] ||| " << ret.ruleS << " [X] ||| " << ret.ruleAlignmentInv |
|
<< " ||| " << count << endl; |
|
|
|
|
|
++ruleCount; |
|
step = 0; |
|
break; |
|
} |
|
|
|
++lineCount; |
|
} |
|
|
|
delete input; |
|
ruleInv.close(); |
|
rule.close(); |
|
inStrme.close(); |
|
|
|
} |
|
|
|
|
|
/**
 * Build one rule pair, its word alignment and its XML frame from a single
 * record.
 *
 * @param ruleCount  Not used by this function.
 * @param source     Space-separated source tokens.
 * @param input      Space-separated input tokens.
 * @param target     Space-separated target tokens.
 * @param align      Alignment string handed to the Alignments constructor
 *                   together with the source and target token counts.
 * @param path       One action character per step. 'M' and 'X' are treated
 *                   as "matching"; 'I' does not advance the source position;
 *                   'D' does not advance the input position; 'I' and 'S'
 *                   mark the input token as not part of the rule.
 *                   (Presumably match / end / insert / delete / substitute -
 *                   confirm against the producer of the path.)
 *                   The caller is expected to terminate the path with 'X'.
 *
 * @return ruleS / ruleT with "[X][X]" standing in for each mismatched span,
 *         ruleAlignment as space-separated "s-t" rule positions,
 *         ruleAlignmentInv as the same pairs swapped, and frame as the input
 *         insertions interleaved with <xml translation="..."> spans.
 *
 * Side effect: the frame is also written to stderr.
 */
CreateXMLRetValues createXML(int ruleCount, const std::string &source, const std::string &input, const std::string &target, const std::string &align, const std::string &path)
{
  // Only the keys (target positions) of an alignment map are read here.
  typedef std::map<int, int> AlignMap;
  // Keys used: "start_t", "start_i", "rule_pos_s", "rule_pos_t".
  typedef std::map<std::string, int> NonTerm;

  (void) ruleCount;  // kept for interface compatibility

  CreateXMLRetValues ret;
  const std::vector<std::string> sourceToks = Tokenize(source, " ");
  const std::vector<std::string> inputToks = Tokenize(input, " ");
  const std::vector<std::string> targetsToks = Tokenize(target, " ");
  Alignments alignments(align, sourceToks.size(), targetsToks.size());

  std::map<int, std::string> frameInput;   // target position -> inserted input words
  std::map<int, int> alignI2S;             // input position -> source position
  std::vector<NonTerm> nonTerms;
  std::vector<bool> targetBitmap(targetsToks.size(), true);  // target word kept in rule?
  std::vector<bool> inputBitmap;                             // input word kept in rule?

  // ---- 1. Walk the path, collecting mismatched spans --------------------
  int s = 0;         // current source position
  int i = 0;         // current input position
  int start_s = 0;   // source position where the current mismatch began
  int start_i = 0;   // input position where the current mismatch began
  bool currently_matching = false;
  const int kNoTarget = 1000;  // sentinel: no aligned target position found

  for (std::string::size_type p = 0; p < path.length(); ++p) {
    const char action = path[p];
    const bool isMatch = (action == 'M' || action == 'X');

    if (currently_matching && !isMatch) {
      // A mismatch starts here.
      start_i = i;
      start_s = s;
      currently_matching = false;
    } else if (!currently_matching && isMatch) {
      // A mismatch ends here: drop every target word aligned to its source span.
      for (int ss = start_s; ss < s; ++ss) {
        const AlignMap &targets = alignments.m_alignS2T[ss];
        for (AlignMap::const_iterator it = targets.begin(); it != targets.end(); ++it) {
          targetBitmap[it->first] = false;
        }
      }

      // Were input words consumed during the mismatch?
      if (start_i < i) {
        std::string insertion;
        for (int ii = start_i; ii < i; ++ii) {
          insertion += inputToks[ii] + " ";
        }

        // Smallest target position aligned to the mismatched source span.
        int start_t = kNoTarget;
        for (int ss = start_s; ss < s; ++ss) {
          const AlignMap &targets = alignments.m_alignS2T[ss];
          for (AlignMap::const_iterator it = targets.begin(); it != targets.end(); ++it) {
            if (it->first < start_t) {
              start_t = it->first;
            }
          }
        }

        // Nothing aligned and the input is exhausted: attach after the last target word.
        if (start_t == kNoTarget && i > int(inputToks.size()) - 1) {
          start_t = targetsToks.size() - 1;
        }

        // Still nothing: scan the source backwards from s-1 and take the
        // largest target position of the first source word that has any
        // (-1 if none does, i.e. before the first target word).
        if (start_t == kNoTarget) {
          start_t = -1;
          for (int ss = s - 1; start_t == -1 && ss >= 0; --ss) {
            const AlignMap &targets = alignments.m_alignS2T[ss];
            for (AlignMap::const_iterator it = targets.begin(); it != targets.end(); ++it) {
              if (it->first > start_t) {
                start_t = it->first;
              }
            }
          }
        }

        frameInput[start_t] += insertion;
        NonTerm nt;
        nt["start_t"] = start_t;
        nt["start_i"] = start_i;
        nonTerms.push_back(nt);
      }

      currently_matching = true;
    }

    if (action != 'I') {
      ++s;
    }
    if (action != 'D') {
      ++i;
      alignI2S[i] = s;
    }

    if (action == 'M') {
      inputBitmap.push_back(true);
    } else if (action == 'I' || action == 'S') {
      inputBitmap.push_back(false);
    }
  }

  // ---- 2. Input side of the rule -----------------------------------------
  int rule_pos_s = 0;
  std::map<int, int> ruleAlignS;  // source position -> position in ruleS

  for (int inPos = 0; inPos < int(inputBitmap.size()); ++inPos) {
    if (inputBitmap[inPos]) {
      ret.ruleS += inputToks[inPos] + " ";
      ruleAlignS[ alignI2S[inPos] ] = rule_pos_s++;
    }

    for (size_t j = 0; j < nonTerms.size(); ++j) {
      NonTerm &nt = nonTerms[j];
      if (inPos == nt["start_i"]) {
        ret.ruleS += "[X][X] ";
        nt["rule_pos_s"] = rule_pos_s++;
      }
    }
  }

  // ---- 3. Target side of the rule ----------------------------------------
  int rule_pos_t = 0;
  std::map<int, int> ruleAlignT;  // target position -> position in ruleT

  // Starts at -1 so a non-terminal can be placed before the first target word.
  for (int t = -1; t < (int) targetBitmap.size(); ++t) {
    if (t >= 0 && targetBitmap[t]) {
      ret.ruleT += targetsToks[t] + " ";
      ruleAlignT[t] = rule_pos_t++;
    }

    for (size_t j = 0; j < nonTerms.size(); ++j) {
      NonTerm &nt = nonTerms[j];
      if (t == nt["start_t"]) {
        ret.ruleT += "[X][X] ";
        nt["rule_pos_t"] = rule_pos_t++;
      }
    }
  }

  // ---- 4. Rule-internal alignment ----------------------------------------
  ret.ruleAlignment = "";

  for (std::map<int, int>::const_iterator srcIt = ruleAlignS.begin(); srcIt != ruleAlignS.end(); ++srcIt) {
    const int srcPos = srcIt->first;
    if (srcPos < int(alignments.m_alignS2T.size())) {
      const AlignMap &targets = alignments.m_alignS2T[srcPos];
      for (AlignMap::const_iterator tgtIt = targets.begin(); tgtIt != targets.end(); ++tgtIt) {
        const std::map<int, int>::const_iterator found = ruleAlignT.find(tgtIt->first);
        if (found == ruleAlignT.end())
          continue;  // aligned target word is not part of the rule
        ret.ruleAlignment += SPrint(srcIt->second) + "-" + SPrint(found->second) + " ";
      }
    }
  }

  // Each non-terminal is aligned to its counterpart on the other side.
  for (size_t j = 0; j < nonTerms.size(); ++j) {
    NonTerm &nt = nonTerms[j];
    ret.ruleAlignment += SPrint(nt["rule_pos_s"]) + "-" + SPrint(nt["rule_pos_t"]) + " ";
  }

  ret.ruleS = TrimInternal(ret.ruleS);
  ret.ruleT = TrimInternal(ret.ruleT);
  ret.ruleAlignment = TrimInternal(ret.ruleAlignment);

  // Inverse alignment: swap the two sides of every point.
  const std::vector<std::string> ruleAlignmentToks = Tokenize(ret.ruleAlignment);
  for (size_t j = 0; j < ruleAlignmentToks.size(); ++j) {
    const std::vector<std::string> toks = Tokenize(ruleAlignmentToks[j], "-");
    assert(toks.size() == 2);
    // The trailing space separates consecutive points; without it they run
    // together (e.g. "1-00-1"). The trim below removes the final one.
    ret.ruleAlignmentInv += toks[1] + "-" + toks[0] + " ";
  }
  ret.ruleAlignmentInv = TrimInternal(ret.ruleAlignmentInv);

  // ---- 5. XML frame ------------------------------------------------------
  // Input words attached before the first target word open the frame.
  const std::map<int, std::string>::const_iterator leading = frameInput.find(-1);
  if (leading != frameInput.end())
    ret.frame = leading->second;

  bool currently_included = false;
  int start_t = -1;
  targetBitmap.push_back(false);  // sentinel so a run reaching the end is closed

  for (size_t t = 0; t <= targetsToks.size(); ++t) {
    const std::map<int, std::string>::const_iterator inserted = frameInput.find(static_cast<int>(t));
    const bool hasInsertion = (inserted != frameInput.end());

    if (!currently_included && targetBitmap[t]) {
      // A run of kept target words begins.
      start_t = static_cast<int>(t);
      currently_included = true;
    } else if (currently_included && (!targetBitmap[t] || hasInsertion)) {
      // The run ends at a dropped word (or the sentinel) or at an insertion
      // point. NOTE(review): the previous code tested targetBitmap[t] without
      // the negation, which made the sentinel above ineffective.
      if (start_t >= 0) {
        // When the run is cut by an insertion, word t itself is still kept.
        const size_t spanEnd = t + (targetBitmap[t] ? 1 : 0);
        std::string span;
        for (size_t tt = start_t; tt < spanEnd; ++tt) {
          span += targetsToks[tt] + " ";
        }
        ret.frame += "<xml translation=\"" + span + "\"> x </xml> ";
      }
      currently_included = false;
    }

    if (hasInsertion)
      ret.frame += inserted->second;
  }

  std::cerr << ret.frame << "\n-------------------------------------\n";
  return ret;
}
|
|
|
|
|
|
|
|