// NMTKD/translation/tools/mosesdecoder/phrase-extract/extract-mixed-syntax/AlignedSentenceSyntax.cpp
/*
 * AlignedSentenceSyntax.cpp
 *
 *  Created on: 26 Feb 2014
 *      Author: hieu
 */
// Kept at file scope because the definitions below use unqualified `string`.
using namespace std;
/**
 * Build a sentence pair from its three raw input lines.
 *
 * The constructor only forwards lineNum to the AlignedSentence base and
 * initialises m_sourceStr, m_targetStr and m_alignmentStr from the arguments.
 * No parsing happens here; Create() consumes those members later.
 *
 * @param lineNum   line number passed on to the AlignedSentence base
 * @param source    raw source-side line
 * @param target    raw target-side line
 * @param alignment raw alignment line
 */
AlignedSentenceSyntax::AlignedSentenceSyntax(int lineNum,
    const std::string &source,
    const std::string &target,
    const std::string &alignment)
  : AlignedSentence(lineNum),
    m_sourceStr(source),
    m_targetStr(target),
    m_alignmentStr(alignment)
{
}
/** Destructor. The body is intentionally empty: it performs no explicit cleanup. */
AlignedSentenceSyntax::~AlignedSentenceSyntax()
{
}
void AlignedSentenceSyntax::Populate(bool isSyntax, int mixedSyntaxType, const Parameter ¶ms, | |
string line, Phrase &phrase, SyntaxTree &tree) | |
{ | |
// parse source and target string | |
if (isSyntax) { | |
line = "<xml><tree label=\"X\">" + line + "</tree></xml>"; | |
XMLParse(phrase, tree, line, params); | |
if (mixedSyntaxType != 0) { | |
// mixed syntax. Always add [X] where there isn't 1 | |
tree.SetHieroLabel(params.hieroNonTerm); | |
if (mixedSyntaxType == 2) { | |
tree.AddToAll(params.hieroNonTerm); | |
} | |
} | |
} else { | |
PopulateWordVec(phrase, line); | |
tree.SetHieroLabel(params.hieroNonTerm); | |
} | |
} | |
void AlignedSentenceSyntax::Create(const Parameter ¶ms) | |
{ | |
Populate(params.sourceSyntax, params.mixedSyntaxType, params, m_sourceStr, | |
m_source, m_sourceTree); | |
Populate(params.targetSyntax, params.mixedSyntaxType, params, m_targetStr, | |
m_target, m_targetTree); | |
PopulateAlignment(m_alignmentStr); | |
CreateConsistentPhrases(params); | |
// create labels | |
CreateNonTerms(); | |
} | |
/**
 * Replace the characters that are special to Moses/XML with their entity
 * form, in place.
 *
 *   &  -> &amp;     |  -> &#124;    <  -> &lt;     >  -> &gt;
 *   '  -> &apos;    "  -> &quot;    [  -> &#91;    ]  -> &#93;
 *
 * The input is scanned once, so an '&' produced by one replacement is never
 * escaped a second time. The result is the same as replacing '&' first and
 * the remaining characters afterwards.
 *
 * @param text string to escape; overwritten with the escaped result
 */
void Escape(std::string &text)
{
  std::string escaped;
  escaped.reserve(text.size());

  for (std::string::size_type i = 0; i < text.size(); ++i) {
    switch (text[i]) {
    case '&':
      escaped += "&amp;";
      break;
    case '|':
      escaped += "&#124;";
      break;
    case '<':
      escaped += "&lt;";
      break;
    case '>':
      escaped += "&gt;";
      break;
    case '\'':
      escaped += "&apos;";
      break;
    case '"':
      escaped += "&quot;";
      break;
    case '[':
      escaped += "&#91;";
      break;
    case ']':
      escaped += "&#93;";
      break;
    default:
      escaped += text[i];
      break;
    }
  }

  text.swap(escaped);
}
/**
 * Walk the children of an XML node, appending their words to `output` and
 * recording labelled spans in `tree`.
 *
 * For each child, in document order:
 *  - a child with a non-empty name is treated as a span: its "label"
 *    attribute is read and the function recurses into it first, so all words
 *    below it are appended before the span is closed;
 *  - the child's own value() is then escaped, tokenized and appended as words
 *    (in pugixml, text lives in unnamed PCDATA nodes, so this is where the
 *    actual tokens come from);
 *  - if a label was found, the span [startPos, endPos] is added to the tree
 *    as "[label]".
 *
 * Word objects are allocated with `new` and pushed into `output`; they are
 * not freed here. Presumably Phrase owns them (TODO confirm).
 *
 * @param output     phrase that receives the words; positions continue from
 *                   its current size
 * @param tree       syntax tree that receives the labelled spans
 * @param parentNode node whose children are processed
 * @param params     extraction parameters, forwarded to SyntaxTree::Add()
 */
void AlignedSentenceSyntax::XMLParse(Phrase &output,
                                     SyntaxTree &tree,
                                     const pugi::xml_node &parentNode,
                                     const Parameter &params)
{
  for (pugi::xml_node childNode = parentNode.first_child(); childNode; childNode = childNode.next_sibling()) {
    const std::string nodeName = childNode.name();

    // span label
    std::string label;
    // Captured before recursing so the span covers every word added below.
    const int startPos = static_cast<int>(output.size());

    if (!nodeName.empty()) {
      pugi::xml_attribute attribute = childNode.attribute("label");
      label = attribute.as_string();

      // recursively call this function. For proper recursive trees
      XMLParse(output, tree, childNode, params);
    }

    // fill phrase vector
    std::string text = childNode.value();
    Escape(text);

    std::vector<std::string> toks;
    Moses::Tokenize(toks, text);

    for (size_t i = 0; i < toks.size(); ++i) {
      const std::string &tok = toks[i];
      Word *word = new Word(output.size(), tok);
      output.push_back(word);
    }

    // Convert before subtracting: with an empty phrase this gives -1 rather
    // than relying on unsigned wrap-around. An empty labelled span therefore
    // ends at startPos - 1, as before.
    const int endPos = static_cast<int>(output.size()) - 1;

    // fill syntax labels
    if (!label.empty()) {
      label = "[" + label + "]";
      tree.Add(startPos, endPos, label, params);
    }
  }
}
/**
 * Parse an XML-annotated sentence string and populate `output` and `tree`
 * from it.
 *
 * The string is loaded into a pugixml document and the children of its
 * top-level <xml> element are handed to the node-based XMLParse() overload.
 *
 * @param output phrase that receives the words
 * @param tree   syntax tree that receives the labelled spans
 * @param input  XML text; must contain an <xml> root for anything to be read
 * @param params extraction parameters, forwarded to the node-based overload
 */
void AlignedSentenceSyntax::XMLParse(Phrase &output,
                                     SyntaxTree &tree,
                                     const std::string input,
                                     const Parameter &params)
{
  pugi::xml_document doc;
  // NOTE(review): the parse result is not checked. On malformed input the
  // code carries on with whatever tree pugixml managed to build; confirm
  // whether a warning or error is wanted here.
  pugi::xml_parse_result result = doc.load(input.c_str(),
                                  pugi::parse_default | pugi::parse_comments);

  // If there is no <xml> child, topNode is a null handle and the walk below
  // visits nothing.
  pugi::xml_node topNode = doc.child("xml");
  XMLParse(output, tree, topNode, params);
}
void AlignedSentenceSyntax::CreateNonTerms() | |
{ | |
for (int sourceStart = 0; sourceStart < m_source.size(); ++sourceStart) { | |
for (int sourceEnd = sourceStart; sourceEnd < m_source.size(); ++sourceEnd) { | |
ConsistentPhrases::Coll &coll = m_consistentPhrases.GetColl(sourceStart, sourceEnd); | |
const SyntaxTree::Labels &sourceLabels = m_sourceTree.Find(sourceStart, sourceEnd); | |
ConsistentPhrases::Coll::iterator iter; | |
for (iter = coll.begin(); iter != coll.end(); ++iter) { | |
ConsistentPhrase &cp = **iter; | |
int targetStart = cp.corners[2]; | |
int targetEnd = cp.corners[3]; | |
const SyntaxTree::Labels &targetLabels = m_targetTree.Find(targetStart, targetEnd); | |
CreateNonTerms(cp, sourceLabels, targetLabels); | |
} | |
} | |
} | |
} | |
/**
 * Register every (source label, target label) combination on one consistent
 * phrase.
 *
 * Calls cp.AddNonTerms() once per pair, iterating target labels in the inner
 * loop. Nothing is added if either label set is empty.
 *
 * @param cp           consistent phrase that receives the label pairs
 * @param sourceLabels labels found for the phrase's source span
 * @param targetLabels labels found for the phrase's target span
 */
void AlignedSentenceSyntax::CreateNonTerms(ConsistentPhrase &cp,
    const SyntaxTree::Labels &sourceLabels,
    const SyntaxTree::Labels &targetLabels)
{
  typedef SyntaxTree::Labels::const_iterator LabelIter;

  for (LabelIter src = sourceLabels.begin(); src != sourceLabels.end(); ++src) {
    const std::string &srcLabel = *src;

    for (LabelIter tgt = targetLabels.begin(); tgt != targetLabels.end(); ++tgt) {
      const std::string &tgtLabel = *tgt;
      cp.AddNonTerms(srcLabel, tgtLabel);
    }
  }
}