#include "xml_tree_parser.h"

// Standard headers for the std::auto_ptr, std::ostringstream, std::string and
// std::vector uses in this file.
#include <memory>
#include <sstream>
#include <string>
#include <vector>

#include "util/tokenize.hh"

#include "SyntaxTree.h"
#include "tables-core.h"
#include "XmlException.h"
#include "XmlTree.h"

#include "exception.h"

namespace MosesTraining
{
namespace Syntax
{

// Parses one XML-annotated line into a SyntaxTree.
//
// The line is copied into sentence_, node_collection_ is cleared, and
// ProcessAndStripXMLTags is run over sentence_ (the unescape flag is passed
// through to it unchanged).  The tree is then extracted from node_collection_,
// sentence_ is tokenized into words_, and one word node is attached beneath
// each leaf of the tree.
//
// Returns the root of the resulting tree; ownership passes to the caller.
//
// Throws Exception if:
//   - ProcessAndStripXMLTags returns false (with an empty message),
//   - ProcessAndStripXMLTags throws XmlException (its message is forwarded),
//   - no tree could be extracted, or
//   - AttachWords rejects the tree (see below).
std::auto_ptr<SyntaxTree> XmlTreeParser::Parse(const std::string &line,
                                               bool unescape)
{
  sentence_ = line;
  node_collection_.Clear();
  try {
    if (!ProcessAndStripXMLTags(sentence_, node_collection_, label_set_,
                                top_label_set_, unescape)) {
      throw Exception("");
    }
  } catch (const XmlException &e) {
    // Re-throw as this module's exception type so callers only need to
    // handle Exception.
    throw Exception(e.getMsg());
  }
  std::auto_ptr<SyntaxTree> root = node_collection_.ExtractTree();
  // NOTE(review): it is not visible from here whether ExtractTree() can hand
  // back a null pointer; guard anyway, because dereferencing it below would
  // be undefined behaviour.
  if (root.get() == 0) {
    throw Exception("no tree could be extracted from the input");
  }
  words_ = util::tokenize(sentence_);
  AttachWords(words_, *root);
  return root;
}

// Adds a new child node under every leaf of root, labelled with the
// corresponding entry of words (leaves are visited in LeafIterator order and
// paired with words in sequence).  Each new node reuses its parent leaf's
// start and end values.
//
// Throws Exception if a leaf has start != end, or if the tree has more leaves
// than there are words.
void XmlTreeParser::AttachWords(const std::vector<std::string> &words,
                                SyntaxTree &root)
{
  // Collect the leaves first: attaching children while iterating would change
  // which nodes count as leaves.
  std::vector<SyntaxTree *> leaves;
  leaves.reserve(words.size());
  for (SyntaxTree::LeafIterator p(root); p != SyntaxTree::LeafIterator();
       ++p) {
    leaves.push_back(&*p);
  }

  std::vector<std::string>::const_iterator q = words.begin();
  for (std::vector<SyntaxTree *>::iterator p = leaves.begin();
       p != leaves.end(); ++p) {
    SyntaxTree *leaf = *p;
    const int start = leaf->value().start;
    const int end = leaf->value().end;
    if (start != end) {
      std::ostringstream msg;
      msg << "leaf node covers multiple words (" << start << "-" << end
          << "): this is currently unsupported";
      throw Exception(msg.str());
    }
    // Refuse to read past the end of the word list when the tree has more
    // leaves than the sentence has tokens.
    if (q == words.end()) {
      std::ostringstream msg;
      msg << "tree has more leaf nodes (" << leaves.size()
          << ") than the sentence has words (" << words.size() << ")";
      throw Exception(msg.str());
    }
    SyntaxTree *newLeaf = new SyntaxTree(SyntaxNode(*q, start, end));
    ++q;
    leaf->children().push_back(newLeaf);
    newLeaf->parent() = leaf;
  }
}

}  // namespace Syntax
}  // namespace MosesTraining