|
#include "xml_tree_parser.h" |
|
|
|
#include <cassert> |
|
#include <vector> |
|
|
|
#include "util/tokenize.hh" |
|
|
|
#include "SyntaxTree.h" |
|
#include "tables-core.h" |
|
#include "XmlException.h" |
|
#include "XmlTree.h" |
|
|
|
#include "exception.h" |
|
|
|
namespace MosesTraining { |
|
namespace Syntax { |
|
|
|
std::auto_ptr<SyntaxTree> XmlTreeParser::Parse(const std::string &line, |
|
bool unescape) |
|
{ |
|
sentence_ = line; |
|
node_collection_.Clear(); |
|
try { |
|
if (!ProcessAndStripXMLTags(sentence_, node_collection_, label_set_, |
|
top_label_set_, unescape)) { |
|
throw Exception(""); |
|
} |
|
} catch (const XmlException &e) { |
|
throw Exception(e.getMsg()); |
|
} |
|
std::auto_ptr<SyntaxTree> root = node_collection_.ExtractTree(); |
|
words_ = util::tokenize(sentence_); |
|
AttachWords(words_, *root); |
|
return root; |
|
} |
|
|
|
void XmlTreeParser::AttachWords(const std::vector<std::string> &words, |
|
SyntaxTree &root) |
|
{ |
|
std::vector<SyntaxTree*> leaves; |
|
leaves.reserve(words.size()); |
|
for (SyntaxTree::LeafIterator p(root); p != SyntaxTree::LeafIterator(); ++p) { |
|
leaves.push_back(&*p); |
|
} |
|
|
|
std::vector<std::string>::const_iterator q = words.begin(); |
|
for (std::vector<SyntaxTree*>::iterator p = leaves.begin(); p != leaves.end(); |
|
++p) { |
|
SyntaxTree *leaf = *p; |
|
const int start = leaf->value().start; |
|
const int end = leaf->value().end; |
|
if (start != end) { |
|
std::ostringstream msg; |
|
msg << "leaf node covers multiple words (" << start << "-" << end |
|
<< "): this is currently unsupported"; |
|
throw Exception(msg.str()); |
|
} |
|
SyntaxTree *newLeaf = new SyntaxTree(SyntaxNode(*q++, start, end)); |
|
leaf->children().push_back(newLeaf); |
|
newLeaf->parent() = leaf; |
|
} |
|
} |
|
|
|
} |
|
} |
|
|