File size: 3,072 Bytes
158b61b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
#include "SplitPoint.h"

#include <cstddef>
#include <map>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include "util/string_piece.hh"
#include "util/tokenize_piece.hh"

#include "syntax-common/exception.h"
namespace MosesTraining
{
namespace Syntax
{
namespace PostprocessEgretForests
{
// Marks the split points in a sentence given as a plain string.
//
// The sentence is split into tokens on spaces and tabs (empty tokens are
// dropped).  For every split point, the connector located at offset charPos
// inside token number tokenPos (both zero-based) is wrapped in '@' characters,
// i.e. the connector text C is replaced by "@C@".  The sentence is then
// rebuilt from the tokens, joined by single spaces.
//
// splitPoints  the split points to mark; if empty, sentence is left untouched.
//              charPos values refer to the original, unmarked token.  If two
//              split points share the same tokenPos and charPos, only one
//              marker is inserted and the connector of the later one is used.
// sentence     in/out: the sentence to mark.  It is only rewritten after all
//              split points have been applied to the token copies.
//
// Throws std::out_of_range if a split point's tokenPos does not refer to a
// token of the sentence (std::string::replace throws the same exception type
// if charPos lies beyond the end of the token).
void MarkSplitPoints(const std::vector<SplitPoint> &splitPoints,
                     std::string &sentence)
{
  if (splitPoints.empty()) {
    return;
  }

  // Group the split points by token.  Within a token they are ordered by
  // character position, and each one keeps its own connector.
  typedef std::map<int, std::string> ConnectorMap;  // charPos -> connector
  typedef std::map<int, ConnectorMap> PointMap;     // tokenPos -> connectors
  PointMap points;
  for (std::vector<SplitPoint>::const_iterator p = splitPoints.begin();
       p != splitPoints.end(); ++p) {
    points[p->tokenPos][p->charPos] = p->connector;
  }

  // Split the sentence into a sequence of tokens.
  std::vector<std::string> terminals;
  const util::AnyCharacter delim(" \t");
  for (util::TokenIter<util::AnyCharacter, true> p(sentence, delim); p; ++p) {
    terminals.resize(terminals.size()+1);
    p->CopyToString(&terminals.back());
  }

  // Mark the split points.
  for (PointMap::const_iterator p = points.begin(); p != points.end(); ++p) {
    // Reject token positions that do not exist instead of indexing out of
    // bounds.
    if (p->first < 0 ||
        static_cast<std::size_t>(p->first) >= terminals.size()) {
      std::ostringstream msg;
      msg << "split point refers to token " << p->first
          << " but the sentence has " << terminals.size() << " token(s)";
      throw std::out_of_range(msg.str());
    }
    std::string &word = terminals[p->first];
    // Every marker inserted so far has made the word two characters longer
    // (one '@' on each side), so later positions must be shifted accordingly.
    int offset = 0;
    for (ConnectorMap::const_iterator q = p->second.begin();
         q != p->second.end(); ++q) {
      const std::string &connector = q->second;
      const std::string str = std::string("@") + connector + std::string("@");
      word.replace(q->first+offset, connector.size(), str);
      offset += 2;
    }
  }

  // Rebuild the sentence from the (possibly modified) tokens.
  sentence.clear();
  for (std::size_t i = 0; i < terminals.size(); ++i) {
    if (i > 0) {
      sentence += " ";
    }
    sentence += terminals[i];
  }
}
void MarkSplitPoints(const std::vector<SplitPoint> &splitPoints, Forest &forest)
{
if (splitPoints.empty()) {
return;
}
// FIXME Assumes all split points have same connector
std::string connector;
std::map<int, std::set<int> > points;
for (std::vector<SplitPoint>::const_iterator p = splitPoints.begin();
p != splitPoints.end(); ++p) {
points[p->tokenPos].insert(p->charPos);
connector = p->connector;
}
// Get the terminal vertices in sentence order.
std::vector<Forest::Vertex *> terminals;
for (std::vector<boost::shared_ptr<Forest::Vertex> >::const_iterator
p = forest.vertices.begin(); p != forest.vertices.end(); ++p) {
if (!(*p)->incoming.empty()) {
continue;
}
int pos = (*p)->start;
if (pos >= terminals.size()) {
terminals.resize(pos+1);
}
terminals[pos] = p->get();
}
// Mark the split points.
for (std::map<int, std::set<int> >::const_iterator p = points.begin();
p != points.end(); ++p) {
std::string &word = terminals[p->first]->symbol.value;
int offset = 0;
for (std::set<int>::const_iterator q = p->second.begin();
q != p->second.end(); ++q) {
std::string str = std::string("@") + connector + std::string("@");
word.replace(*q+offset, connector.size(), str);
offset += 2;
}
}
}
} // namespace PostprocessEgretForests
} // namespace Syntax
} // namespace MosesTraining
|