|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include "AlignmentGraph.h" |
|
|
|
#include <algorithm> |
|
#include <cassert> |
|
#include <cstdlib> |
|
#include <memory> |
|
#include <stack> |
|
|
|
#include "SyntaxTree.h" |
|
|
|
#include "ComposedRule.h" |
|
#include "Node.h" |
|
#include "Options.h" |
|
#include "Subgraph.h" |
|
|
|
namespace MosesTraining |
|
{ |
|
namespace Syntax |
|
{ |
|
namespace GHKM |
|
{ |
|
|
|
// Builds the alignment graph for one sentence pair.
//
//   t - target-side parse tree.  It is only read; its nodes are copied by
//       CopyParseTree.
//   s - source-side words.  One SOURCE node is created per word.
//   a - word alignment.  For each entry, 'first' is used as an index into the
//       source words and 'second' as an index into the parse tree leaves, in
//       the order GetTargetTreeLeaves collects them.
//
// Precondition: every alignment index is in range.  This is asserted because
// an out-of-range index would otherwise read past the end of a vector.
AlignmentGraph::AlignmentGraph(const SyntaxTree *t,
                               const std::vector<std::string> &s,
                               const Alignment &a)
{
  // Copy the parse tree.  CopyParseTree records every copy in m_targetNodes.
  m_root = CopyParseTree(t);

  // Create a node for each source word.
  m_sourceNodes.reserve(s.size());
  for (std::vector<std::string>::const_iterator p(s.begin());
       p != s.end(); ++p) {
    m_sourceNodes.push_back(new Node(*p, SOURCE));
  }

  // Connect source nodes to parse tree leaves according to the alignment.
  // The leaves must be collected before any link is added: once a leaf has a
  // source child it is no longer a sink.
  std::vector<Node *> targetTreeLeaves;
  GetTargetTreeLeaves(m_root, targetTreeLeaves);
  for (Alignment::const_iterator p(a.begin()); p != a.end(); ++p) {
    // Casting to an unsigned type also rejects negative indices.
    assert(static_cast<std::size_t>(p->first) < m_sourceNodes.size());
    assert(static_cast<std::size_t>(p->second) < targetTreeLeaves.size());
    Node *src = m_sourceNodes[p->first];
    Node *tgt = targetTreeLeaves[p->second];
    src->AddParent(tgt);
    tgt->AddChild(src);
  }

  // Give every source word that is still parentless a place in the tree.
  AttachUnalignedSourceWords();

  // Pass each source node its position in the sentence.
  // NOTE(review): presumably this is what populates the node spans used
  // below - confirm against Node::PropagateIndex.
  std::vector<Node *>::const_iterator p(m_sourceNodes.begin());
  for (int i = 0; p != m_sourceNodes.end(); ++p, ++i) {
    (*p)->PropagateIndex(i);
  }

  // Calculate complement spans, top-down from the root.
  CalcComplementSpans(m_root);
}
|
|
|
// Releases every node held by the graph: first the source word nodes, then
// the nodes registered in m_targetNodes.
AlignmentGraph::~AlignmentGraph()
{
  std::vector<Node *>::iterator it = m_sourceNodes.begin();
  while (it != m_sourceNodes.end()) {
    delete *it;
    ++it;
  }

  it = m_targetNodes.begin();
  while (it != m_targetNodes.end()) {
    delete *it;
    ++it;
  }
}
|
|
|
// Builds the graph fragment rooted at 'root' by expanding downwards until
// every branch ends at a stopping node.
//
// A child stops the expansion (and is collected) when it is a sink, or when
// it belongs to frontierSet and is not of type TARGET.  Any other child is
// expanded further.  The collected stopping nodes, together with 'root', are
// handed to the Subgraph constructor.
Subgraph AlignmentGraph::ComputeMinimalFrontierGraphFragment(
  Node *root,
  const std::set<Node *> &frontierSet)
{
  std::set<const Node *> stoppingNodes;
  std::stack<Node *> pending;

  // The root is always taken, even when it is itself in the frontier set.
  if (!root->IsSink()) {
    pending.push(root);
  } else {
    stoppingNodes.insert(root);
  }

  while (!pending.empty()) {
    Node *current = pending.top();
    pending.pop();

    const std::vector<Node *> &kids = current->GetChildren();
    for (std::size_t k = 0; k < kids.size(); ++k) {
      Node *kid = kids[k];
      if (kid->IsSink()) {
        stoppingNodes.insert(kid);
      } else if (frontierSet.find(kid) == frontierSet.end() ||
                 kid->GetType() == TARGET) {
        // Not a frontier node, or a TARGET node: keep descending.
        pending.push(kid);
      } else {
        stoppingNodes.insert(kid);
      }
    }
  }

  return Subgraph(root, stoppingNodes);
}
|
|
|
void AlignmentGraph::ExtractMinimalRules(const Options &options) |
|
{ |
|
|
|
std::set<Node *> frontierSet; |
|
ComputeFrontierSet(m_root, options, frontierSet); |
|
|
|
|
|
std::vector<Subgraph> fragments; |
|
fragments.reserve(frontierSet.size()); |
|
for (std::set<Node *>::iterator p(frontierSet.begin()); |
|
p != frontierSet.end(); ++p) { |
|
Node *root = *p; |
|
Subgraph fragment = ComputeMinimalFrontierGraphFragment(root, frontierSet); |
|
assert(!fragment.IsTrivial()); |
|
|
|
|
|
if (root->GetType() == TREE && !root->GetSpan().empty()) { |
|
root->AddRule(new Subgraph(fragment)); |
|
} |
|
} |
|
} |
|
|
|
void AlignmentGraph::ExtractComposedRules(const Options &options) |
|
{ |
|
ExtractComposedRules(m_root, options); |
|
} |
|
|
|
// Extracts composed rules for 'node' and for every node below it.
//
// Children are processed first, so the rules of any node reachable as an
// attachment point are already in place when 'node' itself is handled.
// Starting from the single rule held by 'node', the function repeatedly tries
// to compose a rule with each rule of its next open attachment point.  Every
// successful composition is added to 'node' and, if it still has an open
// attachment point, queued for further composition.
//
//   node    - root of the subtree to process.
//   options - forwarded to ComposedRule::AttemptComposition.
void AlignmentGraph::ExtractComposedRules(Node *node, const Options &options)
{
  // Recurse into the children before touching this node.
  const std::vector<Node *> &children = node->GetChildren();
  for (std::size_t c = 0; c < children.size(); ++c) {
    ExtractComposedRules(children[c], options);
  }

  // At this point the node is expected to hold at most one rule.  Nodes
  // without a rule have nothing to compose.
  const std::vector<const Subgraph*> &ownRules = node->GetRules();
  assert(ownRules.size() <= 1);
  if (ownRules.empty()) {
    return;
  }

  // Seed the search with the node's own rule.
  ComposedRule seed(*(ownRules[0]));
  if (!seed.GetOpenAttachmentPoint()) {
    // Nowhere to attach anything, so no composition is possible.
    return;
  }

  std::queue<ComposedRule> workQueue;
  workQueue.push(seed);
  while (!workQueue.empty()) {
    ComposedRule current = workQueue.front();
    workQueue.pop();
    const Node *attachmentPoint = current.GetOpenAttachmentPoint();
    assert(attachmentPoint);
    assert(attachmentPoint != node);

    // Try to attach each rule of the attachment point.  Adding rules to
    // 'node' below does not disturb this iteration because the attachment
    // point is a different node (asserted above).
    const std::vector<const Subgraph*> &candidates =
      attachmentPoint->GetRules();
    for (std::vector<const Subgraph*>::const_iterator cand =
           candidates.begin(); cand != candidates.end(); ++cand) {
      assert((*cand)->GetRoot()->GetType() == TREE);
      ComposedRule *composed = current.AttemptComposition(**cand, options);
      if (!composed) {
        continue;
      }
      node->AddRule(new Subgraph(composed->CreateSubgraph()));
      if (composed->GetOpenAttachmentPoint()) {
        workQueue.push(*composed);
      }
      delete composed;
    }

    // Also consider leaving this attachment point unfilled and moving on to
    // the next one, if any remains.
    current.CloseAttachmentPoint();
    if (current.GetOpenAttachmentPoint()) {
      workQueue.push(current);
    }
  }
}
|
|
|
// Recursively copies the parse tree rooted at 'root' into graph nodes.
//
// Leaves of the parse tree become TARGET nodes, all other tree nodes become
// TREE nodes.  A TREE node receives the value of the "pcfg" attribute as its
// PCFG score, or 0 if the attribute is absent.  Every node created here is
// appended to m_targetNodes (children before their parent).
//
// Returns the copy of 'root'.
Node *AlignmentGraph::CopyParseTree(const SyntaxTree *root)
{
  NodeType nodeType = TREE;
  if (root->IsLeaf()) {
    nodeType = TARGET;
  }

  // Held by a smart pointer until construction of the subtree has finished.
  std::auto_ptr<Node> copy(new Node(root->value().label, nodeType));

  if (nodeType == TREE) {
    SyntaxNode::AttributeMap::const_iterator pcfg =
      root->value().attributes.find("pcfg");
    float score = 0.0f;
    if (pcfg != root->value().attributes.end()) {
      score = std::atof(pcfg->second.c_str());
    }
    copy->SetPcfgScore(score);
  }

  // Copy the subtrees and link each copy to its new parent.
  const std::vector<SyntaxTree *> &subtrees = root->children();
  std::vector<Node *> childCopies;
  childCopies.reserve(subtrees.size());
  for (std::size_t i = 0; i < subtrees.size(); ++i) {
    Node *childCopy = CopyParseTree(subtrees[i]);
    childCopy->AddParent(copy.get());
    childCopies.push_back(childCopy);
  }
  copy->SetChildren(childCopies);

  // Hand the finished node over to m_targetNodes.
  Node *result = copy.release();
  m_targetNodes.push_back(result);
  return result;
}
|
|
|
|
|
|
|
// Collects into 'frontierSet' every frontier node in the subtree rooted at
// 'root' (see IsFrontierNode for the test applied to each node).
//
// The traversal does not descend below a node that is not of type TREE or
// that has an empty span.
//
//   root        - subtree to search.
//   options     - forwarded to IsFrontierNode.
//   frontierSet - output; frontier nodes are inserted, nothing is removed.
void AlignmentGraph::ComputeFrontierSet(Node *root,
                                        const Options &options,
                                        std::set<Node *> &frontierSet) const
{
  const bool worthVisiting =
    root->GetType() == TREE && !root->GetSpan().empty();
  if (!worthVisiting) {
    return;
  }

  if (IsFrontierNode(*root, options)) {
    frontierSet.insert(root);
  }

  // Continue the search below this node.
  const std::vector<Node *> &kids = root->GetChildren();
  for (std::size_t k = 0; k < kids.size(); ++k) {
    ComputeFrontierSet(kids[k], options, frontierSet);
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool AlignmentGraph::IsFrontierNode(const Node &n, const Options &options) const |
|
{ |
|
|
|
if (n.GetType() != TREE || n.GetSpan().empty()) { |
|
return false; |
|
} |
|
|
|
if (SpansIntersect(n.GetComplementSpan(), Closure(n.GetSpan()))) { |
|
return false; |
|
} |
|
|
|
|
|
|
|
assert(n.GetParents().size() <= 1); |
|
if (!options.allowUnary && |
|
!n.GetParents().empty() && |
|
n.GetParents()[0]->GetSpan() == n.GetSpan()) { |
|
return false; |
|
} |
|
return true; |
|
} |
|
|
|
// Computes and stores the complement span of 'root', then does the same for
// every node below it.
//
// The complement span of a node is built as the union of
//   - the complement spans of all its parents, and
//   - the spans of all its siblings (the other children of its parents).
// Parents are therefore processed before their children.
void AlignmentGraph::CalcComplementSpans(Node *root)
{
  Span complement;
  std::set<Node *> family;  // every child of every parent, including root

  const std::vector<Node *> &parents = root->GetParents();
  for (std::size_t i = 0; i < parents.size(); ++i) {
    Node *parent = parents[i];

    const Span &inherited = parent->GetComplementSpan();
    complement.insert(inherited.begin(), inherited.end());

    const std::vector<Node *> &parentsChildren = parent->GetChildren();
    family.insert(parentsChildren.begin(), parentsChildren.end());
  }

  for (std::set<Node *>::const_iterator member = family.begin();
       member != family.end(); ++member) {
    if (*member != root) {
      const Span &covered = (*member)->GetSpan();
      complement.insert(covered.begin(), covered.end());
    }
  }

  root->SetComplementSpan(complement);

  // Descend only after this node's complement span has been stored, since
  // the children read it.
  const std::vector<Node *> &kids = root->GetChildren();
  for (std::size_t k = 0; k < kids.size(); ++k) {
    CalcComplementSpans(kids[k]);
  }
}
|
|
|
// Appends to 'leaves' every sink node in the subtree rooted at 'root',
// visiting children depth-first in their stored order.
void AlignmentGraph::GetTargetTreeLeaves(Node *root,
    std::vector<Node *> &leaves)
{
  if (root->IsSink()) {
    leaves.push_back(root);
    return;
  }

  const std::vector<Node *> &kids = root->GetChildren();
  for (std::size_t k = 0; k < kids.size(); ++k) {
    GetTargetTreeLeaves(kids[k], leaves);
  }
}
|
|
|
void AlignmentGraph::AttachUnalignedSourceWords() |
|
{ |
|
|
|
std::set<int> unaligned; |
|
for (size_t i = 0; i < m_sourceNodes.size(); ++i) { |
|
const Node &sourceNode = (*m_sourceNodes[i]); |
|
if (sourceNode.GetParents().empty()) { |
|
unaligned.insert(i); |
|
} |
|
} |
|
|
|
|
|
for (std::set<int>::iterator p = unaligned.begin(); |
|
p != unaligned.end(); ++p) { |
|
int index = *p; |
|
Node *attachmentPoint = DetermineAttachmentPoint(index); |
|
Node *sourceNode = m_sourceNodes[index]; |
|
attachmentPoint->AddChild(sourceNode); |
|
sourceNode->AddParent(attachmentPoint); |
|
} |
|
} |
|
|
|
// Chooses the node under which the parentless source word at position
// 'index' should be attached.
//
// The nearest source words to the left and to the right that already have a
// parent are located.  If either side has none, the root is returned.
// Otherwise the result is the lowest common ancestor of the parents of those
// two neighbours; if that ancestor is itself a TARGET node, its parent is
// returned instead.
Node *AlignmentGraph::DetermineAttachmentPoint(int index)
{
  // Nearest word to the left that has a parent; -1 if there is none.
  int left = index - 1;
  while (left >= 0 && m_sourceNodes[left]->GetParents().empty()) {
    --left;
  }
  if (left == -1) {
    return m_root;
  }

  // Nearest word to the right that has a parent; equals the number of source
  // words if there is none.
  size_t right = index;
  ++right;
  while (right < m_sourceNodes.size() &&
         m_sourceNodes[right]->GetParents().empty()) {
    ++right;
  }
  if (right == m_sourceNodes.size()) {
    return m_root;
  }

  // Gather the parents of both neighbours.
  const std::vector<Node *> &leftParents = m_sourceNodes[left]->GetParents();
  assert(!leftParents.empty());
  const std::vector<Node *> &rightParents = m_sourceNodes[right]->GetParents();
  assert(!rightParents.empty());
  std::set<Node *> targetSet(leftParents.begin(), leftParents.end());
  targetSet.insert(rightParents.begin(), rightParents.end());

  Node *lca = Node::LowestCommonAncestor(targetSet.begin(), targetSet.end());
  if (lca->GetType() != TARGET) {
    return lca;
  }

  // Do not attach to a TARGET node; move up to its (single) parent.
  assert(lca->GetParents().size() == 1);
  return lca->GetParents()[0];
}
|
|
|
} |
|
} |
|
} |
|
|