|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <sstream> |
|
#include <algorithm> |
|
#include "Rule.h" |
|
#include "AlignedSentence.h" |
|
#include "ConsistentPhrase.h" |
|
#include "NonTerm.h" |
|
#include "Parameter.h" |
|
|
|
using namespace std; |
|
|
|
Rule::Rule(const NonTerm &lhsNonTerm, const AlignedSentence &alignedSentence) |
|
:m_lhs(lhsNonTerm) |
|
,m_alignedSentence(alignedSentence) |
|
,m_isValid(true) |
|
,m_canRecurse(true) |
|
{ |
|
CreateSource(); |
|
} |
|
|
|
Rule::Rule(const Rule ©, const NonTerm &nonTerm) |
|
:m_lhs(copy.m_lhs) |
|
,m_alignedSentence(copy.m_alignedSentence) |
|
,m_nonterms(copy.m_nonterms) |
|
,m_isValid(true) |
|
,m_canRecurse(true) |
|
{ |
|
m_nonterms.push_back(&nonTerm); |
|
CreateSource(); |
|
|
|
} |
|
|
|
Rule::~Rule() |
|
{ |
|
|
|
} |
|
|
|
const ConsistentPhrase &Rule::GetConsistentPhrase() const |
|
{ |
|
return m_lhs.GetConsistentPhrase(); |
|
} |
|
|
|
void Rule::CreateSource() |
|
{ |
|
const NonTerm *cp = NULL; |
|
size_t nonTermInd = 0; |
|
if (nonTermInd < m_nonterms.size()) { |
|
cp = m_nonterms[nonTermInd]; |
|
} |
|
|
|
for (int sourcePos = m_lhs.GetConsistentPhrase().corners[0]; |
|
sourcePos <= m_lhs.GetConsistentPhrase().corners[1]; |
|
++sourcePos) { |
|
|
|
const RuleSymbol *ruleSymbol; |
|
if (cp && cp->GetConsistentPhrase().corners[0] <= sourcePos && sourcePos <= cp->GetConsistentPhrase().corners[1]) { |
|
|
|
ruleSymbol = cp; |
|
sourcePos = cp->GetConsistentPhrase().corners[1]; |
|
if (m_nonterms.size()) { |
|
cp = m_nonterms[nonTermInd]; |
|
} |
|
|
|
|
|
++nonTermInd; |
|
cp = (nonTermInd < m_nonterms.size()) ? m_nonterms[nonTermInd] : NULL; |
|
} else { |
|
|
|
ruleSymbol = m_alignedSentence.GetPhrase(Moses::Input)[sourcePos]; |
|
} |
|
|
|
m_source.Add(ruleSymbol); |
|
} |
|
} |
|
|
|
int Rule::GetNextSourcePosForNonTerm() const |
|
{ |
|
if (m_nonterms.empty()) { |
|
|
|
return m_lhs.GetConsistentPhrase().corners[0]; |
|
} else { |
|
|
|
const ConsistentPhrase &cp = m_nonterms.back()->GetConsistentPhrase(); |
|
int nextPos = cp.corners[1] + 1; |
|
return nextPos; |
|
} |
|
} |
|
|
|
std::string Rule::Debug() const |
|
{ |
|
stringstream out; |
|
|
|
|
|
for (size_t i = 0; i < m_source.GetSize(); ++i) { |
|
const RuleSymbol &symbol = *m_source[i]; |
|
out << symbol.Debug() << " "; |
|
} |
|
|
|
|
|
out << "||| "; |
|
for (size_t i = 0; i < m_target.GetSize(); ++i) { |
|
const RuleSymbol &symbol = *m_target[i]; |
|
out << symbol.Debug() << " "; |
|
} |
|
|
|
out << "||| "; |
|
Alignments::const_iterator iterAlign; |
|
for (iterAlign = m_alignments.begin(); iterAlign != m_alignments.end(); ++iterAlign) { |
|
const std::pair<int,int> &alignPair = *iterAlign; |
|
out << alignPair.first << "-" << alignPair.second << " "; |
|
} |
|
|
|
|
|
out << "||| LHS=" << m_lhs.Debug(); |
|
|
|
return out.str(); |
|
} |
|
|
|
void Rule::Output(std::ostream &out, bool forward) const |
|
{ |
|
if (forward) { |
|
|
|
m_source.Output(out); |
|
m_lhs.Output(out, Moses::Input); |
|
|
|
out << " ||| "; |
|
|
|
|
|
m_target.Output(out); |
|
m_lhs.Output(out, Moses::Output); |
|
} else { |
|
|
|
m_target.Output(out); |
|
m_lhs.Output(out, Moses::Output); |
|
|
|
out << " ||| "; |
|
|
|
|
|
m_source.Output(out); |
|
m_lhs.Output(out, Moses::Input); |
|
} |
|
|
|
out << " ||| "; |
|
|
|
|
|
Alignments::const_iterator iterAlign; |
|
for (iterAlign = m_alignments.begin(); iterAlign != m_alignments.end(); ++iterAlign) { |
|
const std::pair<int,int> &alignPair = *iterAlign; |
|
|
|
if (forward) { |
|
out << alignPair.first << "-" << alignPair.second << " "; |
|
} else { |
|
out << alignPair.second << "-" << alignPair.first << " "; |
|
} |
|
} |
|
|
|
out << "||| "; |
|
|
|
|
|
out << m_count; |
|
|
|
out << " ||| "; |
|
|
|
|
|
if (forward) { |
|
for (size_t i = 0; i < m_properties.size(); ++i) { |
|
const Property &prop = m_properties[i]; |
|
out << prop << " "; |
|
} |
|
} |
|
} |
|
|
|
void Rule::NonTermContextFactor(int factor, const Word &word, std::ostream &out) const |
|
{ |
|
out << word.GetString(factor) << " "; |
|
} |
|
|
|
void Rule::NonTermContext(int sourceTarget, int factor, size_t ntInd, const ConsistentPhrase &cp, std::ostream &out) const |
|
{ |
|
int startPos, endPos; |
|
const Phrase *phrase; |
|
|
|
if (sourceTarget == 1) { |
|
startPos = cp.corners[0]; |
|
endPos = cp.corners[1]; |
|
phrase = &m_alignedSentence.GetPhrase(Moses::Input); |
|
} else if (sourceTarget == 2) { |
|
startPos = cp.corners[2]; |
|
endPos = cp.corners[3]; |
|
phrase = &m_alignedSentence.GetPhrase(Moses::Output); |
|
} else { |
|
abort(); |
|
} |
|
|
|
out << ntInd << " "; |
|
|
|
|
|
if (startPos == 0) { |
|
out << "<s> "; |
|
} else { |
|
NonTermContextFactor(factor, *phrase->at(startPos - 1), out); |
|
} |
|
|
|
|
|
NonTermContextFactor(factor, *phrase->at(startPos), out); |
|
|
|
|
|
NonTermContextFactor(factor, *phrase->at(endPos), out); |
|
|
|
|
|
if (endPos == phrase->size() - 1) { |
|
out << "</s> "; |
|
} else { |
|
NonTermContextFactor(factor, *phrase->at(endPos + 1), out); |
|
} |
|
|
|
|
|
} |
|
|
|
void Rule::Prevalidate(const Parameter ¶ms) |
|
{ |
|
|
|
if (m_source.GetSize() > params.maxSymbolsSource) { |
|
m_isValid = false; |
|
} |
|
|
|
|
|
if (m_nonterms.size()) { |
|
const NonTerm &lastNonTerm = *m_nonterms.back(); |
|
const ConsistentPhrase &cp = lastNonTerm.GetConsistentPhrase(); |
|
|
|
int sourceWidth = cp.GetWidth(Moses::Input); |
|
if (lastNonTerm.IsHiero(params)) { |
|
if (sourceWidth < params.minHoleSource) { |
|
m_isValid = false; |
|
m_canRecurse = false; |
|
return; |
|
} |
|
} else if (sourceWidth < params.minHoleSourceSyntax) { |
|
m_isValid = false; |
|
m_canRecurse = false; |
|
return; |
|
} |
|
|
|
} |
|
|
|
|
|
int numNonTerms = 0; |
|
int numHieroNonTerms = 0; |
|
for (size_t i = 0; i < m_source.GetSize(); ++i) { |
|
const RuleSymbol *arc = m_source[i]; |
|
if (arc->IsNonTerm()) { |
|
++numNonTerms; |
|
const NonTerm &nonTerm = *static_cast<const NonTerm*>(arc); |
|
bool isHiero = nonTerm.IsHiero(params); |
|
if (isHiero) { |
|
++numHieroNonTerms; |
|
} |
|
} |
|
} |
|
|
|
if (numNonTerms >= params.maxNonTerm) { |
|
m_canRecurse = false; |
|
if (numNonTerms > params.maxNonTerm) { |
|
m_isValid = false; |
|
return; |
|
} |
|
} |
|
|
|
if (numHieroNonTerms >= params.maxHieroNonTerm) { |
|
m_canRecurse = false; |
|
if (numHieroNonTerms > params.maxHieroNonTerm) { |
|
m_isValid = false; |
|
return; |
|
} |
|
} |
|
|
|
|
|
if (!params.nonTermConsecSource && m_nonterms.size() >= 2) { |
|
const NonTerm &lastNonTerm = *m_nonterms.back(); |
|
const NonTerm &secondLastNonTerm = *m_nonterms[m_nonterms.size() - 2]; |
|
if (secondLastNonTerm.GetConsistentPhrase().corners[1] + 1 == |
|
lastNonTerm.GetConsistentPhrase().corners[0]) { |
|
if (params.mixedSyntaxType == 0) { |
|
|
|
m_isValid = false; |
|
m_canRecurse = false; |
|
return; |
|
} else { |
|
|
|
switch (params.nonTermConsecSourceMixedSyntax) { |
|
case 0: |
|
m_isValid = false; |
|
m_canRecurse = false; |
|
return; |
|
case 1: |
|
if (lastNonTerm.IsHiero(Moses::Input, params) |
|
&& secondLastNonTerm.IsHiero(Moses::Input, params)) { |
|
m_isValid = false; |
|
m_canRecurse = false; |
|
return; |
|
} |
|
break; |
|
case 2: |
|
if (lastNonTerm.IsHiero(Moses::Input, params) |
|
|| secondLastNonTerm.IsHiero(Moses::Input, params)) { |
|
m_isValid = false; |
|
m_canRecurse = false; |
|
return; |
|
} |
|
break; |
|
case 3: |
|
break; |
|
} |
|
} |
|
} |
|
} |
|
|
|
|
|
if (m_nonterms.size() >= 2) { |
|
const NonTerm &lastNonTerm = *m_nonterms.back(); |
|
|
|
for (size_t i = 0; i < m_nonterms.size() - 1; ++i) { |
|
const NonTerm &otherNonTerm = *m_nonterms[i]; |
|
bool overlap = lastNonTerm.GetConsistentPhrase().TargetOverlap(otherNonTerm.GetConsistentPhrase()); |
|
|
|
if (overlap) { |
|
m_isValid = false; |
|
m_canRecurse = false; |
|
return; |
|
} |
|
} |
|
} |
|
|
|
|
|
if (params.requireAlignedWord) { |
|
bool ok = false; |
|
for (size_t i = 0; i < m_source.GetSize(); ++i) { |
|
const RuleSymbol &symbol = *m_source[i]; |
|
if (!symbol.IsNonTerm()) { |
|
const Word &word = static_cast<const Word&>(symbol); |
|
if (word.GetAlignment().size()) { |
|
ok = true; |
|
break; |
|
} |
|
} |
|
} |
|
|
|
if (!ok) { |
|
m_isValid = false; |
|
m_canRecurse = false; |
|
return; |
|
} |
|
} |
|
|
|
if (params.maxSpanFreeNonTermSource) { |
|
const NonTerm *front = dynamic_cast<const NonTerm*>(m_source[0]); |
|
if (front) { |
|
int width = front->GetWidth(Moses::Input); |
|
if (width > params.maxSpanFreeNonTermSource) { |
|
m_isValid = false; |
|
m_canRecurse = false; |
|
return; |
|
} |
|
} |
|
|
|
const NonTerm *back = dynamic_cast<const NonTerm*>(m_source.Back()); |
|
if (back) { |
|
int width = back->GetWidth(Moses::Input); |
|
if (width > params.maxSpanFreeNonTermSource) { |
|
m_isValid = false; |
|
m_canRecurse = false; |
|
return; |
|
} |
|
} |
|
} |
|
|
|
if (!params.nieceTerminal) { |
|
|
|
std::set<const Word*> terms; |
|
for (size_t i = 0; i < m_source.GetSize(); ++i) { |
|
const Word *word = dynamic_cast<const Word*>(m_source[i]); |
|
if (word) { |
|
terms.insert(word); |
|
} |
|
} |
|
|
|
|
|
for (size_t i = 0; i < m_source.GetSize(); ++i) { |
|
const NonTerm *nonTerm = dynamic_cast<const NonTerm*>(m_source[i]); |
|
if (nonTerm) { |
|
const ConsistentPhrase &cp = nonTerm->GetConsistentPhrase(); |
|
bool containTerm = ContainTerm(cp, terms); |
|
|
|
if (containTerm) { |
|
|
|
|
|
|
|
|
|
m_isValid = false; |
|
m_canRecurse = false; |
|
return; |
|
} |
|
} |
|
} |
|
} |
|
|
|
if (params.maxScope != UNDEFINED || params.minScope > 0) { |
|
int scope = GetScope(params); |
|
if (scope > params.maxScope) { |
|
|
|
|
|
m_isValid = false; |
|
m_canRecurse = false; |
|
return; |
|
} |
|
|
|
if (scope < params.minScope) { |
|
|
|
|
|
m_isValid = false; |
|
} |
|
} |
|
|
|
|
|
if (params.scopeSpan.size()) { |
|
size_t scope = GetScope(params); |
|
if (scope >= params.scopeSpan.size()) { |
|
|
|
} else { |
|
const std::pair<int,int> &constraint = params.scopeSpan[scope]; |
|
int sourceWidth = m_lhs.GetWidth(Moses::Input); |
|
if (sourceWidth < constraint.first || sourceWidth > constraint.second) { |
|
m_isValid = false; |
|
m_canRecurse = false; |
|
return; |
|
} |
|
} |
|
} |
|
} |
|
|
|
int Rule::GetScope(const Parameter ¶ms) const |
|
{ |
|
size_t scope = 0; |
|
bool previousIsAmbiguous = false; |
|
|
|
if (m_source[0]->IsNonTerm()) { |
|
scope++; |
|
previousIsAmbiguous = true; |
|
} |
|
|
|
for (size_t i = 1; i < m_source.GetSize(); ++i) { |
|
const RuleSymbol *symbol = m_source[i]; |
|
bool isAmbiguous = symbol->IsNonTerm(); |
|
if (isAmbiguous) { |
|
|
|
const NonTerm *nt = static_cast<const NonTerm*>(symbol); |
|
isAmbiguous = nt->IsHiero(Moses::Input, params); |
|
} |
|
|
|
if (isAmbiguous && previousIsAmbiguous) { |
|
scope++; |
|
} |
|
previousIsAmbiguous = isAmbiguous; |
|
} |
|
|
|
if (previousIsAmbiguous) { |
|
scope++; |
|
} |
|
|
|
return scope; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
} |
|
|
|
template<typename T> |
|
bool Contains(const T *sought, const set<const T*> &coll) |
|
{ |
|
std::set<const Word*>::const_iterator iter; |
|
for (iter = coll.begin(); iter != coll.end(); ++iter) { |
|
const Word *found = *iter; |
|
if (sought->CompareString(*found) == 0) { |
|
return true; |
|
} |
|
} |
|
return false; |
|
} |
|
|
|
bool Rule::ContainTerm(const ConsistentPhrase &cp, const std::set<const Word*> &terms) const |
|
{ |
|
const Phrase &sourceSentence = m_alignedSentence.GetPhrase(Moses::Input); |
|
|
|
for (int pos = cp.corners[0]; pos <= cp.corners[1]; ++pos) { |
|
const Word *soughtWord = sourceSentence[pos]; |
|
|
|
|
|
if (Contains(soughtWord, terms)) { |
|
return true; |
|
} |
|
} |
|
return false; |
|
} |
|
|
|
bool CompareTargetNonTerms(const NonTerm *a, const NonTerm *b) |
|
{ |
|
|
|
return a->GetConsistentPhrase().corners[2] < b->GetConsistentPhrase().corners[2]; |
|
} |
|
|
|
void Rule::CreateTarget(const Parameter ¶ms) |
|
{ |
|
if (!m_isValid) { |
|
return; |
|
} |
|
|
|
vector<const NonTerm*> targetNonTerm(m_nonterms); |
|
std::sort(targetNonTerm.begin(), targetNonTerm.end(), CompareTargetNonTerms); |
|
|
|
const NonTerm *cp = NULL; |
|
size_t nonTermInd = 0; |
|
if (nonTermInd < targetNonTerm.size()) { |
|
cp = targetNonTerm[nonTermInd]; |
|
} |
|
|
|
for (int targetPos = m_lhs.GetConsistentPhrase().corners[2]; |
|
targetPos <= m_lhs.GetConsistentPhrase().corners[3]; |
|
++targetPos) { |
|
|
|
const RuleSymbol *ruleSymbol; |
|
if (cp && cp->GetConsistentPhrase().corners[2] <= targetPos && targetPos <= cp->GetConsistentPhrase().corners[3]) { |
|
|
|
ruleSymbol = cp; |
|
targetPos = cp->GetConsistentPhrase().corners[3]; |
|
if (targetNonTerm.size()) { |
|
cp = targetNonTerm[nonTermInd]; |
|
} |
|
|
|
|
|
++nonTermInd; |
|
cp = (nonTermInd < targetNonTerm.size()) ? targetNonTerm[nonTermInd] : NULL; |
|
} else { |
|
|
|
ruleSymbol = m_alignedSentence.GetPhrase(Moses::Output)[targetPos]; |
|
} |
|
|
|
m_target.Add(ruleSymbol); |
|
} |
|
|
|
CreateAlignments(); |
|
} |
|
|
|
|
|
void Rule::CreateAlignments() |
|
{ |
|
for (size_t sourcePos = 0; sourcePos < m_source.GetSize(); ++sourcePos) { |
|
const RuleSymbol *symbol = m_source[sourcePos]; |
|
if (!symbol->IsNonTerm()) { |
|
|
|
const Word &sourceWord = static_cast<const Word&>(*symbol); |
|
const std::set<const Word *> &targetWords = sourceWord.GetAlignment(); |
|
CreateAlignments(sourcePos, targetWords); |
|
} else { |
|
|
|
CreateAlignments(sourcePos, symbol); |
|
} |
|
} |
|
} |
|
|
|
void Rule::CreateAlignments(int sourcePos, const std::set<const Word *> &targetWords) |
|
{ |
|
std::set<const Word *>::const_iterator iterTarget; |
|
for (iterTarget = targetWords.begin(); iterTarget != targetWords.end(); ++iterTarget) { |
|
const Word *targetWord = *iterTarget; |
|
CreateAlignments(sourcePos, targetWord); |
|
} |
|
} |
|
|
|
void Rule::CreateAlignments(int sourcePos, const RuleSymbol *targetSought) |
|
{ |
|
|
|
for (size_t targetPos = 0; targetPos < m_target.GetSize(); ++targetPos) { |
|
const RuleSymbol *foundSymbol = m_target[targetPos]; |
|
if (targetSought == foundSymbol) { |
|
pair<int, int> alignPoint(sourcePos, targetPos); |
|
m_alignments.insert(alignPoint); |
|
return; |
|
} |
|
} |
|
|
|
throw "not found"; |
|
} |
|
|
|
void Rule::CreateProperties(const Parameter ¶ms) |
|
{ |
|
|
|
|
|
|
|
if (params.spanLength && m_nonterms.size()) { |
|
stringstream strme; |
|
strme << "{{SpanLength "; |
|
|
|
for (size_t i = 0; i < m_nonterms.size(); ++i) { |
|
const NonTerm &nonTerm = *m_nonterms[i]; |
|
const ConsistentPhrase &cp = nonTerm.GetConsistentPhrase(); |
|
strme << i << "," << cp.GetWidth(Moses::Input) << "," << cp.GetWidth(Moses::Output) << " "; |
|
} |
|
strme << "}}"; |
|
|
|
m_properties.push_back(strme.str()); |
|
} |
|
|
|
if (params.ruleLength && m_nonterms.size()) { |
|
const ConsistentPhrase &cp = m_lhs.GetConsistentPhrase(); |
|
|
|
stringstream strme; |
|
strme << "{{RuleLength "; |
|
strme << cp.GetWidth(Moses::Input); |
|
strme << "}}"; |
|
|
|
m_properties.push_back(strme.str()); |
|
} |
|
|
|
|
|
if (params.nonTermContext && m_nonterms.size()) { |
|
stringstream strme; |
|
strme << "{{NonTermContext "; |
|
|
|
int factor = params.nonTermContextFactor; |
|
|
|
for (size_t i = 0; i < m_nonterms.size(); ++i) { |
|
const NonTerm &nonTerm = *m_nonterms[i]; |
|
const ConsistentPhrase &cp = nonTerm.GetConsistentPhrase(); |
|
NonTermContext(1, factor, i, cp, strme); |
|
} |
|
strme << "}}"; |
|
|
|
m_properties.push_back(strme.str()); |
|
} |
|
|
|
|
|
if (params.nonTermContextTarget && m_nonterms.size()) { |
|
stringstream strme; |
|
strme << "{{NonTermContextTarget "; |
|
|
|
int factor = params.nonTermContextFactor; |
|
|
|
for (size_t i = 0; i < m_nonterms.size(); ++i) { |
|
const NonTerm &nonTerm = *m_nonterms[i]; |
|
const ConsistentPhrase &cp = nonTerm.GetConsistentPhrase(); |
|
NonTermContext(2, factor, i, cp, strme); |
|
} |
|
strme << "}}"; |
|
|
|
m_properties.push_back(strme.str()); |
|
} |
|
|
|
} |
|
|
|
|
|
|