|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <sstream> |
|
#include "moses/Util.h" |
|
#include "AlignedSentence.h" |
|
#include "Parameter.h" |
|
|
|
using namespace std; |
|
|
|
|
|
|
|
/**
 * Build an aligned sentence pair from its textual components.
 *
 * @param lineNum   value stored in m_lineNum (reported by Debug()).
 * @param source    source sentence; tokenized into m_source.
 * @param target    target sentence; tokenized into m_target.
 * @param alignment alignment line handed to PopulateAlignment().
 */
AlignedSentence::AlignedSentence(int lineNum,
                                 const std::string &source,
                                 const std::string &target,
                                 const std::string &alignment)
  : m_lineNum(lineNum)
{
  // Both word vectors have to be filled first: PopulateAlignment() looks up
  // words in m_source and m_target by position.
  PopulateWordVec(m_source, source);
  PopulateWordVec(m_target, target);

  PopulateAlignment(alignment);
}
|
|
|
/**
 * Clean up the word vectors.
 *
 * The Word objects stored in m_source and m_target are allocated with `new`
 * in PopulateWordVec(); they are handed to Moses::RemoveAllInColl() here
 * (presumably deleting every element - confirm against moses/Util.h).
 */
AlignedSentence::~AlignedSentence()
{
  Moses::RemoveAllInColl(m_source);
  Moses::RemoveAllInColl(m_target);
}
|
|
|
/**
 * Fill a phrase with one heap-allocated Word per token of a sentence.
 *
 * @param vec  phrase to fill; resized to the number of tokens, then every
 *             slot is assigned a newly allocated Word.
 * @param line sentence text, split with Moses::Tokenize using its default
 *             delimiters.
 *
 * Each Word is constructed from its position in the sentence and its token
 * string. The pointers are raw; for m_source / m_target they are passed to
 * Moses::RemoveAllInColl() in the destructor.
 */
void AlignedSentence::PopulateWordVec(Phrase &vec, const std::string &line)
{
  std::vector<std::string> tokens;
  Moses::Tokenize(tokens, line);

  vec.resize(tokens.size());

  const size_t numWords = vec.size();
  for (size_t pos = 0; pos < numWords; ++pos) {
    vec[pos] = new Word(pos, tokens[pos]);
  }
}
|
|
|
void AlignedSentence::PopulateAlignment(const std::string &line) |
|
{ |
|
vector<string> alignStr; |
|
Moses::Tokenize(alignStr, line); |
|
|
|
for (size_t i = 0; i < alignStr.size(); ++i) { |
|
vector<int> alignPair; |
|
Moses::Tokenize(alignPair, alignStr[i], "-"); |
|
assert(alignPair.size() == 2); |
|
|
|
int sourcePos = alignPair[0]; |
|
int targetPos = alignPair[1]; |
|
|
|
if (sourcePos >= m_source.size()) { |
|
cerr << "ERROR1:AlignedSentence=" << Debug() << endl; |
|
cerr << "m_source=" << m_source.size() << endl; |
|
abort(); |
|
} |
|
assert(sourcePos < m_source.size()); |
|
assert(targetPos < m_target.size()); |
|
Word *sourceWord = m_source[sourcePos]; |
|
Word *targetWord = m_target[targetPos]; |
|
|
|
sourceWord->AddAlignment(targetWord); |
|
targetWord->AddAlignment(sourceWord); |
|
} |
|
} |
|
|
|
std::string AlignedSentence::Debug() const |
|
{ |
|
stringstream out; |
|
out << "m_lineNum:"; |
|
out << m_lineNum; |
|
out << endl; |
|
|
|
out << "m_source:"; |
|
out << m_source.Debug(); |
|
out << endl; |
|
|
|
out << "m_target:"; |
|
out << m_target.Debug(); |
|
out << endl; |
|
|
|
out << "consistent phrases:" << endl; |
|
out << m_consistentPhrases.Debug(); |
|
out << endl; |
|
|
|
return out.str(); |
|
} |
|
|
|
std::vector<int> AlignedSentence::GetSourceAlignmentCount() const |
|
{ |
|
vector<int> ret(m_source.size()); |
|
|
|
for (size_t i = 0; i < m_source.size(); ++i) { |
|
const Word &word = *m_source[i]; |
|
ret[i] = word.GetAlignmentIndex().size(); |
|
} |
|
return ret; |
|
} |
|
|
|
void AlignedSentence::Create(const Parameter ¶ms) |
|
{ |
|
CreateConsistentPhrases(params); |
|
m_consistentPhrases.AddHieroNonTerms(params); |
|
} |
|
|
|
void AlignedSentence::CreateConsistentPhrases(const Parameter ¶ms) |
|
{ |
|
int countT = m_target.size(); |
|
int countS = m_source.size(); |
|
|
|
m_consistentPhrases.Initialize(countS); |
|
|
|
|
|
for(int lengthT=1; |
|
lengthT <= params.maxSpan && lengthT <= countT; |
|
lengthT++) { |
|
for(int startT=0; startT < countT-(lengthT-1); startT++) { |
|
|
|
|
|
int endT = startT + lengthT - 1; |
|
|
|
|
|
|
|
int minS = 9999; |
|
int maxS = -1; |
|
vector< int > usedS = GetSourceAlignmentCount(); |
|
for(int ti=startT; ti<=endT; ti++) { |
|
const Word &word = *m_target[ti]; |
|
const std::set<int> &alignment = word.GetAlignmentIndex(); |
|
|
|
std::set<int>::const_iterator iterAlign; |
|
for(iterAlign = alignment.begin(); iterAlign != alignment.end(); ++iterAlign) { |
|
int si = *iterAlign; |
|
if (si<minS) { |
|
minS = si; |
|
} |
|
if (si>maxS) { |
|
maxS = si; |
|
} |
|
usedS[ si ]--; |
|
} |
|
} |
|
|
|
|
|
if( maxS == -1 ) |
|
continue; |
|
|
|
|
|
size_t width = maxS - minS + 1; |
|
|
|
if( width < params.minSpan ) |
|
continue; |
|
|
|
if( width > params.maxSpan ) |
|
continue; |
|
|
|
|
|
bool out_of_bounds = false; |
|
for(int si=minS; si<=maxS && !out_of_bounds; si++) |
|
if (usedS[si]>0) { |
|
out_of_bounds = true; |
|
} |
|
|
|
|
|
if (out_of_bounds) |
|
continue; |
|
|
|
|
|
|
|
for(int startS=minS; |
|
(startS>=0 && |
|
startS>maxS - params.maxSpan && |
|
(startS==minS || m_source[startS]->GetAlignment().size()==0)); |
|
startS--) { |
|
|
|
for(int endS=maxS; |
|
(endS<countS && endS<startS + params.maxSpan && |
|
(endS==maxS || m_source[endS]->GetAlignment().size()==0)); |
|
endS++) { |
|
|
|
|
|
m_consistentPhrases.Add(startS, endS, startT, endT, params); |
|
} |
|
} |
|
} |
|
} |
|
} |
|
|