|
|
|
|
|
|
|
|
|
|
|
|
|
#include <stdexcept>

#include <boost/foreach.hpp>
#include "StoreTarget.h"
#include "line_splitter.h"
#include "probing_hash_utils.h"
#include "OutputFileStream.h"
#include "moses2/legacy/Util2.h"
|
|
|
using namespace std; |
|
|
|
namespace probingpt |
|
{ |
|
|
|
/**
 * Sets up the output side of the target-phrase store.
 *
 * Records the base directory, initialises m_vocab with
 * "<basepath>/TargetVocab.dat", and opens "<basepath>/TargetColl.dat" for
 * binary output, truncating any existing content.
 *
 * @param basepath directory in which the data files are placed.
 * @throws const char* (a string literal) if TargetColl.dat cannot be opened.
 */
StoreTarget::StoreTarget(const std::string &basepath)
  : m_basePath(basepath)
  , m_vocab(basepath + "/TargetVocab.dat")
{
  const std::ios::openmode mode =
    std::ios::out | std::ios::binary | std::ios::ate | std::ios::trunc;
  const std::string collPath = basepath + "/TargetColl.dat";

  m_fileTargetColl.open(collPath.c_str(), mode);

  const bool opened = m_fileTargetColl.is_open();
  if (!opened) {
    throw "can't create file ";
  }
}
|
|
|
/**
 * Closes the target collection file and saves the vocabulary.
 *
 * All rules queued by Append() are expected to have been written out via
 * Save() beforehand; this is checked with an assert only.
 */
StoreTarget::~StoreTarget()
{
  // Save() empties m_coll, so anything left here was never written.
  assert(m_coll.empty());

  m_fileTargetColl.close();
  m_vocab.Save();
}
|
|
|
uint64_t StoreTarget::Save() |
|
{ |
|
uint64_t ret = m_fileTargetColl.tellp(); |
|
|
|
|
|
uint64_t numTP = m_coll.size(); |
|
m_fileTargetColl.write((char*) &numTP, sizeof(uint64_t)); |
|
|
|
for (size_t i = 0; i < m_coll.size(); ++i) { |
|
Save(*m_coll[i]); |
|
} |
|
|
|
|
|
Moses2::RemoveAllInColl(m_coll); |
|
m_coll.clear(); |
|
|
|
|
|
return ret; |
|
} |
|
|
|
void StoreTarget::Save(const target_text &rule) |
|
{ |
|
|
|
TargetPhraseInfo tpInfo; |
|
tpInfo.alignTerm = GetAlignId(rule.word_align_term); |
|
tpInfo.alignNonTerm = GetAlignId(rule.word_align_non_term); |
|
tpInfo.numWords = rule.target_phrase.size(); |
|
tpInfo.propLength = rule.property.size(); |
|
|
|
|
|
m_fileTargetColl.write((char*) &tpInfo, sizeof(TargetPhraseInfo)); |
|
|
|
|
|
for (size_t i = 0; i < rule.prob.size(); ++i) { |
|
float prob = rule.prob[i]; |
|
m_fileTargetColl.write((char*) &prob, sizeof(prob)); |
|
} |
|
|
|
|
|
for (size_t i = 0; i < rule.target_phrase.size(); ++i) { |
|
uint32_t vocabId = rule.target_phrase[i]; |
|
m_fileTargetColl.write((char*) &vocabId, sizeof(vocabId)); |
|
} |
|
|
|
|
|
|
|
} |
|
|
|
void StoreTarget::SaveAlignment() |
|
{ |
|
std::string path = m_basePath + "/Alignments.dat"; |
|
probingpt::OutputFileStream file(path); |
|
|
|
BOOST_FOREACH(Alignments::value_type &valPair, m_aligns) { |
|
file << valPair.second << "\t"; |
|
|
|
const std::vector<size_t> &aligns = valPair.first; |
|
BOOST_FOREACH(size_t align, aligns) { |
|
file << align << " "; |
|
} |
|
file << endl; |
|
} |
|
|
|
} |
|
|
|
void StoreTarget::Append(const line_text &line, bool log_prob, bool scfg) |
|
{ |
|
target_text *rule = new target_text; |
|
|
|
|
|
|
|
vector<bool> nonTerms; |
|
util::TokenIter<util::SingleCharacter> it; |
|
it = util::TokenIter<util::SingleCharacter>(line.target_phrase, |
|
util::SingleCharacter(' ')); |
|
while (it) { |
|
StringPiece word = *it; |
|
|
|
|
|
bool nonTerm = false; |
|
if (scfg) { |
|
|
|
if (scfg && word[0] == '[' && word[word.size() - 1] == ']') { |
|
|
|
nonTerm = true; |
|
} |
|
nonTerms.push_back(nonTerm); |
|
} |
|
|
|
util::TokenIter<util::SingleCharacter> itFactor; |
|
itFactor = util::TokenIter<util::SingleCharacter>(word, |
|
util::SingleCharacter('|')); |
|
while (itFactor) { |
|
StringPiece factor = *itFactor; |
|
|
|
string factorStr = factor.as_string(); |
|
uint32_t vocabId = m_vocab.GetVocabId(factorStr); |
|
|
|
rule->target_phrase.push_back(vocabId); |
|
|
|
itFactor++; |
|
} |
|
|
|
it++; |
|
} |
|
|
|
|
|
it = util::TokenIter<util::SingleCharacter>(line.prob, |
|
util::SingleCharacter(' ')); |
|
while (it) { |
|
string tok = it->as_string(); |
|
float prob = Moses2::Scan<float>(tok); |
|
|
|
if (log_prob) { |
|
prob = Moses2::FloorScore(log(prob)); |
|
if (prob == 0.0f) prob = 0.0000000001; |
|
} |
|
|
|
rule->prob.push_back(prob); |
|
it++; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
it = util::TokenIter<util::SingleCharacter>(line.word_align, |
|
util::SingleCharacter(' ')); |
|
while (it) { |
|
string tokPair = Moses2::Trim(it->as_string()); |
|
if (tokPair.empty()) { |
|
break; |
|
} |
|
|
|
vector<size_t> alignPair = Moses2::Tokenize<size_t>(tokPair, "-"); |
|
assert(alignPair.size() == 2); |
|
|
|
bool nonTerm = false; |
|
size_t sourcePos = alignPair[0]; |
|
size_t targetPos = alignPair[1]; |
|
if (scfg) { |
|
nonTerm = nonTerms[targetPos]; |
|
} |
|
|
|
|
|
|
|
if (nonTerm) { |
|
rule->word_align_non_term.push_back(sourcePos); |
|
rule->word_align_non_term.push_back(targetPos); |
|
|
|
} else { |
|
rule->word_align_term.push_back(sourcePos); |
|
rule->word_align_term.push_back(targetPos); |
|
} |
|
|
|
it++; |
|
} |
|
|
|
|
|
string prop = line.property.as_string(); |
|
AppendLexRO(prop, rule->prob, log_prob); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
m_coll.push_back(rule); |
|
} |
|
|
|
uint32_t StoreTarget::GetAlignId(const std::vector<size_t> &align) |
|
{ |
|
boost::unordered_map<std::vector<size_t>, uint32_t>::iterator iter = |
|
m_aligns.find(align); |
|
if (iter == m_aligns.end()) { |
|
uint32_t ind = m_aligns.size(); |
|
m_aligns[align] = ind; |
|
return ind; |
|
} else { |
|
return iter->second; |
|
} |
|
} |
|
|
|
void StoreTarget::AppendLexRO(std::string &prop, std::vector<float> &retvector, |
|
bool log_prob) const |
|
{ |
|
size_t startPos = prop.find("{{LexRO "); |
|
|
|
if (startPos != string::npos) { |
|
size_t endPos = prop.find("}}", startPos + 8); |
|
string lexProb = prop.substr(startPos + 8, endPos - startPos - 8); |
|
|
|
|
|
|
|
vector<float> scores = Moses2::Tokenize<float>(lexProb); |
|
|
|
if (log_prob) { |
|
for (size_t i = 0; i < scores.size(); ++i) { |
|
scores[i] = Moses2::FloorScore(log(scores[i])); |
|
if (scores[i] == 0.0f) scores[i] = 0.0000000001; |
|
} |
|
} |
|
|
|
for (size_t i = 0; i < scores.size(); ++i) { |
|
retvector.push_back(scores[i]); |
|
} |
|
|
|
|
|
prop = prop.substr(0, startPos) |
|
+ prop.substr(endPos + 2, prop.size() - endPos - 2); |
|
|
|
} |
|
} |
|
|
|
} |
|
|