|
#include "RDLM.h" |
|
#include <vector> |
|
#include "moses/StaticData.h" |
|
#include "moses/ScoreComponentCollection.h" |
|
#include "moses/ChartHypothesis.h" |
|
#include "moses/InputFileStream.h" |
|
#include "moses/Util.h" |
|
#include "util/exception.hh" |
|
#include "neuralTM.h" |
|
|
|
namespace Moses |
|
{ |
|
|
|
namespace rdlm |
|
{ |
|
// Per-thread clones of the shared NPLM models, so each decoder thread can run
// lookups with its own premultiplied state and cache without locking.
ThreadLocal::ThreadLocal(nplm::neuralTM *lm_head_base_instance_, nplm::neuralTM *lm_label_base_instance_, bool normalizeHeadLM, bool normalizeLabelLM, int cacheSize)
{
  // clone and configure the head model
  lm_head = new nplm::neuralTM(*lm_head_base_instance_);
  lm_head->set_normalization(normalizeHeadLM);
  lm_head->set_cache(cacheSize);

  // clone and configure the label model
  lm_label = new nplm::neuralTM(*lm_label_base_instance_);
  lm_label->set_normalization(normalizeLabelLM);
  lm_label->set_cache(cacheSize);
}
|
|
|
// Releases this thread's private model clones.
ThreadLocal::~ThreadLocal()
{
  // the two clones are independent heap objects; destruction order is irrelevant
  delete lm_label;
  delete lm_head;
}
|
|
|
} |
|
|
|
typedef Eigen::Map<Eigen::Matrix<int,Eigen::Dynamic,1> > EigenMap; |
|
|
|
// Releases the shared base model instances allocated in Load().
RDLM::~RDLM()
{
  // the base instances are independent heap objects; destruction order is irrelevant
  delete lm_label_base_instance_;
  delete lm_head_base_instance_;
}
|
|
|
// Loads the head and label NPLM models, registers this feature function as
// the decoder's tree-structure scorer, precomputes the n-gram layout offsets
// and the vocabulary IDs of all special symbols used by Score(), and
// (optionally) runs the file-scoring debug mode.
//
// n-gram layout (head model; the label model lacks the final head position):
//   [left heads | left labels | right heads | right labels |
//    ancestor heads | ancestor labels | label | head]
void RDLM::Load(AllOptions::ptr const& opts)
{
  lm_head_base_instance_ = new nplm::neuralTM();
  lm_head_base_instance_->read(m_path_head_lm);

  // if input and output vocabularies of the head model are identical, an
  // input-side head ID can be reused on the output side (see GetIDs())
  m_sharedVocab = lm_head_base_instance_->get_input_vocabulary().words() == lm_head_base_instance_->get_output_vocabulary().words();

  lm_label_base_instance_ = new nplm::neuralTM();
  lm_label_base_instance_->read(m_path_label_lm);

  if (m_premultiply) {
    // premultiply network weights once up-front for faster lookups
    lm_head_base_instance_->premultiply();
    lm_label_base_instance_->premultiply();
  }

  lm_head_base_instance_->set_cache(m_cacheSize);
  lm_label_base_instance_->set_cache(m_cacheSize);

  // register as the decoder-wide tree-structure feature (first RDLM wins)
  StaticData &staticData = StaticData::InstanceNonConst();
  if (staticData.GetTreeStructure() == NULL) {
    staticData.SetTreeStructure(this);
  }

  // start offsets of the ancestor-head / ancestor-label sections in an n-gram
  offset_up_head = 2*m_context_left + 2*m_context_right;
  offset_up_label = 2*m_context_left + 2*m_context_right + m_context_up;

  // total n-gram sizes; the head model conditions on one token more
  // (the label of the node whose head is predicted)
  size_head = 2*m_context_left + 2*m_context_right + 2*m_context_up + 2;
  size_label = 2*m_context_left + 2*m_context_right + 2*m_context_up + 1;

  UTIL_THROW_IF2(size_head != lm_head_base_instance_->get_order(),
                 "Error: order of head LM (" << lm_head_base_instance_->get_order() << ") does not match context size specified (left_context=" << m_context_left << " , right_context=" << m_context_right << " , up_context=" << m_context_up << " for a total order of " << size_head);
  UTIL_THROW_IF2(size_label != lm_label_base_instance_->get_order(),
                 "Error: order of label LM (" << lm_label_base_instance_->get_order() << ") does not match context size specified (left_context=" << m_context_left << " , right_context=" << m_context_right << " , up_context=" << m_context_up << " for a total order of " << size_label);

  // position-dependent <null_N> IDs used to pre-fill n-grams
  static_head_null.resize(size_head);
  for (unsigned int i = 0; i < size_head; i++) {
    char numstr[20];
    // snprintf with %u: i is unsigned (the original used unbounded sprintf
    // with %d, a format/argument mismatch)
    snprintf(numstr, sizeof(numstr), "<null_%u>", i);
    static_head_null[i] = lm_head_base_instance_->lookup_input_word(numstr);
  }

  static_label_null.resize(size_label);
  for (unsigned int i = 0; i < size_label; i++) {
    char numstr[20];
    snprintf(numstr, sizeof(numstr), "<null_%u>", i);
    static_label_null[i] = lm_label_base_instance_->lookup_input_word(numstr);
  }

  // fallback head for constituents whose head word cannot be determined
  static_dummy_head = lm_head_base_instance_->lookup_input_word(dummy_head.GetString(0).as_string());

  // boundary and placeholder symbols (input side of the head model; output
  // side of the label model where the label is the predicted token)
  static_start_head = lm_head_base_instance_->lookup_input_word("<start_head>");
  static_start_label = lm_head_base_instance_->lookup_input_word("<start_label>");

  static_head_head = lm_head_base_instance_->lookup_input_word("<head_head>");
  static_head_label = lm_head_base_instance_->lookup_input_word("<head_label>");
  static_head_label_output = lm_label_base_instance_->lookup_output_word("<head_label>");

  static_stop_head = lm_head_base_instance_->lookup_input_word("<stop_head>");
  static_stop_label = lm_head_base_instance_->lookup_input_word("<stop_label>");
  static_stop_label_output = lm_label_base_instance_->lookup_output_word("<stop_label>");
  static_start_label_output = lm_label_base_instance_->lookup_output_word("<start_label>");

  static_root_head = lm_head_base_instance_->lookup_input_word("<root_head>");
  static_root_label = lm_head_base_instance_->lookup_input_word("<root_label>");

  // debug mode: score a file of trees, then terminate the process
  if (!m_debugPath.empty()) {
    ScoreFile(m_debugPath);
    exit(1);
  }
}
|
|
|
|
|
// Recursively computes RDLM scores for every nonterminal node of the tree.
// score[0]/score[2] accumulate head-LM / label-LM log-probs computed with
// complete ancestor context; score[1]/score[3] accumulate provisional scores
// whose context may still change when the hypothesis is extended upward, and
// these are additionally folded into boundary_hash so that hypotheses with
// different yet-unscored boundary material are not recombined.
// num_virtual counts how many levels we are below a virtual (binarization)
// node; rescoring_levels limits how deep previously-scored subtrees are
// traversed again once more ancestor context becomes available.
//
// Fix vs. the previous revision: in the children loop, the two
// std::copy(static_label_null...) padding calls discarded the returned
// iterator, so `it` was not advanced past the right-padding region and the
// following copies wrote into the wrong n-gram positions. Both now assign
// the result back to `it`, mirroring the std::fill_n branches.
void RDLM::Score(InternalTree* root, const TreePointerMap & back_pointers, boost::array<float, 4> &score, size_t &boundary_hash, rdlm::ThreadLocal &thread_objects, int num_virtual, int rescoring_levels) const
{
  // terminal: predicted through its preterminal parent; nothing to do here
  if (root->IsTerminal()) {
    return;
  }

  // glue rules are not part of the syntax model; just descend
  if (root->GetLabel() == m_glueSymbol) {
    for (std::vector<TreePointer>::const_iterator it = root->GetChildren().begin(); it != root->GetChildren().end(); ++it) {
      Score(it->get(), back_pointers, score, boundary_hash, thread_objects, num_virtual, rescoring_levels);
    }
    return;
  }

  std::vector<int> &ancestor_heads = thread_objects.ancestor_heads;
  std::vector<int> &ancestor_labels = thread_objects.ancestor_labels;

  // virtual nodes introduced by binarization (label starts with '^') are
  // transparent: their children are scored as children of the real ancestor
  if (m_binarized && root->GetLabel().GetString(0).as_string()[0] == '^' && !ancestor_heads.empty()) {
    // virtual leaf NT: follow the back pointer into the antecedent hypothesis
    // and re-score the levels that now have more ancestor context
    if (root->IsLeafNT() && m_context_up > 1 && ancestor_heads.size()) {
      root = back_pointers.find(root)->second.get();
      rescoring_levels = m_context_up-1;
    }
    for (std::vector<TreePointer>::const_iterator it = root->GetChildren().begin(); it != root->GetChildren().end(); ++it) {
      Score(it->get(), back_pointers, score, boundary_hash, thread_objects, num_virtual, rescoring_levels);
    }
    return;
  }

  // sentence boundary markers are not scored
  if (root->GetLabel() == m_startSymbol || root->GetLabel() == m_endSymbol) {
    return;
  }

  // preterminal node: score its terminal (the head word) with the head model;
  // only done at the tree root (empty context) or with full root context
  if (root->GetLength() == 1 && root->GetChildren()[0]->IsTerminal()) {

    if (ancestor_heads.empty() || (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head)) {
      std::vector<int> & ngram = thread_objects.ngram;
      ngram = static_head_null;
      ngram.back() = Factor2ID(root->GetChildren()[0]->GetLabel()[m_factorType], HEAD_OUTPUT);
      // back off to the preterminal label for unknown head words
      if (m_isPretermBackoff && ngram.back() == 0) {
        ngram.back() = Factor2ID(root->GetLabel()[m_factorType], HEAD_OUTPUT);
      }
      if (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head) {
        // full-sentence context: fill the sibling sections with boundary
        // symbols and the ancestor sections with the root context
        std::vector<int>::iterator it = ngram.begin();
        std::fill_n(it, m_context_left, static_start_head);
        it += m_context_left;
        std::fill_n(it, m_context_left, static_start_label);
        it += m_context_left;
        std::fill_n(it, m_context_right, static_stop_head);
        it += m_context_right;
        std::fill_n(it, m_context_right, static_stop_label);
        it += m_context_right;
        size_t context_up_nonempty = std::min(m_context_up, ancestor_heads.size());
        it = std::copy(ancestor_heads.end()-context_up_nonempty, ancestor_heads.end(), it);
        it = std::copy(ancestor_labels.end()-context_up_nonempty, ancestor_labels.end(), it);
      }
      if (ancestor_labels.size() >= m_context_up && !num_virtual) {
        // exact score
        score[0] += FloorScore(thread_objects.lm_head->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
      } else {
        // provisional score: remember the predicted token in the state hash
        boost::hash_combine(boundary_hash, ngram.back());
        score[1] += FloorScore(thread_objects.lm_head->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
      }
    }
    return;

  } else if (root->IsLeafNT()) {
    // leaf NT: descend into the antecedent hypothesis only if deeper
    // ancestor context makes re-scoring worthwhile
    if (m_context_up > 1 && ancestor_heads.size()) {
      root = back_pointers.find(root)->second.get();
      // preterminals are only scored once (their score cannot change)
      if (root->GetLength() == 1 && root->GetChildren()[0]->IsTerminal()) {
        return;
      }
      rescoring_levels = m_context_up-1;
    } else {
      return;
    }
  }

  // determine the head word of this constituent
  std::pair<int,int> head_ids;
  bool found = GetHead(root, back_pointers, head_ids);
  if (!found) {
    head_ids = std::make_pair(static_dummy_head, static_dummy_head);
  }

  size_t context_up_nonempty = std::min(m_context_up, ancestor_heads.size());
  const StringPiece & head_label = root->GetLabel().GetString(0);
  bool virtual_head = false;
  int reached_end = 0;
  int label_idx, label_idx_out;
  if (m_binarized && head_label[0] == '^') {
    virtual_head = true;
    // which constituent boundary does this virtual node carry?
    // 1 = left edge, 2 = right edge (3 = both, non-virtual case below)
    if (m_binarized == 1 || (m_binarized == 3 && head_label[2] == 'l')) {
      reached_end = 1;
    } else if (m_binarized == 2 || (m_binarized == 3 && head_label[2] == 'r')) {
      reached_end = 2;
    }

    // strip the binarization prefix ('^' or '^l'/'^r') to get the real label
    StringPiece clipped_label = (m_binarized == 3) ? head_label.substr(2,head_label.size()-2) : head_label.substr(1,head_label.size()-1);
    label_idx = lm_label_base_instance_->lookup_input_word(clipped_label.as_string());
    label_idx_out = lm_label_base_instance_->lookup_output_word(clipped_label.as_string());
  } else {
    reached_end = 3; // non-virtual node: both boundaries are visible
    label_idx = Factor2ID(root->GetLabel()[0], LABEL_INPUT);
    label_idx_out = Factor2ID(root->GetLabel()[0], LABEL_OUTPUT);
  }

  // virtual nodes with unknown head borrow the null symbol of their position
  int head_idx = (virtual_head && head_ids.first == static_dummy_head) ? static_label_null[offset_up_head+m_context_up-1] : head_ids.first;

  // score the node's own label (and head) at the tree root / with root context
  if (ancestor_heads.empty() || (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head)) {
    std::vector<int> & ngram = thread_objects.ngram;
    ngram = static_label_null;
    ngram.back() = label_idx_out;
    if (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head && !num_virtual) {
      std::vector<int>::iterator it = ngram.begin();
      std::fill_n(it, m_context_left, static_start_head);
      it += m_context_left;
      std::fill_n(it, m_context_left, static_start_label);
      it += m_context_left;
      std::fill_n(it, m_context_right, static_stop_head);
      it += m_context_right;
      std::fill_n(it, m_context_right, static_stop_label);
      it += m_context_right;
      it = std::copy(ancestor_heads.end()-context_up_nonempty, ancestor_heads.end(), it);
      it = std::copy(ancestor_labels.end()-context_up_nonempty, ancestor_labels.end(), it);
      score[2] += FloorScore(thread_objects.lm_label->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
    } else {
      boost::hash_combine(boundary_hash, ngram.back());
      score[3] += FloorScore(thread_objects.lm_label->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
    }
    // also predict the head word, unless it is a placeholder
    if (head_idx != static_dummy_head && head_idx != static_head_head) {
      ngram.push_back(head_ids.second);
      *(ngram.end()-2) = label_idx;
      if (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head && !num_virtual) {
        score[0] += FloorScore(thread_objects.lm_head->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
      } else {
        boost::hash_combine(boundary_hash, ngram.back());
        score[1] += FloorScore(thread_objects.lm_head->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
      }
    }
  }

  // this node becomes part of the ancestor context for its children
  ancestor_heads.push_back(head_idx);
  ancestor_labels.push_back(label_idx);

  if (virtual_head) {
    num_virtual = m_context_up;
  } else if (num_virtual) {
    --num_virtual;
  }

  if (context_up_nonempty < m_context_up) {
    ++context_up_nonempty;
  }
  size_t up_padding = m_context_up - context_up_nonempty;

  std::vector<int> & ngram = thread_objects.ngram;
  ngram = static_label_null;

  // write the ancestor context; missing levels keep their <null_N> padding
  std::vector<int>::iterator it = ngram.begin() + offset_up_head;
  if (up_padding > 0) {
    it += up_padding;
  }

  it = std::copy(ancestor_heads.end() - context_up_nonempty, ancestor_heads.end(), it);

  if (up_padding > 0) {
    it += up_padding;
  }

  it = std::copy(ancestor_labels.end() - context_up_nonempty, ancestor_labels.end(), it);

  int num_children = root->GetLength();

  // with binarization, count the children of the unbinarized constituent
  if (m_binarized) {
    num_children = 0;
    UnbinarizedChildren real_children(root, back_pointers, m_binarized, thread_objects.stack);
    for (std::vector<TreePointer>::const_iterator it = real_children.begin(); !real_children.ended(); it = ++real_children) {
      num_children++;
    }
  }

  // virtual <start>/<stop> children at visible constituent boundaries
  if (m_context_right && (reached_end == 1 || reached_end == 3)) num_children++;
  if (m_context_left && (reached_end == 2 || reached_end == 3)) num_children++;
  std::vector<int> & heads = thread_objects.heads;
  std::vector<int> & labels = thread_objects.labels;
  std::vector<int> & heads_output = thread_objects.heads_output;
  std::vector<int> & labels_output = thread_objects.labels_output;

  heads.resize(num_children);
  labels.resize(num_children);
  heads_output.resize(num_children);
  labels_output.resize(num_children);

  GetChildHeadsAndLabels(root, back_pointers, reached_end, thread_objects);

  // pre-fill the left sibling sections with start symbols if the left
  // boundary is visible
  if (reached_end == 1 || reached_end == 3) {
    std::fill_n(ngram.begin(), m_context_left, static_start_head);
    std::fill_n(ngram.begin() + m_context_left, m_context_left, static_start_label);
  }
  size_t left_padding = m_context_left;
  size_t left_offset = 0;
  size_t right_offset = std::min(heads.size(), m_context_right + 1);
  size_t right_padding = m_context_right + 1 - right_offset;

  // slide a window over the children, predicting each child's label and head
  // from its left/right siblings and the ancestor context
  for (size_t i = 0; i != heads.size(); i++) {

    std::vector<int>::iterator it = ngram.begin();

    // skip left padding (already holds start/null symbols)
    if (left_padding > 0) {
      it += left_padding;
    }

    it = std::copy(heads.begin()+left_offset, heads.begin()+i, it);

    if (left_padding > 0) {
      it += left_padding;
    }

    it = std::copy(labels.begin()+left_offset, labels.begin()+i, it);

    it = std::copy(heads.begin()+i+1, heads.begin()+right_offset, it);

    if (right_padding > 0) {
      if (reached_end == 2 || reached_end == 3) {
        std::fill_n(it, right_padding, static_stop_head);
        it += right_padding;
      } else {
        // restore null symbols (positions may hold stale values from the
        // previous iteration) and advance past the padding
        it = std::copy(static_label_null.begin()+offset_up_head-m_context_right-right_padding, static_label_null.begin()-m_context_right+offset_up_head, it);
      }
    }

    it = std::copy(labels.begin()+i+1, labels.begin()+right_offset, it);

    if (right_padding > 0) {
      if (reached_end == 2 || reached_end == 3) {
        std::fill_n(it, right_padding, static_stop_label);
        it += right_padding;
      } else {
        it = std::copy(static_label_null.begin()+offset_up_head-right_padding, static_label_null.begin()+offset_up_head, it);
      }
    }

    // predict the child's label
    ngram.back() = labels_output[i];

    if (ancestor_labels.size() >= m_context_up && !num_virtual) {
      score[2] += FloorScore(thread_objects.lm_label->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
    } else {
      boost::hash_combine(boundary_hash, ngram.back());
      score[3] += FloorScore(thread_objects.lm_label->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
    }

    // predict the child's head word, unless it is a boundary/placeholder
    if (heads[i] != static_start_head && heads[i] != static_stop_head && heads[i] != static_dummy_head && heads[i] != static_head_head) {

      ngram.back() = labels[i];
      ngram.push_back(heads_output[i]);

      if (ancestor_labels.size() >= m_context_up && !num_virtual) {
        score[0] += FloorScore(thread_objects.lm_head->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
      } else {
        boost::hash_combine(boundary_hash, ngram.back());
        score[1] += FloorScore(thread_objects.lm_head->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
      }
      ngram.pop_back();
    }

    // advance the sliding window
    if (left_padding)
      left_padding--;
    else
      left_offset++;

    if (right_offset < heads.size())
      right_offset++;
    else
      right_padding++;
  }

  // when re-scoring an old subtree, stop once the extra context is exhausted
  if (rescoring_levels == 1) {
    ancestor_heads.pop_back();
    ancestor_labels.pop_back();
    return;
  }

  for (std::vector<TreePointer>::const_iterator it = root->GetChildren().begin(); it != root->GetChildren().end(); ++it) {
    Score(it->get(), back_pointers, score, boundary_hash, thread_objects, num_virtual, rescoring_levels - 1);
  }
  ancestor_heads.pop_back();
  ancestor_labels.pop_back();
}
|
|
|
// Finds the head word of this constituent: the first preterminal child
// (left-to-right), searching through virtual binarization nodes recursively.
// On success, writes the head's input/output vocabulary IDs into IDs and
// returns true; returns false if no preterminal child exists.
bool RDLM::GetHead(InternalTree* root, const TreePointerMap & back_pointers, std::pair<int,int> & IDs) const
{
  const std::vector<TreePointer> &children = root->GetChildren();
  for (std::vector<TreePointer>::const_iterator child = children.begin(); child != children.end(); ++child) {
    // leaf NTs are resolved to the corresponding subtree of the antecedent hypothesis
    InternalTree *node = (*child)->IsLeafNT() ? back_pointers.find(child->get())->second.get() : child->get();

    if (m_binarized && node->GetLabel().GetString(0).as_string()[0] == '^') {
      // virtual (binarization) node: search its children recursively
      if (GetHead(node, back_pointers, IDs)) {
        return true;
      }
    } else if (node->GetLength() == 1 && node->GetChildren()[0]->IsTerminal()) {
      // preterminal: its terminal child is the head word
      GetIDs(node->GetChildren()[0]->GetLabel(), node->GetLabel(), IDs);
      return true;
    }
  }

  return false;
}
|
|
|
|
|
// Fills thread_objects.heads/labels/heads_output/labels_output with the head
// and label vocabulary IDs of root's (unbinarized) children, plus virtual
// <start>/<stop> entries at constituent boundaries that are visible
// (reached_end: 1 = left edge, 2 = right edge, 3 = both).
// The four output vectors have been pre-sized by the caller (Score()).
void RDLM::GetChildHeadsAndLabels(InternalTree *root, const TreePointerMap & back_pointers, int reached_end, rdlm::ThreadLocal &thread_objects) const
{
  std::pair<int,int> child_ids;
  size_t j = 0; // write position into the output vectors

  std::vector<int> & heads = thread_objects.heads;
  std::vector<int> & labels = thread_objects.labels;
  std::vector<int> & heads_output = thread_objects.heads_output;
  std::vector<int> & labels_output = thread_objects.labels_output;

  // virtual start symbol at the left edge (only relevant with right context)
  if (m_context_right && (reached_end == 1 || reached_end == 3)) {
    heads[j] = static_start_head;
    labels[j] = static_start_label;
    labels_output[j] = static_start_label_output;
    j++;
  }

  // iterate over the children as they were before binarization
  UnbinarizedChildren real_children(root, back_pointers, m_binarized, thread_objects.stack);

  for (std::vector<TreePointer>::const_iterator itx = real_children.begin(); !real_children.ended(); itx = ++real_children) {
    if ((*itx)->IsTerminal()) {
      // malformed tree: report it and skip this child
      std::cerr << "non-terminal node " << root->GetLabel() << " has a mix of terminal and non-terminal children. This shouldn't happen..." << std::endl;
      std::cerr << "children: ";
      for (std::vector<TreePointer>::const_iterator itx2 = root->GetChildren().begin(); itx2 != root->GetChildren().end(); ++itx2) {
        std::cerr << (*itx2)->GetLabel() << " ";
      }
      std::cerr << std::endl;

      // shrink the pre-sized output vectors by the skipped child
      heads.pop_back();
      labels.pop_back();
      heads_output.pop_back();
      labels_output.pop_back();
      continue;
    }
    InternalTree* child = itx->get();

    // resolve leaf NTs to the corresponding subtree of the antecedent hypothesis
    if ((*itx)->IsLeafNT()) {
      child = back_pointers.find(itx->get())->second.get();
    }

    // preterminal child: it supplies the head of the current node, so it is
    // represented by the <head_head>/<head_label> placeholders here
    if (child->GetLength() == 1 && child->GetChildren()[0]->IsTerminal()) {
      heads[j] = static_head_head;
      labels[j] = static_head_label;
      labels_output[j] = static_head_label_output;
      j++;
      continue;
    }

    bool found = GetHead(child, back_pointers, child_ids);
    if (!found) {
      // no head found: fall back to the dummy head symbol
      child_ids = std::make_pair(static_dummy_head, static_dummy_head);
    }

    labels[j] = Factor2ID(child->GetLabel()[0], LABEL_INPUT);
    labels_output[j] = Factor2ID(child->GetLabel()[0], LABEL_OUTPUT);
    heads[j] = child_ids.first;
    heads_output[j] = child_ids.second;
    j++;
  }

  // virtual stop symbol at the right edge (only relevant with left context)
  if (m_context_left && (reached_end == 2 || reached_end == 3)) {
    heads[j] = static_stop_head;
    labels[j] = static_stop_label;
    labels_output[j] = static_stop_label_output;
  }
}
|
|
|
|
|
void RDLM::GetIDs(const Word & head, const Word & preterminal, std::pair<int,int> & IDs) const |
|
{ |
|
IDs.first = Factor2ID(head[m_factorType], HEAD_INPUT); |
|
if (m_isPretermBackoff && IDs.first == 0) { |
|
IDs.first = Factor2ID(preterminal[0], HEAD_INPUT); |
|
} |
|
if (m_sharedVocab) { |
|
IDs.second = IDs.first; |
|
} else { |
|
IDs.second = Factor2ID(head[m_factorType], HEAD_OUTPUT); |
|
if (m_isPretermBackoff && IDs.second == 0) { |
|
IDs.second = Factor2ID(preterminal[0], HEAD_OUTPUT); |
|
} |
|
} |
|
} |
|
|
|
|
|
int RDLM::Factor2ID(const Factor * const factor, int model_type) const |
|
{ |
|
size_t ID = factor->GetId(); |
|
int ret; |
|
|
|
std::vector<int>* cache = NULL; |
|
switch(model_type) { |
|
case LABEL_INPUT: |
|
cache = &factor2id_label_input; |
|
break; |
|
case LABEL_OUTPUT: |
|
cache = &factor2id_label_output; |
|
break; |
|
case HEAD_INPUT: |
|
cache = &factor2id_head_input; |
|
break; |
|
case HEAD_OUTPUT: |
|
cache = &factor2id_head_output; |
|
break; |
|
} |
|
|
|
try { |
|
ret = cache->at(ID); |
|
} catch (const std::out_of_range& oor) { |
|
#ifdef WITH_THREADS |
|
m_accessLock.unlock_shared(); |
|
m_accessLock.lock(); |
|
#endif |
|
cache->resize(ID*2, -1); |
|
#ifdef WITH_THREADS |
|
m_accessLock.unlock(); |
|
m_accessLock.lock_shared(); |
|
#endif |
|
ret = -1; |
|
} |
|
if (ret == -1) { |
|
switch(model_type) { |
|
case LABEL_INPUT: |
|
ret = lm_label_base_instance_->lookup_input_word(factor->GetString().as_string()); |
|
break; |
|
case LABEL_OUTPUT: |
|
ret = lm_label_base_instance_->lookup_output_word(factor->GetString().as_string()); |
|
break; |
|
case HEAD_INPUT: |
|
ret = lm_head_base_instance_->lookup_input_word(factor->GetString().as_string()); |
|
break; |
|
case HEAD_OUTPUT: |
|
ret = lm_head_base_instance_->lookup_output_word(factor->GetString().as_string()); |
|
break; |
|
} |
|
(*cache)[ID] = ret; |
|
} |
|
|
|
return ret; |
|
} |
|
|
|
void RDLM::PrintInfo(std::vector<int> &ngram, nplm::neuralTM* lm) const |
|
{ |
|
for (size_t i = 0; i < ngram.size()-1; i++) { |
|
std::cerr << lm->get_input_vocabulary().words()[ngram[i]] << " "; |
|
} |
|
std::cerr << lm->get_output_vocabulary().words()[ngram.back()] << " "; |
|
|
|
for (size_t i = 0; i < ngram.size(); i++) { |
|
std::cerr << ngram[i] << " "; |
|
} |
|
std::cerr << "score: " << lm->lookup_ngram(ngram) << std::endl; |
|
} |
|
|
|
|
|
// Builds a map from each leaf nonterminal of root (enumerated left-to-right)
// to the subtree of the corresponding antecedent hypothesis, so that Score()
// can look through rule boundaries.
RDLM::TreePointerMap RDLM::AssociateLeafNTs(InternalTree* root, const std::vector<TreePointer> &previous) const
{
  TreePointerMap associations;
  InternalTree::leafNT next_leafNT(root); // yields leaf NTs of root in order
  std::vector<TreePointer>::iterator leaf;

  for (std::vector<TreePointer>::const_iterator prev = previous.begin(); prev != previous.end(); ++prev) {
    if (next_leafNT(leaf)) {
      associations[leaf->get()] = *prev;
    } else {
      // more antecedents than leaf NTs: should be impossible for valid rules
      std::cerr << "Warning: leaf nonterminal not found in rule; why did this happen?\n";
    }
  }
  return associations;
}
|
|
|
void RDLM::ScoreFile(std::string &path) |
|
{ |
|
InputFileStream inStream(path); |
|
rdlm::ThreadLocal *thread_objects = thread_objects_backend_.get(); |
|
if (!thread_objects) { |
|
thread_objects = new rdlm::ThreadLocal(lm_head_base_instance_, lm_label_base_instance_, m_normalizeHeadLM, m_normalizeLabelLM, m_cacheSize); |
|
thread_objects_backend_.reset(thread_objects); |
|
} |
|
std::string line, null; |
|
thread_objects->ancestor_heads.resize(0); |
|
thread_objects->ancestor_labels.resize(0); |
|
thread_objects->ancestor_heads.resize(m_context_up, static_root_head); |
|
thread_objects->ancestor_labels.resize(m_context_up, static_root_label); |
|
#ifdef WITH_THREADS |
|
|
|
m_accessLock.lock_shared(); |
|
#endif |
|
while(getline(inStream, line)) { |
|
TreePointerMap back_pointers; |
|
boost::array<float, 4> score; |
|
score.fill(0); |
|
InternalTree* mytree (new InternalTree(line)); |
|
size_t boundary_hash = 0; |
|
Score(mytree, back_pointers, score, boundary_hash, *thread_objects); |
|
std::cerr << "head LM: " << score[0] << "label LM: " << score[2] << std::endl; |
|
} |
|
#ifdef WITH_THREADS |
|
m_accessLock.unlock_shared(); |
|
#endif |
|
} |
|
|
|
|
|
void RDLM::SetParameter(const std::string& key, const std::string& value) |
|
{ |
|
std::cerr << "setting: " << this->GetScoreProducerDescription() << " - " << key << "\n"; |
|
if (key == "tuneable") { |
|
m_tuneable = Scan<bool>(value); |
|
} else if (key == "filterable") { |
|
} else if (key == "path_head_lm") { |
|
m_path_head_lm = value; |
|
} else if (key == "path_label_lm") { |
|
m_path_label_lm = value; |
|
} else if (key == "backoff") { |
|
m_isPretermBackoff = Scan<bool>(value); |
|
} else if (key == "context_up") { |
|
m_context_up = Scan<size_t>(value); |
|
} else if (key == "context_left") { |
|
m_context_left = Scan<size_t>(value); |
|
} else if (key == "context_right") { |
|
m_context_right = Scan<size_t>(value); |
|
} else if (key == "debug_path") { |
|
m_debugPath = value; |
|
} else if (key == "premultiply") { |
|
m_premultiply = Scan<bool>(value); |
|
} else if (key == "rerank") { |
|
m_rerank = Scan<bool>(value); |
|
} else if (key == "normalize_head_lm") { |
|
m_normalizeHeadLM = Scan<bool>(value); |
|
} else if (key == "normalize_label_lm") { |
|
m_normalizeLabelLM = Scan<bool>(value); |
|
} else if (key == "binarized") { |
|
if (value == "left") |
|
m_binarized = 1; |
|
else if (value == "right") |
|
m_binarized = 2; |
|
else if (value == "full") |
|
m_binarized = 3; |
|
else |
|
UTIL_THROW(util::Exception, "Unknown value for argument " << key << "=" << value); |
|
} else if (key == "glue_symbol") { |
|
m_glueSymbolString = value; |
|
} else if (key == "factor") { |
|
m_factorType = Scan<FactorType>(value); |
|
} else if (key == "cache_size") { |
|
m_cacheSize = Scan<int>(value); |
|
} else { |
|
UTIL_THROW(util::Exception, "Unknown argument " << key << "=" << value); |
|
} |
|
} |
|
|
|
|
|
// Chart-decoder hook: reads the "Tree" phrase property of the applied rule,
// associates its leaf NTs with the subtrees of the antecedent hypotheses, and
// scores the (partial) tree with the RDLM. Scores go into two dense feature
// slots: head LM at m_index, label LM at m_index+1. Approximate boundary
// scores of the antecedents (score[1]/score[3] of their states) are first
// subtracted, since that material is re-scored here with more context.
// Throws if the rule carries no internal tree structure.
FFState* RDLM::EvaluateWhenApplied(const ChartHypothesis& cur_hypo
                                   , int featureID
                                   , ScoreComponentCollection* accumulator) const
{
  if (const PhraseProperty *property = cur_hypo.GetCurrTargetPhrase().GetProperty("Tree")) {
    const std::string *tree = property->GetValueString();
    TreePointer mytree (boost::make_shared<InternalTree>(*tree));

    // collect antecedent subtrees and subtract their approximate scores
    std::vector<TreePointer> previous_trees;
    float prev_approx_head = 0, prev_approx_label = 0;
    for (size_t pos = 0; pos < cur_hypo.GetCurrTargetPhrase().GetSize(); ++pos) {
      const Word &word = cur_hypo.GetCurrTargetPhrase().GetWord(pos);
      if (word.IsNonTerminal()) {
        size_t nonTermInd = cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap()[pos];
        const RDLMState* prev = static_cast<const RDLMState*>(cur_hypo.GetPrevHypo(nonTermInd)->GetFFState(featureID));
        previous_trees.push_back(prev->GetTree());
        prev_approx_head -= prev->GetApproximateScoreHead();
        prev_approx_label -= prev->GetApproximateScoreLabel();
      }
    }
    size_t ff_idx = m_index; // dense feature index of the head-LM score

    accumulator->PlusEquals(ff_idx, prev_approx_head);
    accumulator->PlusEquals(ff_idx+1, prev_approx_label);

    // a hypothesis is a full sentence if its last child is (or ends in) the
    // end-of-sentence tag
    bool full_sentence = (mytree->GetChildren().back()->GetLabel() == m_endTag || (mytree->GetChildren().back()->GetLabel() == m_endSymbol && mytree->GetChildren().back()->GetChildren().back()->GetLabel() == m_endTag));

    TreePointerMap back_pointers = AssociateLeafNTs(mytree.get(), previous_trees);
    boost::array<float, 4> score;
    score.fill(0);

    size_t boundary_hash = 0;
    if (!m_rerank) {
      // normal mode: score the new tree fragment immediately
#ifdef WITH_THREADS
      // read lock for the Factor2ID caches (Factor2ID upgrades it on miss)
      m_accessLock.lock_shared();
#endif
      rdlm::ThreadLocal *thread_objects = thread_objects_backend_.get();
      if (!thread_objects) {
        // lazily create this thread's private model clones
        thread_objects = new rdlm::ThreadLocal(lm_head_base_instance_, lm_label_base_instance_, m_normalizeHeadLM, m_normalizeLabelLM, m_cacheSize);
        thread_objects_backend_.reset(thread_objects);
      }
      // full sentences get root-symbol ancestor context; fragments start empty
      thread_objects->ancestor_heads.resize(0);
      thread_objects->ancestor_labels.resize(0);
      thread_objects->ancestor_heads.resize((full_sentence ? m_context_up : 0), static_root_head);
      thread_objects->ancestor_labels.resize((full_sentence ? m_context_up : 0), static_root_label);
      Score(mytree.get(), back_pointers, score, boundary_hash, *thread_objects);
#ifdef WITH_THREADS
      m_accessLock.unlock_shared();
#endif
      accumulator->PlusEquals(ff_idx, score[0] + score[1]);
      accumulator->PlusEquals(ff_idx+1, score[2] + score[3]);
    }
    mytree->Combine(previous_trees);
    if (m_rerank && full_sentence) {
      // rerank mode: only score once the sentence is complete
#ifdef WITH_THREADS
      // read lock for the Factor2ID caches (Factor2ID upgrades it on miss)
      m_accessLock.lock_shared();
#endif
      rdlm::ThreadLocal *thread_objects = thread_objects_backend_.get();
      if (!thread_objects) {
        thread_objects = new rdlm::ThreadLocal(lm_head_base_instance_, lm_label_base_instance_, m_normalizeHeadLM, m_normalizeLabelLM, m_cacheSize);
        thread_objects_backend_.reset(thread_objects);
      }
      thread_objects->ancestor_heads.resize(0);
      thread_objects->ancestor_labels.resize(0);
      thread_objects->ancestor_heads.resize((full_sentence ? m_context_up : 0), static_root_head);
      thread_objects->ancestor_labels.resize((full_sentence ? m_context_up : 0), static_root_label);
      Score(mytree.get(), back_pointers, score, boundary_hash, *thread_objects);
#ifdef WITH_THREADS
      m_accessLock.unlock_shared();
#endif
      accumulator->PlusEquals(ff_idx, score[0] + score[1]);
      accumulator->PlusEquals(ff_idx+1, score[2] + score[3]);
    }
    if (m_binarized && full_sentence) {
      mytree->Unbinarize();
    }

    // score[1]/score[3] are the approximate parts that may be re-scored by a
    // dominating hypothesis; boundary_hash keeps such states distinct
    return new RDLMState(mytree, score[1], score[3], boundary_hash);
  } else {
    UTIL_THROW2("Error: RDLM active, but no internal tree structure found");
  }

}
|
|
|
} |
|
|