|
#include "HwcmScorer.h" |
|
|
|
#include <fstream> |
|
|
|
#include "ScoreStats.h" |
|
#include "Util.h" |
|
|
|
#include "util/tokenize_piece.hh" |
|
|
|
|
|
|
|
|
|
|
|
using namespace std; |
|
|
|
namespace MosesTuning |
|
{ |
|
|
|
|
|
// Constructs the scorer. The scorer name "HWCM" and the caller-supplied
// configuration string are forwarded unchanged to StatisticsBasedScorer;
// no other initialisation happens here.
HwcmScorer::HwcmScorer(const string& config)
  : StatisticsBasedScorer("HWCM", config)
{
}
|
|
|
// Destructor with an empty body: nothing is released explicitly here.
HwcmScorer::~HwcmScorer()
{
}
|
|
|
void HwcmScorer::setReferenceFiles(const vector<string>& referenceFiles) |
|
{ |
|
|
|
if (referenceFiles.size() != 1) { |
|
throw runtime_error("HWCM only supports a single reference"); |
|
} |
|
m_ref_trees.clear(); |
|
m_ref_hwc.clear(); |
|
ifstream in((referenceFiles[0] + ".trees").c_str()); |
|
if (!in) { |
|
throw runtime_error("Unable to open " + referenceFiles[0] + ".trees"); |
|
} |
|
string line; |
|
while (getline(in,line)) { |
|
line = this->preprocessSentence(line); |
|
TreePointer tree (boost::make_shared<InternalTree>(line)); |
|
m_ref_trees.push_back(tree); |
|
vector<map<string, int> > hwc (kHwcmOrder); |
|
vector<string> history(kHwcmOrder); |
|
extractHeadWordChain(tree, history, hwc); |
|
m_ref_hwc.push_back(hwc); |
|
vector<int> totals(kHwcmOrder); |
|
for (size_t i = 0; i < kHwcmOrder; i++) { |
|
for (map<string, int>::const_iterator it = m_ref_hwc.back()[i].begin(); it != m_ref_hwc.back()[i].end(); it++) { |
|
totals[i] += it->second; |
|
} |
|
} |
|
m_ref_lengths.push_back(totals); |
|
} |
|
TRACE_ERR(endl); |
|
|
|
} |
|
|
|
// Recursively accumulates head-word chain counts for `tree` into `hwc`.
//
// A node whose GetLength() is not positive contributes nothing. Otherwise
// the node's head is looked up with getHead():
//   - empty head: the children are visited with the caller's `history`;
//   - non-empty head: the head is counted in hwc[0]; every non-empty
//     history[k] is extended to "history[k] head" and counted in hwc[k+1];
//     the children are then visited with a new history holding the head at
//     index 0 and the extended chains at the following indices (the longest
//     chain is counted but not carried further).
// `history` itself is never modified.
void HwcmScorer::extractHeadWordChain(TreePointer tree, vector<string> & history, vector<map<string, int> > & hwc)
{
  if (!(tree->GetLength() > 0)) {
    return;
  }

  const string head = getHead(tree);

  // History handed down to the children: the incoming one unless this node
  // has a head, in which case `extended` is filled in and used instead.
  vector<string> extended;
  vector<string>* child_history = &history;

  if (!head.empty()) {
    extended.resize(kHwcmOrder);
    extended[0] = head;
    hwc[0][head]++;

    for (size_t order = 1; order < kHwcmOrder; ++order) {
      const string& prefix = history[order - 1];
      if (prefix.empty()) {
        continue;
      }
      const string chain = prefix + " " + head;
      hwc[order][chain]++;
      if (order + 1 < kHwcmOrder) {
        extended[order] = chain;
      }
    }
    child_history = &extended;
  }

  const std::vector<TreePointer>& children = tree->GetChildren();
  for (size_t c = 0; c < children.size(); ++c) {
    extractHeadWordChain(children[c], *child_history, hwc);
  }
}
|
|
|
string HwcmScorer::getHead(TreePointer tree) |
|
{ |
|
|
|
|
|
for (std::vector<TreePointer>::const_iterator it = tree->GetChildren().begin(); it != tree->GetChildren().end(); ++it) { |
|
TreePointer child = *it; |
|
|
|
if (child->GetLength() == 1 && child->GetChildren()[0]->IsTerminal()) { |
|
return child->GetChildren()[0]->GetLabel(); |
|
} |
|
} |
|
return ""; |
|
|
|
} |
|
|
|
void HwcmScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) |
|
{ |
|
if (sid >= m_ref_trees.size()) { |
|
stringstream msg; |
|
msg << "Sentence id (" << sid << ") not found in reference set"; |
|
throw runtime_error(msg.str()); |
|
} |
|
|
|
string sentence = this->preprocessSentence(text); |
|
|
|
|
|
|
|
util::TokenIter<util::MultiCharacter> it(sentence, util::MultiCharacter("|||")); |
|
++it; |
|
if (it) { |
|
sentence = it->as_string(); |
|
} |
|
|
|
TreePointer tree (boost::make_shared<InternalTree>(sentence)); |
|
vector<map<string, int> > hwc_test (kHwcmOrder); |
|
vector<string> history(kHwcmOrder); |
|
extractHeadWordChain(tree, history, hwc_test); |
|
|
|
ostringstream stats; |
|
for (size_t i = 0; i < kHwcmOrder; i++) { |
|
int correct = 0; |
|
int test_total = 0; |
|
for (map<string, int>::const_iterator it = hwc_test[i].begin(); it != hwc_test[i].end(); it++) { |
|
test_total += it->second; |
|
map<string, int>::const_iterator it2 = m_ref_hwc[sid][i].find(it->first); |
|
if (it2 != m_ref_hwc[sid][i].end()) { |
|
correct += std::min(it->second, it2->second); |
|
} |
|
} |
|
stats << correct << " " << test_total << " " << m_ref_lengths[sid][i] << " " ; |
|
} |
|
|
|
string stats_str = stats.str(); |
|
entry.set(stats_str); |
|
} |
|
|
|
float HwcmScorer::calculateScore(const vector<ScoreStatsType>& comps) const |
|
{ |
|
float precision = 0; |
|
float recall = 0; |
|
for (size_t i = 0; i < kHwcmOrder; i++) { |
|
float matches = comps[i*3]; |
|
float test_total = comps[1+(i*3)]; |
|
float ref_total = comps[2+(i*3)]; |
|
if (test_total > 0) { |
|
precision += matches/test_total; |
|
} |
|
if (ref_total > 0) { |
|
recall += matches/ref_total; |
|
} |
|
} |
|
|
|
precision /= (float)kHwcmOrder; |
|
recall /= (float)kHwcmOrder; |
|
return (2*precision*recall)/(precision+recall); |
|
} |
|
|
|
} |