|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include "lexdecom.h" |
|
#include "scorer-impl.h" |
|
|
|
#include <iostream> |
|
#include <fstream> |
|
|
|
PhraseScorer* |
|
LexicalDecompositionPhraseScorer::create_scorer(const char *argv[], int &argp, bool reverse, const PhraseScorerFactory &ptf) |
|
{ |
|
|
|
if(argv[argp] == NULL) |
|
usage(); |
|
|
|
String lwfile(argv[argp++]); |
|
|
|
return new LexicalDecompositionPhraseScorer(ptf.get_phrase_table(), reverse, lwfile, argv, argp, ptf); |
|
} |
|
|
|
LexicalDecompositionPhraseScorer::LexicalDecompositionPhraseScorer(PhraseTable &pd, bool reverse, |
|
const String &weightfile, const char *argv[], int &argp, |
|
const PhraseScorerFactory &ptf) : |
|
PhraseScorer(pd, reverse) |
|
{ |
|
|
|
black_box_scorer = AbsoluteDiscountPhraseScorer::create_scorer(argv, argp, reverse, ptf); |
|
|
|
std::ifstream wfile(weightfile.c_str()); |
|
|
|
|
|
|
|
std::cerr<<"Reading lexical weights from '"<<weightfile<<"' ... "; |
|
|
|
while(!wfile.eof()) { |
|
if(wfile.fail()) { |
|
std::cerr << "Problem reading file: " << weightfile << std::endl; |
|
exit(1); |
|
} |
|
String src, tgt; |
|
Score weight; |
|
|
|
wfile >> src >> tgt >> weight; |
|
Count src_id = PhraseText::index_word(src); |
|
Count tgt_id = PhraseText::index_word(tgt); |
|
weight_map_.insert(std::make_pair(std::make_pair(src_id, tgt_id), weight)); |
|
} |
|
|
|
wfile.close(); |
|
std::cerr<<"done."<<std::endl; |
|
} |
|
|
|
Score |
|
LexicalDecompositionPhraseScorer::get_weight(const String &s_src, const String &s_tgt) const |
|
{ |
|
|
|
|
|
Count src = PhraseText::index_word(s_src); |
|
Count tgt = PhraseText::index_word(s_tgt); |
|
return get_weight(src, tgt); |
|
} |
|
|
|
inline Score |
|
LexicalDecompositionPhraseScorer::get_weight(Count src, Count tgt) const |
|
{ |
|
|
|
|
|
WeightMapType_::const_iterator it = weight_map_.find(std::make_pair(src, tgt)); |
|
if(it == weight_map_.end()) |
|
return 0.00001; |
|
return it->second; |
|
} |
|
|
|
void |
|
LexicalDecompositionPhraseScorer::do_score_phrases() |
|
{ |
|
|
|
|
|
|
|
black_box_scorer->score_phrases(); |
|
|
|
std::cerr<<"LexicalDecompositionPhraseScorer::do_score_phrases"<<std::endl; |
|
|
|
std::map<unsigned, std::map<unsigned, Count> > count_srclen_tgtlen; |
|
std::map<unsigned, Count> total_tgtlen; |
|
|
|
for(PhraseTable::iterator it = phrase_table_.begin(); it != phrase_table_.end(); it++) { |
|
const PhrasePairInfo &ppair = *it; |
|
PhraseInfo &src = phrase_table_.get_src_phrase(ppair.get_src()); |
|
PhraseInfo &tgt = phrase_table_.get_tgt_phrase(ppair.get_tgt()); |
|
unsigned src_len = src.get_phrase().size(); |
|
unsigned tgt_len = tgt.get_phrase().size(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
count_srclen_tgtlen[src_len][tgt_len]+=ppair.get_count(); |
|
total_tgtlen[tgt_len]+=ppair.get_count(); |
|
} |
|
|
|
std::map<unsigned, std::map<unsigned, Count> >::iterator its; |
|
std::map<unsigned, Count>::iterator itt; |
|
|
|
for (its=count_srclen_tgtlen.begin(); its!=count_srclen_tgtlen.end(); its++) { |
|
unsigned src_len=its->first; |
|
|
|
for(itt=its->second.begin(); itt!=its->second.end(); itt++) { |
|
unsigned tgt_len=itt->first; |
|
Count cnt=itt->second; |
|
prob_srclen_tgtlen_[src_len][tgt_len] = static_cast<Score>(cnt)/static_cast<Score>(total_tgtlen[tgt_len]); |
|
} |
|
} |
|
} |
|
|
|
Score |
|
LexicalDecompositionPhraseScorer::get_noisy_or_combination(Count src_word, PhraseInfo &tgt_phrase) |
|
{ |
|
|
|
Score sc=1.0; |
|
|
|
unsigned tgt_len=tgt_phrase.get_phrase().size(); |
|
|
|
for(unsigned i=0; i<tgt_len; i++) { |
|
Count tgt_word=tgt_phrase.get_phrase()[i]; |
|
sc *= (1.0 - get_weight(src_word, tgt_word)); |
|
} |
|
|
|
return (1.0 - sc); |
|
} |
|
|
|
Score |
|
LexicalDecompositionPhraseScorer::do_get_score(const PhraseTable::const_iterator &it) |
|
{ |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
PhraseInfo &src_phrase = phrase_table_.get_src_phrase(it->get_src()); |
|
PhraseInfo &tgt_phrase = phrase_table_.get_tgt_phrase(it->get_tgt()); |
|
|
|
unsigned src_len=src_phrase.get_phrase().size(); |
|
unsigned tgt_len=tgt_phrase.get_phrase().size(); |
|
|
|
Score prod=1.0; |
|
|
|
for(unsigned j=0; j<src_len; j++) |
|
prod *= get_noisy_or_combination(src_phrase.get_phrase()[j], tgt_phrase); |
|
|
|
Score lambda= static_cast<Score>(black_box_scorer->get_discount()) * |
|
tgt_phrase.get_distinct() / tgt_phrase.get_count(); |
|
|
|
Score ret_value = black_box_scorer->get_score(it) + (lambda * prod * prob_srclen_tgtlen_[src_len][tgt_len]); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return ret_value; |
|
} |
|
|