// vim:tabstop=2

/***********************************************************************
 Moses - factored phrase-based language decoder
 Copyright (C) 2006 University of Edinburgh

 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.

 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
// Include paths follow the usual mosesdecoder source layout.
#include <algorithm>
#include <climits>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <string>
#include <vector>

#include <dirent.h>
#include <unistd.h>

#include "PhraseDictionaryFuzzyMatch.h"
#include "moses/InputFileStream.h"
#include "moses/StaticData.h"
#include "moses/Util.h"
#include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h"
#include "moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h"
#include "util/exception.hh"

using namespace std;
#if defined(__MINGW32__)
#include <windows.h>
#include <cerrno>

// MinGW-only fallback: there is no mkdtemp(), so emulate it with the Win32
// temp-path API. The trailing "XXXXXX" of the template is replaced by a
// random number (via the bundled util random helpers), the directory is
// created under the system temp path, and the full path is written back
// into the caller's buffer.
char *mkdtemp(char *tempbuf)
{
  int rand_value = 0;
  char* tempbase = NULL;
  char tempbasebuf[MAX_PATH] = "";

  if (strcmp(&tempbuf[strlen(tempbuf)-6], "XXXXXX")) {
    errno = EINVAL;
    return NULL;
  }

  util::rand_init();
  rand_value = util::rand_excl(1e6);
  tempbase = strrchr(tempbuf, '/');
  tempbase = tempbase ? tempbase+1 : tempbuf;

  strcpy(tempbasebuf, tempbase);
  sprintf(&tempbasebuf[strlen(tempbasebuf)-6], "%d", rand_value);
  ::GetTempPath(MAX_PATH, tempbuf);
  strcat(tempbuf, tempbasebuf);
  ::CreateDirectory(tempbuf, NULL);
  return tempbuf;
}
#endif // defined(__MINGW32__)

namespace Moses
{

PhraseDictionaryFuzzyMatch::PhraseDictionaryFuzzyMatch(const std::string &line)
  :PhraseDictionary(line, true)
  ,m_config(3)
  ,m_FuzzyMatchWrapper(NULL)
{
  ReadParameters();
}

PhraseDictionaryFuzzyMatch::~PhraseDictionaryFuzzyMatch()
{
  delete m_FuzzyMatchWrapper;
}

void PhraseDictionaryFuzzyMatch::Load(AllOptions::ptr const& opts)
{
  m_options = opts;
  SetFeaturesToApply();

  // m_config[0..2] hold the source, target and alignment file paths
  // collected by SetParameter() below.
  m_FuzzyMatchWrapper = new tmmt::FuzzyMatchWrapper(m_config[0], m_config[1], m_config[2]);
}

ChartRuleLookupManager *PhraseDictionaryFuzzyMatch::CreateRuleLookupManager(
  const ChartParser &parser,
  const ChartCellCollectionBase &cellCollection,
  std::size_t /*maxChartSpan*/)
{
  return new ChartRuleLookupManagerMemoryPerSentence(parser, cellCollection, *this);
}

void
PhraseDictionaryFuzzyMatch::
SetParameter(const std::string& key, const std::string& value)
{
  if (key == "source") {
    m_config[0] = value;
  } else if (key == "target") {
    m_config[1] = value;
  } else if (key == "alignment") {
    m_config[2] = value;
  } else {
    PhraseDictionary::SetParameter(key, value);
  }
}
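
// For illustration only: a moses.ini feature line using the keys handled
// above might look like the line below. The file names and the name= key are
// hypothetical; only source/target/alignment are defined by this class, the
// remaining keys come from the generic feature-function setup.
//
//   PhraseDictionaryFuzzyMatch name=TranslationModel0 source=corpus.src target=corpus.tgt alignment=corpus.align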

// Recursively deletes dirname and everything inside it using plain
// opendir()/readdir(); returns 1 on success, 0 if the directory could not be
// opened.
int removedirectoryrecursively(const char *dirname)
{
  //TODO(jie): replace this function with boost implementation
  DIR *dir;
  struct dirent *entry;
  char path[PATH_MAX];

  dir = opendir(dirname);
  if (dir == NULL) {
    perror("Error opendir()");
    return 0;
  }

  while ((entry = readdir(dir)) != NULL) {
    if (strcmp(entry->d_name, ".") && strcmp(entry->d_name, "..")) {
      snprintf(path, (size_t) PATH_MAX, "%s/%s", dirname, entry->d_name);
      if (entry->d_type == DT_DIR) {
        removedirectoryrecursively(path);
      }
      // delete the file (or the now-empty subdirectory)
      remove(path);
    }
  }
  closedir(dir);

  // the directory is empty now, so delete the directory itself
  rmdir(dirname);

  return 1;
}
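
// A sketch of the boost-based replacement suggested by the TODO above
// (illustrative only, not wired into this file):
//
//   #include <boost/filesystem.hpp>
//   boost::filesystem::remove_all(boost::filesystem::path(dirname));
//
// remove_all() removes the directory tree recursively and returns the number
// of entries deleted, which would make the hand-rolled traversal above
// unnecessary.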

void PhraseDictionaryFuzzyMatch::InitializeForInput(ttasksptr const& ttask)
{
  InputType const& inputSentence = *ttask->GetSource();

#if defined(__MINGW32__)
  // relative template for the Win32 mkdtemp() fallback defined above
  char dirName[] = "moses.XXXXXX";
#else
  char dirName[] = "/tmp/moses.XXXXXX";
#endif
  char *temp = mkdtemp(dirName);
  UTIL_THROW_IF2(temp == NULL,
                 "Couldn't create temporary directory " << dirName);

  string dirNameStr(dirName);
  string inFileName(dirNameStr + "/in");
  ofstream inFile(inFileName.c_str());
  for (size_t i = 1; i < inputSentence.GetSize() - 1; ++i) {
    inFile << inputSentence.GetWord(i);
  }
  inFile << endl;
  inFile.close();

  long translationId = inputSentence.GetTranslationId();
  string ptFileName = m_FuzzyMatchWrapper->Extract(translationId, dirNameStr);

  // populate with rules for this sentence
  PhraseDictionaryNodeMemory &rootNode = m_collection[translationId];
  FormatType format = MosesFormat;

  // data from file
  InputFileStream inStream(ptFileName);

  // copied from class LoaderStandard
  PrintUserTime("Start loading fuzzy-match phrase model");

  const StaticData &staticData = StaticData::Instance();

  string lineOrig;
  size_t count = 0;
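
  // Each extracted rule is expected in the usual Moses rule-table layout:
  //   source ||| target ||| scores ||| alignment [||| counts]
  // i.e. 4 or 5 "|||"-separated fields; anything else is rejected below.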
  while (getline(inStream, lineOrig)) {
    const string *line;
    if (format == HieroFormat) { // reformat line
      UTIL_THROW(util::Exception, "Cannot be Hiero format");
      //line = ReformatHieroRule(lineOrig);
    } else {
      // do nothing to format of line
      line = &lineOrig;
    }

    vector<string> tokens;
    vector<float> scoreVector;

    TokenizeMultiCharSeparator(tokens, *line, "|||");

    if (tokens.size() != 4 && tokens.size() != 5) {
      UTIL_THROW2("Syntax error at " << ptFileName << ":" << count);
    }

    const string &sourcePhraseString = tokens[0]
                 , &targetPhraseString = tokens[1]
                 , &scoreString        = tokens[2]
                 , &alignString        = tokens[3];

    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
    if (isLHSEmpty && !ttask->options()->unk.word_deletion_enabled) {
      TRACE_ERR( ptFileName << ":" << count << ": pt entry contains empty target, skipping\n");
      continue;
    }

    Tokenize<float>(scoreVector, scoreString);
    const size_t numScoreComponents = GetNumScoreComponents();
    if (scoreVector.size() != numScoreComponents) {
      UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!="
                  << numScoreComponents << ") of score components on line " << count);
    }
    UTIL_THROW_IF2(scoreVector.size() != numScoreComponents,
                   "Number of scores incorrectly specified");

    // parse source & find pt node

    // constituent labels
    Word *sourceLHS;
    Word *targetLHS;

    // source
    Phrase sourcePhrase(0);
    sourcePhrase.CreateFromString(Input, m_input, sourcePhraseString, &sourceLHS);

    // create target phrase obj
    TargetPhrase *targetPhrase = new TargetPhrase(this);
    targetPhrase->CreateFromString(Output, m_output, targetPhraseString, &targetLHS);

    // rest of target phrase
    targetPhrase->SetAlignmentInfo(alignString);
    targetPhrase->SetTargetLHS(targetLHS);
    //targetPhrase->SetDebugOutput(string("New Format pt ") + line);

    // component score, for n-best output
    std::transform(scoreVector.begin(), scoreVector.end(), scoreVector.begin(), TransformScore);
    std::transform(scoreVector.begin(), scoreVector.end(), scoreVector.begin(), FloorScore);

    targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
    targetPhrase->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply());

    TargetPhraseCollection::shared_ptr phraseColl
      = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase,
                                          *targetPhrase, sourceLHS);

    phraseColl->Add(targetPhrase);

    count++;

    if (format == HieroFormat) { // reformat line
      delete line;
    } else {
      // do nothing
    }
  }

  // sort and prune each target phrase collection
  SortAndPrune(rootNode);

  //removedirectoryrecursively(dirName);
}
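
// The per-sentence trie built above stays in m_collection, keyed by the
// sentence's translation id, until CleanUpAfterSentenceProcessing() erases
// it; the helpers below insert rules into that trie and fetch its root node.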

TargetPhraseCollection::shared_ptr
PhraseDictionaryFuzzyMatch::
GetOrCreateTargetPhraseCollection(PhraseDictionaryNodeMemory &rootNode
                                  , const Phrase &source
                                  , const TargetPhrase &target
                                  , const Word *sourceLHS)
{
  PhraseDictionaryNodeMemory &currNode = GetOrCreateNode(rootNode, source, target, sourceLHS);
  return currNode.GetTargetPhraseCollection();
}
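
// Walks (and extends) the trie one source symbol at a time: terminals are
// keyed by the source word, while non-terminals are resolved through the
// rule's non-terminal alignment and keyed first by the target label and then
// by the (source, target) label pair.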
PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetOrCreateNode(PhraseDictionaryNodeMemory &rootNode
    , const Phrase &source
    , const TargetPhrase &target
    , const Word *sourceLHS)
{
  cerr << source << endl << target << endl;
  const size_t size = source.GetSize();

  const AlignmentInfo &alignmentInfo = target.GetAlignNonTerm();
  AlignmentInfo::const_iterator iterAlign = alignmentInfo.begin();

  PhraseDictionaryNodeMemory *currNode = &rootNode;
  for (size_t pos = 0 ; pos < size ; ++pos) {
    const Word& word = source.GetWord(pos);

    if (word.IsNonTerminal()) {
      // indexed by source label 1st
      const Word &sourceNonTerm = word;

      UTIL_THROW_IF2(iterAlign == alignmentInfo.end(),
                     "No alignment for non-term at position " << pos);
      UTIL_THROW_IF2(iterAlign->first != pos,
                     "Alignment info incorrect at position " << pos);

      size_t targetNonTermInd = iterAlign->second;
      ++iterAlign;
      const Word &targetNonTerm = target.GetWord(targetNonTermInd);

      currNode = currNode->GetOrCreateNonTerminalChild(targetNonTerm);
      currNode = currNode->GetOrCreateChild(sourceNonTerm, targetNonTerm);
    } else {
      currNode = currNode->GetOrCreateChild(word);
    }

    UTIL_THROW_IF2(currNode == NULL,
                   "Node not found at position " << pos);
  }

  // finally, the source LHS
  //currNode = currNode->GetOrCreateChild(sourceLHS);

  return *currNode;
}

void PhraseDictionaryFuzzyMatch::SortAndPrune(PhraseDictionaryNodeMemory &rootNode)
{
  if (GetTableLimit()) {
    rootNode.Sort(GetTableLimit());
  }
}

void PhraseDictionaryFuzzyMatch::CleanUpAfterSentenceProcessing(const InputType &source)
{
  m_collection.erase(source.GetTranslationId());
}

const PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(long translationId) const
{
  std::map<long, PhraseDictionaryNodeMemory>::const_iterator iter = m_collection.find(translationId);
  UTIL_THROW_IF2(iter == m_collection.end(),
                 "Couldn't find root node for input: " << translationId);
  return iter->second;
}

PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(const InputType &source)
{
  long transId = source.GetTranslationId();
  std::map<long, PhraseDictionaryNodeMemory>::iterator iter = m_collection.find(transId);
  UTIL_THROW_IF2(iter == m_collection.end(),
                 "Couldn't find root node for input: " << transId);
  return iter->second;
}

TO_STRING_BODY(PhraseDictionaryFuzzyMatch);

// friend
ostream& operator<<(ostream& out, const PhraseDictionaryFuzzyMatch& phraseDict)
{
  /*
  typedef PhraseDictionaryNodeMemory::TerminalMap TermMap;
  typedef PhraseDictionaryNodeMemory::NonTerminalMap NonTermMap;

  const PhraseDictionaryNodeMemory &coll = phraseDict.m_collection;
  for (NonTermMap::const_iterator p = coll.m_nonTermMap.begin(); p != coll.m_nonTermMap.end(); ++p) {
    const Word &sourceNonTerm = p->first.first;
    out << sourceNonTerm;
  }
  for (TermMap::const_iterator p = coll.m_sourceTermMap.begin(); p != coll.m_sourceTermMap.end(); ++p) {
    const Word &sourceTerm = p->first;
    out << sourceTerm;
  }
  */

  return out;
}
} | |