|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <string> |
|
#include <fstream> |
|
#include "OnDiskWrapper.h" |
|
#include "Vocab.h" |
|
#include "moses/Util.h" |
|
#include "util/exception.hh" |
|
|
|
using namespace std; |
|
|
|
namespace OnDiskPt |
|
{ |
|
|
|
/**
 * Populate the vocabulary from the vocab stream exposed by the wrapper.
 *
 * Each line read from onDiskWrapper.GetFileVocab() must tokenize into exactly
 * two fields: the word followed by its numeric id. The pairs are stored in
 * m_vocabColl; afterwards the reverse table m_lookup (id -> word) is rebuilt
 * and m_nextId is set to the size of that table.
 *
 * @param onDiskWrapper  provider of the vocab file stream
 * @return always true; malformed input is reported through UTIL_THROW_IF2
 */
bool Vocab::Load(OnDiskWrapper &onDiskWrapper)
{
  fstream &file = onDiskWrapper.GetFileVocab();

  string line;
  while (getline(file, line)) {
    // A fresh vector per line, so only this line's fields are counted below.
    vector<string> tokens;
    Moses::Tokenize(tokens, line);
    UTIL_THROW_IF2(tokens.size() != 2, "Vocab file corrupted");
    const string &key = tokens[0];
    m_vocabColl[key] = Moses::Scan<uint64_t>(tokens[1]);
  }

  // The table has one slot more than there are entries, so ids in the range
  // [0, number of entries] are addressable.
  m_lookup.resize(m_vocabColl.size() + 1);
  m_nextId = m_lookup.size();

  CollType::const_iterator iter;
  for (iter = m_vocabColl.begin(); iter != m_vocabColl.end(); ++iter) {
    // Keep the id at full width: narrowing it could alias a huge (invalid) id
    // onto a valid slot and hide the corruption.
    const uint64_t vocabId = iter->second;
    const std::string &word = iter->first;

    // vector::operator[] is unchecked, so reject ids that do not fit the
    // table instead of writing past its end.
    UTIL_THROW_IF2(vocabId >= m_lookup.size(), "Vocab file corrupted: id out of range");

    m_lookup[vocabId] = word;
  }

  return true;
}
|
|
|
/**
 * Write every (word, id) pair of m_vocabColl to the vocab stream exposed by
 * the wrapper, one "word id" pair per line.
 *
 * Ids are written as 64-bit values, the same width Load() parses them with,
 * so large ids are not truncated on their way to disk.
 *
 * @param onDiskWrapper  provider of the vocab file stream
 */
void Vocab::Save(OnDiskWrapper &onDiskWrapper)
{
  fstream &file = onDiskWrapper.GetFileVocab();

  CollType::const_iterator iterVocab;
  for (iterVocab = m_vocabColl.begin(); iterVocab != m_vocabColl.end(); ++iterVocab) {
    const string &word = iterVocab->first;
    const uint64_t vocabId = iterVocab->second;

    // '\n' rather than endl: no need to flush the stream after every entry.
    file << word << " " << vocabId << '\n';
  }

  // Single flush so the data has been pushed to the stream's sink on return.
  file.flush();
}
|
|
|
uint64_t Vocab::AddVocabId(const std::string &str) |
|
{ |
|
|
|
CollType::const_iterator iter = m_vocabColl.find(str); |
|
if (iter == m_vocabColl.end()) { |
|
|
|
m_vocabColl[str] = m_nextId; |
|
return m_nextId++; |
|
} else { |
|
|
|
return iter->second; |
|
} |
|
} |
|
|
|
uint64_t Vocab::GetVocabId(const std::string &str, bool &found) const |
|
{ |
|
|
|
CollType::const_iterator iter = m_vocabColl.find(str); |
|
if (iter == m_vocabColl.end()) { |
|
found = false; |
|
return 0; |
|
} else { |
|
|
|
found = true; |
|
return iter->second; |
|
} |
|
} |
|
|
|
} |
|
|