|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <algorithm> |
|
#include <iostream> |
|
#include <string> |
|
#include <vector> |
|
#include <iterator> |
|
#include <cassert> |
|
#include "moses/InputFileStream.h" |
|
#include "moses/Timer.h" |
|
#include "moses/Util.h" |
|
#include "OnDiskWrapper.h" |
|
#include "SourcePhrase.h" |
|
#include "TargetPhrase.h" |
|
#include "TargetPhraseCollection.h" |
|
#include "Word.h" |
|
#include "Vocab.h" |
|
#include "Main.h" |
|
|
|
using namespace std; |
|
using namespace OnDiskPt; |
|
|
|
int main (int argc, char * const argv[]) |
|
{ |
|
|
|
Moses::ResetUserTime(); |
|
Moses::PrintUserTime("Starting"); |
|
|
|
if (argc != 8) { |
|
std::cerr << "Usage: " << argv[0] << " numSourceFactors numTargetFactors numScores tableLimit sortScoreIndex inputPath outputPath" << std::endl; |
|
return 1; |
|
} |
|
|
|
int numSourceFactors = Moses::Scan<int>(argv[1]) |
|
, numTargetFactors = Moses::Scan<int>(argv[2]) |
|
, numScores = Moses::Scan<int>(argv[3]) |
|
, tableLimit = Moses::Scan<int>(argv[4]); |
|
TargetPhraseCollection::s_sortScoreInd = Moses::Scan<int>(argv[5]); |
|
assert(TargetPhraseCollection::s_sortScoreInd < numScores); |
|
|
|
const string filePath = argv[6] |
|
,destPath = argv[7]; |
|
|
|
Moses::InputFileStream inStream(filePath); |
|
|
|
OnDiskWrapper onDiskWrapper; |
|
onDiskWrapper.BeginSave(destPath, numSourceFactors, numTargetFactors, numScores); |
|
|
|
PhraseNode &rootNode = onDiskWrapper.GetRootSourceNode(); |
|
size_t lineNum = 0; |
|
string line; |
|
|
|
while(getline(inStream, line)) { |
|
lineNum++; |
|
if (lineNum%1000 == 0) cerr << "." << flush; |
|
if (lineNum%10000 == 0) cerr << ":" << flush; |
|
if (lineNum%100000 == 0) cerr << lineNum << flush; |
|
|
|
|
|
std::vector<float> misc(1); |
|
SourcePhrase sourcePhrase; |
|
TargetPhrase *targetPhrase = new TargetPhrase(numScores); |
|
OnDiskPt::PhrasePtr spShort = Tokenize(sourcePhrase, *targetPhrase, line, onDiskWrapper, numScores, misc); |
|
assert(misc.size() == onDiskWrapper.GetNumCounts()); |
|
|
|
rootNode.AddTargetPhrase(sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, misc, spShort); |
|
} |
|
|
|
rootNode.Save(onDiskWrapper, 0, tableLimit); |
|
onDiskWrapper.EndSave(); |
|
|
|
Moses::PrintUserTime("Finished"); |
|
|
|
|
|
return 0; |
|
|
|
} |
|
|
|
/**
 * Compare the current source phrase with the previous one.
 *
 * @param prevSourcePhrase previous source phrase, or NULL if there is none
 * @param currSourcePhrase current source phrase; asserted non-null whenever
 *                         prevSourcePhrase is non-null
 * @return false when prevSourcePhrase is NULL; otherwise the result of
 *         *currSourcePhrase > *prevSourcePhrase
 */
bool Flush(const OnDiskPt::SourcePhrase *prevSourcePhrase, const OnDiskPt::SourcePhrase *currSourcePhrase)
{
  if (prevSourcePhrase != NULL) {
    assert(currSourcePhrase);
    return *currSourcePhrase > *prevSourcePhrase;
  }

  // Nothing to compare against.
  return false;
}
|
|
|
/**
 * Parse one line of a text phrase table.
 *
 * The line is split on spaces; a "|||" token moves on to the next field.
 * Fields are consumed in this order:
 *   0: source tokens     - passed to the word-level Tokenize() with
 *                          sourcePhrase; every non-null word it returns is
 *                          also appended to the returned phrase
 *   1: target tokens     - passed to the word-level Tokenize() with
 *                          targetPhrase
 *   2: scores            - stored in targetPhrase at consecutive indices
 *   3: alignment tokens  - passed to TargetPhrase::CreateAlignFromString()
 *   4: a float           - stored in misc[0] (the last token wins)
 *   5: sparse features   - tokens re-joined with single spaces
 *   6: property          - tokens re-joined with single spaces
 * A token in any later field is reported on stderr and trips an assertion.
 *
 * @param sourcePhrase  receives the source-side words
 * @param targetPhrase  receives target words, scores, alignment, sparse
 *                      features and property
 * @param lineStr       the raw input line (not modified)
 * @param onDiskWrapper forwarded to the word-level Tokenize()
 * @param numScores     number of scores the line is asserted to contain
 * @param misc          misc[0] is overwritten when field 4 is present; must
 *                      then hold at least one element
 * @return the phrase built from the words returned for the source field
 */
OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase, const std::string &lineStr, OnDiskWrapper &onDiskWrapper, int numScores, vector<float> &misc)
{
  // strtok() writes NUL bytes into its input, so tokenise a private,
  // NUL-terminated copy. The copy lives in a std::vector (heap) rather than a
  // variable-length array: VLAs are a compiler extension in C++ and a very
  // long line could overflow the stack.
  std::vector<char> buffer(lineStr.begin(), lineStr.end());
  buffer.push_back('\0');
  char *line = &buffer[0];

  stringstream sparseFeatures, property;

  size_t scoreInd = 0;

  // Index of the "|||"-separated field currently being read.
  size_t stage = 0;

  OnDiskPt::PhrasePtr out(new Phrase());

  char *tok = strtok(line, " ");
  while (tok != NULL) {
    if (0 == strcmp(tok, "|||")) {
      ++stage;
    } else {
      switch (stage) {
      case 0: {
        WordPtr w = Tokenize(sourcePhrase, tok, true, true, onDiskWrapper, 1);
        if (w != NULL)
          out->AddWord(w);

        break;
      }
      case 1: {
        Tokenize(targetPhrase, tok, false, true, onDiskWrapper, 0);
        break;
      }
      case 2: {
        float score = Moses::Scan<float>(tok);
        targetPhrase.SetScore(score, scoreInd);
        ++scoreInd;
        break;
      }
      case 3: {
        targetPhrase.CreateAlignFromString(tok);
        break;
      }
      case 4: {
        assert(!misc.empty());
        float val = Moses::Scan<float>(tok);
        misc[0] = val;
        break;
      }
      case 5: {
        sparseFeatures << tok << " ";
        break;
      }
      case 6: {
        property << tok << " ";
        break;
      }
      default:
        // Report the untouched input line: the working buffer has already
        // been cut at the first delimiter by strtok().
        cerr << "ERROR in line " << lineStr << endl;
        assert(false);
        break;
      }
    }

    tok = strtok(NULL, " ");
  }

  assert(scoreInd == static_cast<size_t>(numScores));
  targetPhrase.SetSparseFeatures(Moses::Trim(sparseFeatures.str()));
  targetPhrase.SetProperty(Moses::Trim(property.str()));
  targetPhrase.SortAlign();
  return out;
}
|
|
|
/**
 * Create a Word from str using the wrapper's vocabulary, append it to phrase
 * and hand it back to the caller.
 */
static OnDiskPt::WordPtr AppendNewWord(OnDiskPt::Phrase &phrase
                                       , const std::string &str
                                       , OnDiskPt::OnDiskWrapper &onDiskWrapper)
{
  OnDiskPt::WordPtr word(new OnDiskPt::Word());
  word->CreateFromString(str, onDiskWrapper.GetVocab());
  phrase.AddWord(word);
  return word;
}

/**
 * Convert one whitespace-delimited token into word(s) and append them to
 * phrase.
 *
 * A token that starts with '[' and ends with ']' is handled as a
 * non-terminal; anything else becomes a single word, which is returned.
 *
 * For a non-terminal the token is searched for a second '[' from index 2 on:
 *  - none found: the whole token is added as one word and a null pointer is
 *    returned;
 *  - found: the part before it is added when addSourceNonTerm is set, and the
 *    part from it onwards is added when addTargetNonTerm is set.
 *
 * @param retSourceTarget for a two-part non-terminal, selects the word that
 *        is returned: 1 = the first part, 2 = the second part (each only if
 *        that part was added); any other value yields a null pointer
 * @return the selected word as described above, possibly null
 */
OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
                           , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
                           , OnDiskPt::OnDiskWrapper &onDiskWrapper, int retSourceTarget)
{
  const size_t tokSize = token.size();
  const bool nonTerm = tokSize > 0 && token[0] == '[' && token[tokSize - 1] == ']';

  // Ordinary word: add it and return it.
  if (!nonTerm) {
    return AppendNewWord(phrase, token, onDiskWrapper);
  }

  OnDiskPt::WordPtr out;

  const size_t splitPos = token.find('[', 2);
  if (splitPos == string::npos) {
    // Single bracketed label: added to the phrase but not returned.
    AppendNewWord(phrase, token, onDiskWrapper);
    return out;
  }

  // Two bracketed labels back to back: split at the second '['.
  if (addSourceNonTerm) {
    OnDiskPt::WordPtr first = AppendNewWord(phrase, token.substr(0, splitPos), onDiskWrapper);
    if (retSourceTarget == 1) {
      out = first;
    }
  }

  if (addTargetNonTerm) {
    OnDiskPt::WordPtr second = AppendNewWord(phrase, token.substr(splitPos), onDiskWrapper);
    if (retSourceTarget == 2) {
      out = second;
    }
  }

  return out;
}
|
|
|
void InsertTargetNonTerminals(std::vector<std::string> &sourceToks, const std::vector<std::string> &targetToks, const ::AlignType &alignments) |
|
{ |
|
for (int ind = alignments.size() - 1; ind >= 0; --ind) { |
|
const ::AlignPair &alignPair = alignments[ind]; |
|
size_t sourcePos = alignPair.first |
|
,targetPos = alignPair.second; |
|
|
|
const string &target = targetToks[targetPos]; |
|
sourceToks.insert(sourceToks.begin() + sourcePos + 1, target); |
|
|
|
} |
|
} |
|
|
|
class AlignOrderer |
|
{ |
|
public: |
|
bool operator()(const ::AlignPair &a, const ::AlignPair &b) const { |
|
return a.first < b.first; |
|
} |
|
}; |
|
|
|
/**
 * Sort alignments in place using AlignOrderer, i.e. ascending by the first
 * member of each pair. std::sort is not stable, so the relative order of
 * pairs with equal first members is unspecified.
 */
void SortAlign(::AlignType &alignments)
{
  AlignOrderer byFirstMember;
  std::sort(alignments.begin(), alignments.end(), byFirstMember);
}
|
|