|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#pragma once |
|
|
|
#include <map> |
|
#include <ostream> |
|
#include <set> |
|
#include <string> |
|
#include <vector> |
|
|
|
#include "OutputFileStream.h" |
|
#include "SyntaxTree.h" |
|
|
|
#include "syntax-common/tool.h" |
|
|
|
namespace MosesTraining |
|
{ |
|
namespace Syntax |
|
{ |
|
namespace GHKM |
|
{ |
|
|
|
// Forward declaration only: this header refers to Options exclusively through
// references in the method signatures below, so the full definition is not
// required here.
struct Options; |
|
|
|
class ExtractGHKM : public Tool |
|
{ |
|
public: |
|
ExtractGHKM() : Tool("extract-ghkm") {} |
|
|
|
virtual int Main(int argc, char *argv[]); |
|
|
|
private: |
|
void RecordTreeLabels(const SyntaxTree &, std::set<std::string> &); |
|
void CollectWordLabelCounts(SyntaxTree &, |
|
const Options &, |
|
std::map<std::string, int> &, |
|
std::map<std::string, std::string> &); |
|
void WriteUnknownWordLabel(const std::map<std::string, int> &, |
|
const std::map<std::string, std::string> &, |
|
const Options &, |
|
std::ostream &, |
|
bool writeCounts=false) const; |
|
void WriteUnknownWordSoftMatches(const std::set<std::string> &, |
|
std::ostream &) const; |
|
void WriteGlueGrammar(const std::set<std::string> &, |
|
const std::map<std::string, int> &, |
|
const std::map<std::string,size_t> &, |
|
const Options &, |
|
std::ostream &) const; |
|
void WriteSourceLabelSet(const std::map<std::string,size_t> &, |
|
std::ostream &) const; |
|
void StripBitParLabels(const std::set<std::string> &labelSet, |
|
const std::map<std::string, int> &topLabelSet, |
|
std::set<std::string> &outLabelSet, |
|
std::map<std::string, int> &outTopLabelSet) const; |
|
|
|
std::vector<std::string> ReadTokens(const std::string &) const; |
|
std::vector<std::string> ReadTokens(const SyntaxTree &root) const; |
|
|
|
void ProcessOptions(int, char *[], Options &) const; |
|
}; |
|
|
|
} |
|
} |
|
} |
|
|