|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#ifndef moses_FuzzyMatchWrapper_h |
|
#define moses_FuzzyMatchWrapper_h |
|
|
|
#ifdef WITH_THREADS |
|
#include <boost/thread/shared_mutex.hpp> |
|
#endif |
|
|
|
#include <fstream> |
|
#include <string> |
|
#include "SuffixArray.h" |
|
#include "Vocabulary.h" |
|
#include "Match.h" |
|
#include "moses/InputType.h" |
|
|
|
namespace tmmt |
|
{ |
|
class Match; |
|
struct SentenceAlignment; |
|
|
|
class FuzzyMatchWrapper |
|
{ |
|
public: |
|
FuzzyMatchWrapper(const std::string &source, const std::string &target, const std::string &alignment); |
|
|
|
std::string Extract(long translationId, const std::string &dirNameStr); |
|
|
|
protected: |
|
|
|
std::vector< std::vector< tmmt::SentenceAlignment > > targetAndAlignment; |
|
tmmt::SuffixArray *suffixArray; |
|
int basic_flag; |
|
int lsed_flag; |
|
int refined_flag; |
|
int length_filter_flag; |
|
int parse_flag; |
|
int min_match; |
|
int multiple_flag; |
|
int multiple_slack; |
|
int multiple_max; |
|
|
|
typedef std::map< WORD_ID,std::vector< int > > WordIndex; |
|
|
|
|
|
std::map< std::pair< WORD_ID, WORD_ID >, unsigned int > m_lsed; |
|
#ifdef WITH_THREADS |
|
|
|
mutable boost::shared_mutex m_accessLock; |
|
#endif |
|
|
|
void load_corpus( const std::string &fileName, std::vector< std::vector< tmmt::WORD_ID > > &corpus ); |
|
void load_target( const std::string &fileName, std::vector< std::vector< tmmt::SentenceAlignment > > &corpus); |
|
void load_alignment( const std::string &fileName, std::vector< std::vector< tmmt::SentenceAlignment > > &corpus ); |
|
|
|
|
|
void basic_fuzzy_match( std::vector< std::vector< tmmt::WORD_ID > > source, |
|
std::vector< std::vector< tmmt::WORD_ID > > input ) ; |
|
|
|
|
|
|
|
unsigned int compute_length( const std::vector< tmmt::WORD_ID > &sentence ); |
|
unsigned int letter_sed( WORD_ID aIdx, WORD_ID bIdx ); |
|
unsigned int sed( const std::vector< WORD_ID > &a, const std::vector< WORD_ID > &b, std::string &best_path, bool use_letter_sed ); |
|
void init_short_matches(WordIndex &wordIndex, long translationId, const std::vector< WORD_ID > &input ); |
|
int short_match_max_length( int input_length ); |
|
void add_short_matches(WordIndex &wordIndex, long translationId, std::vector< Match > &match, const std::vector< WORD_ID > &tm, int input_length, int best_cost ); |
|
std::vector< Match > prune_matches( const std::vector< Match > &match, int best_cost ); |
|
int parse_matches( std::vector< Match > &match, int input_length, int tm_length, int &best_cost ); |
|
|
|
void create_extract(int sentenceInd, int cost, const std::vector< WORD_ID > &sourceSentence, const std::vector<SentenceAlignment> &targets, const std::string &inputStr, const std::string &path, std::ofstream &outputFile); |
|
|
|
std::string ExtractTM(WordIndex &wordIndex, long translationId, const std::string &inputPath); |
|
Vocabulary &GetVocabulary() { |
|
return suffixArray->GetVocabulary(); |
|
} |
|
|
|
bool GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const; |
|
void SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value); |
|
|
|
}; |
|
|
|
} |
|
|
|
#endif |
|
|