|
#include "Vocabulary.h" |
|
|
|
#pragma once |
|
|
|
#define LINE_MAX_LENGTH 10000 |
|
|
|
namespace tmmt |
|
{ |
|
|
|
class SuffixArray |
|
{ |
|
public: |
|
typedef unsigned int INDEX; |
|
|
|
private: |
|
std::vector< std::vector< WORD_ID > > corpus; |
|
|
|
WORD_ID *m_array; |
|
INDEX *m_index; |
|
INDEX *m_buffer; |
|
char *m_wordInSentence; |
|
size_t *m_sentence; |
|
char *m_sentenceLength; |
|
WORD_ID m_endOfSentence; |
|
Vocabulary m_vcb; |
|
INDEX m_size; |
|
|
|
public: |
|
SuffixArray( std::string fileName ); |
|
~SuffixArray(); |
|
|
|
void Sort(INDEX start, INDEX end); |
|
int CompareIndex( INDEX a, INDEX b ) const; |
|
inline int CompareWord( WORD_ID a, WORD_ID b ) const; |
|
int Count( const std::vector< WORD > &phrase ); |
|
bool MinCount( const std::vector< WORD > &phrase, INDEX min ); |
|
bool Exists( const std::vector< WORD > &phrase ); |
|
int FindMatches( const std::vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 ); |
|
int LimitedCount( const std::vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = -1, INDEX search_end = 0 ); |
|
INDEX FindFirst( const std::vector< WORD > &phrase, INDEX &start, INDEX &end ); |
|
INDEX FindLast( const std::vector< WORD > &phrase, INDEX start, INDEX end, int direction ); |
|
int Match( const std::vector< WORD > &phrase, INDEX index ); |
|
void List( INDEX start, INDEX end ); |
|
inline INDEX GetPosition( INDEX index ) { |
|
return m_index[ index ]; |
|
} |
|
inline size_t GetSentence( INDEX position ) { |
|
return m_sentence[position]; |
|
} |
|
inline char GetWordInSentence( INDEX position ) { |
|
return m_wordInSentence[position]; |
|
} |
|
inline char GetSentenceLength( size_t sentenceId ) { |
|
return m_sentenceLength[sentenceId]; |
|
} |
|
inline INDEX GetSize() { |
|
return m_size; |
|
} |
|
|
|
Vocabulary &GetVocabulary() { |
|
return m_vcb; |
|
} |
|
const std::vector< std::vector< WORD_ID > > &GetCorpus() const { |
|
return corpus; |
|
} |
|
}; |
|
|
|
} |
|
|
|
|