|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#ifndef __ugTokenIndex_hh |
|
#define __ugTokenIndex_hh |
|
#include <stdint.h>

#include <algorithm>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#include <boost/iostreams/device/mapped_file.hpp>
#include <boost/iostreams/stream.hpp>
#include <boost/scoped_ptr.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/thread.hpp>

#include "tpt_typedefs.h"

namespace bio = boost::iostreams;
|
|
|
namespace sapt |
|
{ |
|
class TokenIndex |
|
{ |
|
typedef tpt::id_type id_type; |
|
|
|
mutable std::vector<char const*> ridx; |
|
|
|
std::string unkLabel; |
|
id_type unkId,numTokens; |
|
|
|
|
|
boost::scoped_ptr<boost::mutex> lock; |
|
|
|
|
|
bool dynamic; |
|
boost::shared_ptr<std::map<std::string, tpt::id_type> > str2idExtra; |
|
boost::shared_ptr<std::vector<std::string> > newWords; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public: |
|
|
|
class Entry |
|
{ |
|
public: |
|
uint32_t offset; |
|
id_type id; |
|
}; |
|
|
|
|
|
class CompFunc |
|
{ |
|
public: |
|
char const* base; |
|
CompFunc(); |
|
bool operator()(Entry const& A, char const* w); |
|
}; |
|
|
|
bio::mapped_file_source file; |
|
Entry const* startIdx; |
|
Entry const* endIdx; |
|
CompFunc comp; |
|
TokenIndex(std::string unkToken="UNK"); |
|
|
|
void open(std::string fname,std::string unkToken="UNK",bool dyna=false); |
|
void close(); |
|
|
|
id_type operator[](char const* w) const; |
|
id_type operator[](std::string const& w) const; |
|
char const* const operator[](id_type id) const; |
|
char const* const operator[](id_type id); |
|
std::vector<char const*> reverseIndex() const; |
|
|
|
std::string toString(std::vector<id_type> const& v); |
|
std::string toString(std::vector<id_type> const& v) const; |
|
|
|
std::string toString(id_type const* start, id_type const* const stop); |
|
std::string toString(id_type const* start, id_type const* const stop) const; |
|
|
|
std::vector<id_type> toIdSeq(std::string const& line) const; |
|
|
|
bool fillIdSeq(std::string const& line, std::vector<id_type> & v) const; |
|
|
|
void iniReverseIndex(); |
|
id_type getNumTokens() const; |
|
id_type getUnkId() const; |
|
|
|
|
|
id_type knownVocabSize() const; |
|
id_type totalVocabSize() const; |
|
|
|
id_type ksize() const; |
|
id_type tsize() const; |
|
|
|
|
|
char const* const getUnkToken() const; |
|
|
|
void write(std::string fname); |
|
bool isDynamic() const; |
|
bool setDynamic(bool onoff); |
|
|
|
void setUnkLabel(std::string unk); |
|
}; |
|
|
|
/**
 * Declaration only; the definition lives outside this header.
 *
 * @param tok       (token string, id) pairs; mkTokenIndex() passes them
 *                  sorted by std::pair's operator< (string first, then id).
 * @param ofile     presumably the path of the index file to create.
 * @param unkToken  label of the unknown token.
 */
void write_tokenindex_to_disk(
    const std::vector<std::pair<std::string, uint32_t> >& tok,
    const std::string& ofile,
    const std::string& unkToken);
|
|
|
|
|
/**
 * Ordering predicate over (token, count) pairs, usable with std::sort.
 *
 * Resulting order:
 *  1. the unknown token named at construction comes after everything else;
 *  2. otherwise, higher counts come first;
 *  3. equal counts are ordered by ascending token string.
 */
class compWords
{
  std::string unk;  // token that must always sort last

public:
  // Taken by const reference so the label is copied once, into the member.
  compWords(std::string const& _unk) : unk(_unk) {}

  /** Returns true iff A must be placed before B. */
  bool
  operator()(std::pair<std::string,size_t> const& A,
             std::pair<std::string,size_t> const& B) const
  {
    // The unknown token never precedes anything (including itself) ...
    if (A.first == unk) return false;
    // ... and every other token precedes it.
    if (B.first == unk) return true;
    // Descending by count.
    if (A.second != B.second)
      return A.second > B.second;
    // Tie: ascending by token string.
    return A.first < B.first;
  }
};
|
|
|
template<class MYMAP> |
|
void |
|
mkTokenIndex(std::string ofile,MYMAP const& M,std::string unkToken) |
|
{ |
|
|
|
typedef std::pair<std::string,uint32_t> Token; |
|
|
|
|
|
|
|
|
|
std::vector<std::pair<std::string,size_t> > wcounts(M.size()); |
|
typedef typename MYMAP::const_iterator myIter; |
|
size_t z=0; |
|
for (myIter m = M.begin(); m != M.end(); m++) |
|
{ |
|
|
|
wcounts[z++] = std::pair<std::string,size_t>(m->first,m->second); |
|
} |
|
compWords compFunc(unkToken); |
|
sort(wcounts.begin(),wcounts.end(),compFunc); |
|
|
|
|
|
std::vector<Token> tok(wcounts.size()); |
|
for (size_t i = 0; i < wcounts.size(); i++) |
|
tok[i] = Token(wcounts[i].first,i); |
|
|
|
sort(tok.begin(),tok.end()); |
|
write_tokenindex_to_disk(tok,ofile,unkToken); |
|
} |
|
|
|
template<typename Token> |
|
void |
|
fill_token_seq(TokenIndex& V, std::string const& line, std::vector<Token>& dest) |
|
{ |
|
std::istringstream buf(line); std::string w; |
|
while (buf>>w) dest.push_back(Token(V[w])); |
|
} |
|
} |
|
#endif |
|
|