// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
// (c) 2007-2013 Ulrich Germann
#include <string>
#include <iostream>
#include <fstream>
#include <sstream>
#include <algorithm>
#include <stdexcept>
#include <cstring>
#include <unistd.h>

#include "tpt_tokenindex.h"
#include "ug_typedefs.h"

using namespace std;

namespace sapt
{
  TokenIndex::
  TokenIndex(string unkToken)
    : ridx(0), unkLabel(unkToken), unkId(1), numTokens(0)
    , startIdx(0), endIdx(0)
  {
    lock.reset(new boost::mutex());
  };

#if 0
  TokenIndex::
  TokenIndex(string fname, string unkToken, bool dyna)
    : ridx(0), unkLabel(unkToken)
  {
    this->open(fname,unkToken,dyna);
  };
#endif

  void
  TokenIndex::
  open(string fname, string unkToken, bool dyna)
  {
    if (access(fname.c_str(),F_OK))
      {
        ostringstream msg;
        msg << "TokenIndex::open: File '" << fname << "' does not exist.";
        throw std::runtime_error(msg.str().c_str());
      }
    file.open(fname);
    if (!file.is_open())
      {
        ostringstream msg;
        msg << "TokenIndex::open: Error opening file '" << fname << "'.";
        throw std::runtime_error(msg.str().c_str());
      }
    // header: token count, id of the unknown token, then the sorted index
    this->numTokens = *(reinterpret_cast<uint32_t const*>(file.data()));
    unkId    = *(reinterpret_cast<id_type const*>(file.data()+4));
    startIdx = reinterpret_cast<Entry const*>(file.data()+4+sizeof(id_type));
    endIdx   = startIdx + numTokens;
    comp.base = reinterpret_cast<char const*>(endIdx); // start of string data
    if (!unkToken.empty())
      {
        Entry const* bla = lower_bound(startIdx,endIdx,unkToken.c_str(),comp);
        unkId = ((bla < endIdx && unkToken == comp.base+bla->offset)
                 ? bla->id
                 : numTokens);
      }
    this->dynamic = dyna;
    if (dyna)
      {
        this->str2idExtra.reset(new map<string,id_type>());
        this->newWords.reset(new vector<string>());
      }
  }

  void
  TokenIndex::
  close()
  {
    file.close();
  }

  TokenIndex::
  CompFunc::
  CompFunc()
  {};

  bool
  TokenIndex::
  CompFunc::
  operator()(Entry const& A, char const* w)
  {
    return strcmp(base+A.offset,w) < 0;
  };

  id_type
  TokenIndex::
  operator[](char const* p) const
  {
    if (startIdx != endIdx)
      {
        Entry const* bla = lower_bound(startIdx,endIdx,p,comp);
        if (bla != endIdx && !strcmp(comp.base+bla->offset,p))
          return bla->id;
        if (!dynamic) return unkId;
      }
    else if (!dynamic)
      return strcmp(p,"NULL") && unkId;
    boost::lock_guard<boost::mutex> lk(*this->lock);
    // stuff below is new as of 2011-01-30, for dynamic adding of unknown
    // items. IMPORTANT: numTokens is currently not changed; it is the
    // number of PRE-EXISTING tokens, not including dynamically added items.
    // if (!str2idExtra)
    //   {
    //     this->str2idExtra.reset(new map<string,id_type>());
    //     this->newWords.reset(new vector<string>());
    //   }
    map<string,id_type>::value_type newItem(p,str2idExtra->size()+numTokens);
    pair<map<string,id_type>::iterator,bool> foo = str2idExtra->insert(newItem);
    if (foo.second) // it actually is a new item
      newWords->push_back(foo.first->first);
    return foo.first->second;
  }

  id_type
  TokenIndex::
  operator[](string const& w) const
  {
    return (*this)[w.c_str()];
  }

  vector<char const*>
  TokenIndex::
  reverseIndex() const
  {
    size_t numToks = endIdx-startIdx;
    // cout << "tokenindex has " << numToks << " tokens" << endl;
    vector<char const*> v(numToks,NULL);
    // v.reserve(endIdx-startIdx);
    for (Entry const* x = startIdx; x != endIdx; x++)
      {
        if (x->id >= v.size()) v.resize(x->id+1);
        v[x->id] = comp.base+x->offset;
      }
    // cout << "done reversing index " << endl;
    return v;
  }
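  // The id->string lookups below build the reverse index (ridx) lazily on
  // first use.  Since multiple threads may call them concurrently, they use
  // a double-checked pattern: test ridx.size() without the lock, then test
  // again after acquiring the lock before actually building the index.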
  char const* const
  TokenIndex::
  operator[](id_type id) const
  {
    if (!ridx.size())
      {
        boost::lock_guard<boost::mutex> lk(*this->lock);
        // Someone else (multi-threading!) may have created the
        // reverse index in the meantime, so let's check again
        if (!ridx.size()) ridx = reverseIndex();
      }
    if (id < ridx.size()) return ridx[id];
    boost::lock_guard<boost::mutex> lk(*this->lock);
    if (dynamic && id < ridx.size()+newWords->size())
      return (*newWords)[id-ridx.size()].c_str();
    return unkLabel.c_str();
  }

  void
  TokenIndex::
  iniReverseIndex()
  {
    if (!ridx.size())
      {
        boost::lock_guard<boost::mutex> lk(*this->lock);
        if (!ridx.size()) ridx = reverseIndex();
      }
  }

  char const* const
  TokenIndex::
  operator[](id_type id)
  {
    if (!ridx.size())
      {
        boost::lock_guard<boost::mutex> lk(*this->lock);
        if (!ridx.size()) ridx = reverseIndex();
      }
    if (id < ridx.size()) return ridx[id];
    boost::lock_guard<boost::mutex> lk(*this->lock);
    if (dynamic && id < ridx.size()+newWords->size())
      return (*newWords)[id-ridx.size()].c_str();
    return unkLabel.c_str();
  }

  string
  TokenIndex::
  toString(vector<id_type> const& v)
  {
    if (!ridx.size())
      {
        boost::lock_guard<boost::mutex> lk(*this->lock);
        if (!ridx.size()) ridx = reverseIndex();
      }
    ostringstream buf;
    for (size_t i = 0; i < v.size(); i++)
      buf << (i ? " " : "") << (*this)[v[i]];
    return buf.str();
  }

  string
  TokenIndex::
  toString(vector<id_type> const& v) const
  {
    if (!ridx.size())
      {
        boost::lock_guard<boost::mutex> lk(*this->lock);
        if (!ridx.size()) ridx = reverseIndex();
      }
    ostringstream buf;
    for (size_t i = 0; i < v.size(); i++)
      buf << (i ? " " : "") << (*this)[v[i]];
    return buf.str();
  }

  string
  TokenIndex::
  toString(id_type const* start, id_type const* const stop)
  {
    if (!ridx.size())
      {
        boost::lock_guard<boost::mutex> lk(*this->lock);
        if (!ridx.size()) ridx = reverseIndex();
      }
    ostringstream buf;
    if (start < stop) buf << (*this)[*start];
    while (++start < stop) buf << " " << (*this)[*start];
    return buf.str();
  }

  string
  TokenIndex::
  toString(id_type const* start, id_type const* const stop) const
  {
    if (!ridx.size())
      {
        boost::lock_guard<boost::mutex> lk(*this->lock);
        if (!ridx.size()) ridx = reverseIndex();
      }
    ostringstream buf;
    if (start < stop) buf << (*this)[*start];
    while (++start < stop) buf << " " << (*this)[*start];
    return buf.str();
  }

  vector<id_type>
  TokenIndex::
  toIdSeq(string const& line) const
  {
    istringstream buf(line);
    string w;
    vector<id_type> retval;
    while (buf>>w) retval.push_back((*this)[w]);
    return retval;
  }

  /// Return false if line contains unknown tokens, true otherwise
  bool
  TokenIndex::
  fillIdSeq(string const& line, vector<id_type> & v) const
  {
    bool allgood = true;
    string w;
    v.clear();
    for (istringstream buf(line); buf>>w;)
      {
        v.push_back((*this)[w]);
        allgood = allgood && v.back() > 1;
      }
    return allgood;
  }

  id_type
  TokenIndex::
  getNumTokens() const
  {
    return numTokens;
  }

  id_type
  TokenIndex::
  getUnkId() const
  {
    return unkId;
  }

  char const* const
  TokenIndex::
  getUnkToken() const
  {
    return unkLabel.c_str();
    // return (*this)[unkId];
  }

  id_type
  TokenIndex::
  knownVocabSize() const
  {
    return numTokens;
  }

  id_type
  TokenIndex::
  ksize() const
  {
    return numTokens;
  }

  id_type
  TokenIndex::
  totalVocabSize() const
  {
    return tsize();
  }

  id_type
  TokenIndex::
  tsize() const
  {
    return (newWords != NULL ? numTokens+newWords->size() : numTokens);
  }
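  // On-disk layout of a token index file, as read by TokenIndex::open()
  // above and produced by write_tokenindex_to_disk() below:
  //
  //   uint32_t numTokens          // number of vocabulary entries
  //   id_type  unkId              // id of the unknown-word token
  //   Entry    index[numTokens]   // (string offset, token id) pairs,
  //                               // sorted by token string
  //   char     data[]             // null-terminated token strings; offsets
  //                               // are relative to the start of this block
  //
  // (Entry is the index-entry struct declared in tpt_tokenindex.h.)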
  void
  write_tokenindex_to_disk(vector<pair<string,uint32_t> > const& tok,
                           string const& ofile, string const& unkToken)
  {
    typedef pair<uint32_t,id_type> IndexEntry; // offset and id

    // Write token strings to a buffer, keep track of offsets
    vector<IndexEntry> index(tok.size());
    ostringstream data;
    id_type unkId = tok.size();
    for (size_t i = 0; i < tok.size(); i++)
      {
        if (tok[i].first == unkToken)
          unkId = tok[i].second;
        index[i].first  = data.tellp();  // offset of string
        index[i].second = tok[i].second; // respective ID
        data << tok[i].first << char(0); // write null-terminated string
      }

    // Now write the actual file: header, sorted index, then the string data
    ofstream out(ofile.c_str());
    uint32_t vsize = index.size(); // how many vocabulary items?
    out.write(reinterpret_cast<char*>(&vsize),4);
    out.write(reinterpret_cast<char*>(&unkId),sizeof(id_type));
    for (size_t i = 0; i < index.size(); i++)
      {
        out.write(reinterpret_cast<char*>(&index[i].first),4);
        out.write(reinterpret_cast<char*>(&index[i].second),sizeof(id_type));
      }
    out << data.str();
  }

  void
  TokenIndex::
  write(string fname)
  {
    typedef pair<string,uint32_t> Token; // token and id
    // Sort the vocabulary alphabetically; the on-disk index must be ordered
    // by token string for the binary search in operator[].
    vector<Token> tok(totalVocabSize());
    for (id_type i = 0; i < tok.size(); ++i)
      tok[i] = Token((*this)[i],i);
    sort(tok.begin(),tok.end());
    write_tokenindex_to_disk(tok,fname,unkLabel);
  }

  bool
  TokenIndex::
  isDynamic() const
  {
    return dynamic;
  }

  bool
  TokenIndex::
  setDynamic(bool on)
  {
    bool ret = dynamic;
    if (on && this->str2idExtra == NULL)
      {
        this->str2idExtra.reset(new map<string,id_type>());
        this->newWords.reset(new vector<string>());
      }
    dynamic = on;
    if (on)
      {
        (*this)["NULL"];
        (*this)[unkLabel];
      }
    return ret;
  }

  void
  TokenIndex::
  setUnkLabel(string unk)
  {
    unkId = (*this)[unk];
    unkLabel = unk;
  }
}
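// Illustrative sketch (disabled; not part of the original sources): a minimal
// round trip through write_tokenindex_to_disk() and TokenIndex::open(),
// assuming id_type is the 32-bit id typedef from ug_typedefs.h.  The file
// name "example.tdx", the sample tokens, and their ids are made up for
// demonstration only.
#if 0
int main()
{
  using namespace std;
  using namespace sapt;

  // Build a small vocabulary of (token, id) pairs and sort it by token,
  // since the binary search in TokenIndex::operator[] requires sorted input.
  vector<pair<string,uint32_t> > tok;
  tok.push_back(make_pair(string("NULL"),  0u));
  tok.push_back(make_pair(string("UNK"),   1u));
  tok.push_back(make_pair(string("hello"), 2u));
  tok.push_back(make_pair(string("world"), 3u));
  sort(tok.begin(), tok.end());
  write_tokenindex_to_disk(tok, "example.tdx", "UNK");

  // Map the file and look tokens up in both directions.
  TokenIndex V("UNK");
  V.open("example.tdx", "UNK", false); // static index: unknowns map to unkId
  cout << V["hello"]      << endl;     // prints 2
  cout << V[id_type(3)]   << endl;     // prints "world"
  cout << V["never-seen"] << endl;     // prints unkId (here: 1)
  return 0;
}
#endif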