|
|
|
|
|
#ifndef DLIB_CPP_TOKENIZER_KERNEl_1_ |
|
#define DLIB_CPP_TOKENIZER_KERNEl_1_ |
|
|
|
#include <string> |
|
#include <iostream> |
|
#include "cpp_tokenizer_kernel_abstract.h" |
|
#include "../algs.h" |
|
|
|
namespace dlib |
|
{ |
|
|
|
namespace cpp_tok_kernel_1_helper
{
    // One buffered token: its text together with the integer token type it
    // was classified as.  The type defaults to 0 so a freshly constructed
    // pair never carries an indeterminate value.
    struct token_text_pair
    {
        std::string token;   // the characters making up the token
        int type = 0;        // token category (an enum value stored as int)
    };
}
|
|
|
template < |
|
typename tok, |
|
typename queue, |
|
typename set |
|
> |
|
class cpp_tokenizer_kernel_1 |
|
{ |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
typedef cpp_tok_kernel_1_helper::token_text_pair token_text_pair; |
|
|
|
public: |
|
|
|
enum |
|
{ |
|
END_OF_FILE, |
|
KEYWORD, |
|
COMMENT, |
|
SINGLE_QUOTED_TEXT, |
|
DOUBLE_QUOTED_TEXT, |
|
IDENTIFIER, |
|
OTHER, |
|
NUMBER, |
|
WHITE_SPACE |
|
}; |
|
|
|
cpp_tokenizer_kernel_1 ( |
|
); |
|
|
|
virtual ~cpp_tokenizer_kernel_1 ( |
|
); |
|
|
|
void clear( |
|
); |
|
|
|
void set_stream ( |
|
std::istream& in |
|
); |
|
|
|
bool stream_is_set ( |
|
) const; |
|
|
|
std::istream& get_stream ( |
|
) const; |
|
|
|
void get_token ( |
|
int& type, |
|
std::string& token |
|
); |
|
|
|
int peek_type ( |
|
) const; |
|
|
|
const std::string& peek_token ( |
|
) const; |
|
|
|
void swap ( |
|
cpp_tokenizer_kernel_1<tok,queue,set>& item |
|
); |
|
|
|
private: |
|
|
|
void buffer_token( |
|
int type, |
|
const std::string& token |
|
) |
|
|
|
|
|
|
|
|
|
{ |
|
token_text_pair temp; |
|
temp.token = token; |
|
temp.type = type; |
|
buffer.enqueue(temp); |
|
} |
|
|
|
void buffer_token( |
|
int type, |
|
char token |
|
) |
|
|
|
|
|
|
|
|
|
{ |
|
token_text_pair temp; |
|
temp.token = token; |
|
temp.type = type; |
|
buffer.enqueue(temp); |
|
} |
|
|
|
|
|
cpp_tokenizer_kernel_1(const cpp_tokenizer_kernel_1<tok,queue,set>&); |
|
cpp_tokenizer_kernel_1<tok,queue,set>& operator=(const cpp_tokenizer_kernel_1<tok,queue,set>&); |
|
|
|
|
|
set keywords; |
|
queue buffer; |
|
tok tokenizer; |
|
|
|
mutable std::string next_token; |
|
mutable int next_type; |
|
mutable bool have_peeked; |
|
|
|
|
|
}; |
|
|
|
template < |
|
typename tok, |
|
typename queue, |
|
typename set |
|
> |
|
inline void swap ( |
|
cpp_tokenizer_kernel_1<tok,queue,set>& a, |
|
cpp_tokenizer_kernel_1<tok,queue,set>& b |
|
) { a.swap(b); } |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template < |
|
typename tok, |
|
typename queue, |
|
typename set |
|
> |
|
cpp_tokenizer_kernel_1<tok,queue,set>:: |
|
cpp_tokenizer_kernel_1( |
|
) : |
|
have_peeked(false) |
|
{ |
|
|
|
std::string temp; |
|
temp = "#include"; keywords.add(temp); |
|
temp = "__asm"; keywords.add(temp); |
|
temp = "_asm"; keywords.add(temp); |
|
temp = "if"; keywords.add(temp); |
|
temp = "int"; keywords.add(temp); |
|
temp = "else"; keywords.add(temp); |
|
temp = "template"; keywords.add(temp); |
|
temp = "void"; keywords.add(temp); |
|
temp = "false"; keywords.add(temp); |
|
temp = "class"; keywords.add(temp); |
|
temp = "public"; keywords.add(temp); |
|
temp = "while"; keywords.add(temp); |
|
temp = "bool"; keywords.add(temp); |
|
temp = "new"; keywords.add(temp); |
|
temp = "delete"; keywords.add(temp); |
|
temp = "true"; keywords.add(temp); |
|
temp = "typedef"; keywords.add(temp); |
|
temp = "const"; keywords.add(temp); |
|
temp = "virtual"; keywords.add(temp); |
|
temp = "inline"; keywords.add(temp); |
|
temp = "for"; keywords.add(temp); |
|
temp = "break"; keywords.add(temp); |
|
temp = "struct"; keywords.add(temp); |
|
temp = "float"; keywords.add(temp); |
|
temp = "case"; keywords.add(temp); |
|
temp = "enum"; keywords.add(temp); |
|
temp = "this"; keywords.add(temp); |
|
temp = "typeid"; keywords.add(temp); |
|
temp = "double"; keywords.add(temp); |
|
temp = "char"; keywords.add(temp); |
|
temp = "typename"; keywords.add(temp); |
|
temp = "signed"; keywords.add(temp); |
|
temp = "friend"; keywords.add(temp); |
|
temp = "wint_t"; keywords.add(temp); |
|
temp = "default"; keywords.add(temp); |
|
temp = "asm"; keywords.add(temp); |
|
temp = "reinterpret_cast"; keywords.add(temp); |
|
temp = "#define"; keywords.add(temp); |
|
temp = "do"; keywords.add(temp); |
|
temp = "continue"; keywords.add(temp); |
|
temp = "auto"; keywords.add(temp); |
|
temp = "unsigned"; keywords.add(temp); |
|
temp = "size_t"; keywords.add(temp); |
|
temp = "#undef"; keywords.add(temp); |
|
temp = "#pragma"; keywords.add(temp); |
|
temp = "namespace"; keywords.add(temp); |
|
temp = "private"; keywords.add(temp); |
|
temp = "#endif"; keywords.add(temp); |
|
temp = "catch"; keywords.add(temp); |
|
temp = "#else"; keywords.add(temp); |
|
temp = "register"; keywords.add(temp); |
|
temp = "volatile"; keywords.add(temp); |
|
temp = "const_cast"; keywords.add(temp); |
|
temp = "#end"; keywords.add(temp); |
|
temp = "mutable"; keywords.add(temp); |
|
temp = "static_cast"; keywords.add(temp); |
|
temp = "wchar_t"; keywords.add(temp); |
|
temp = "#if"; keywords.add(temp); |
|
temp = "protected"; keywords.add(temp); |
|
temp = "throw"; keywords.add(temp); |
|
temp = "using"; keywords.add(temp); |
|
temp = "dynamic_cast"; keywords.add(temp); |
|
temp = "#ifdef"; keywords.add(temp); |
|
temp = "return"; keywords.add(temp); |
|
temp = "short"; keywords.add(temp); |
|
temp = "#error"; keywords.add(temp); |
|
temp = "#line"; keywords.add(temp); |
|
temp = "explicit"; keywords.add(temp); |
|
temp = "union"; keywords.add(temp); |
|
temp = "#ifndef"; keywords.add(temp); |
|
temp = "try"; keywords.add(temp); |
|
temp = "sizeof"; keywords.add(temp); |
|
temp = "goto"; keywords.add(temp); |
|
temp = "long"; keywords.add(temp); |
|
temp = "#elif"; keywords.add(temp); |
|
temp = "static"; keywords.add(temp); |
|
temp = "operator"; keywords.add(temp); |
|
temp = "switch"; keywords.add(temp); |
|
temp = "extern"; keywords.add(temp); |
|
|
|
|
|
|
|
tokenizer.set_identifier_token( |
|
"$_" + tokenizer.lowercase_letters() + tokenizer.uppercase_letters(), |
|
"$_" + tokenizer.lowercase_letters() + tokenizer.uppercase_letters() + |
|
tokenizer.numbers() |
|
); |
|
} |
|
|
|
|
|
|
|
template < |
|
typename tok, |
|
typename queue, |
|
typename set |
|
> |
|
cpp_tokenizer_kernel_1<tok,queue,set>:: |
|
~cpp_tokenizer_kernel_1 ( |
|
) |
|
{ |
|
} |
|
|
|
|
|
|
|
template < |
|
typename tok, |
|
typename queue, |
|
typename set |
|
> |
|
void cpp_tokenizer_kernel_1<tok,queue,set>:: |
|
clear( |
|
) |
|
{ |
|
tokenizer.clear(); |
|
buffer.clear(); |
|
have_peeked = false; |
|
|
|
|
|
tokenizer.set_identifier_token( |
|
"$_" + tokenizer.lowercase_letters() + tokenizer.uppercase_letters(), |
|
"$_" + tokenizer.lowercase_letters() + tokenizer.uppercase_letters() + |
|
tokenizer.numbers() |
|
); |
|
} |
|
|
|
|
|
|
|
template < |
|
typename tok, |
|
typename queue, |
|
typename set |
|
> |
|
void cpp_tokenizer_kernel_1<tok,queue,set>:: |
|
set_stream ( |
|
std::istream& in |
|
) |
|
{ |
|
tokenizer.set_stream(in); |
|
buffer.clear(); |
|
have_peeked = false; |
|
} |
|
|
|
|
|
|
|
template < |
|
typename tok, |
|
typename queue, |
|
typename set |
|
> |
|
bool cpp_tokenizer_kernel_1<tok,queue,set>:: |
|
stream_is_set ( |
|
) const |
|
{ |
|
return tokenizer.stream_is_set(); |
|
} |
|
|
|
|
|
|
|
template < |
|
typename tok, |
|
typename queue, |
|
typename set |
|
> |
|
std::istream& cpp_tokenizer_kernel_1<tok,queue,set>:: |
|
get_stream ( |
|
) const |
|
{ |
|
return tokenizer.get_stream(); |
|
} |
|
|
|
|
|
|
|
template < |
|
typename tok, |
|
typename queue, |
|
typename set |
|
> |
|
void cpp_tokenizer_kernel_1<tok,queue,set>:: |
|
get_token ( |
|
int& type, |
|
std::string& token |
|
) |
|
{ |
|
using namespace std; |
|
|
|
if (!have_peeked) |
|
{ |
|
|
|
if (buffer.size() > 0) |
|
{ |
|
|
|
token_text_pair temp; |
|
buffer.dequeue(temp); |
|
type = temp.type; |
|
token = temp.token; |
|
return; |
|
} |
|
|
|
tokenizer.get_token(type,token); |
|
|
|
switch (type) |
|
{ |
|
case tok::END_OF_FILE: |
|
{ |
|
type = END_OF_FILE; |
|
} break; |
|
|
|
case tok::END_OF_LINE: |
|
case tok::WHITE_SPACE: |
|
{ |
|
type = tokenizer.peek_type(); |
|
if (type == tok::END_OF_LINE || type == tok::WHITE_SPACE) |
|
{ |
|
std::string temp; |
|
do |
|
{ |
|
tokenizer.get_token(type,temp); |
|
token += temp; |
|
type = tokenizer.peek_type(); |
|
}while (type == tok::END_OF_LINE || type == tok::WHITE_SPACE); |
|
} |
|
type = WHITE_SPACE; |
|
|
|
} break; |
|
|
|
case tok::NUMBER: |
|
{ |
|
|
|
if (tokenizer.peek_type() == tok::IDENTIFIER && token == "0" && |
|
(tokenizer.peek_token()[0] == 'x' || tokenizer.peek_token()[0] == 'X')) |
|
{ |
|
|
|
|
|
std::string temp; |
|
tokenizer.get_token(type,temp); |
|
token = "0" + temp; |
|
|
|
|
|
while (tokenizer.peek_type() == tok::IDENTIFIER || |
|
tokenizer.peek_type() == tok::NUMBER |
|
) |
|
{ |
|
tokenizer.get_token(type,temp); |
|
token += temp; |
|
} |
|
|
|
} |
|
|
|
else if ((tokenizer.peek_type() == tok::CHAR && tokenizer.peek_token()[0] == '.') || |
|
(tokenizer.peek_type() == tok::IDENTIFIER && std::tolower(tokenizer.peek_token()[0]) == 'e')) |
|
{ |
|
std::string temp; |
|
tokenizer.get_token(type,temp); |
|
token += temp; |
|
|
|
while (tokenizer.peek_type() == tok::IDENTIFIER || |
|
tokenizer.peek_type() == tok::NUMBER |
|
) |
|
{ |
|
tokenizer.get_token(type,temp); |
|
token += temp; |
|
} |
|
} |
|
type = NUMBER; |
|
|
|
} break; |
|
|
|
case tok::IDENTIFIER: |
|
{ |
|
if (keywords.is_member(token)) |
|
{ |
|
type = KEYWORD; |
|
} |
|
else |
|
{ |
|
type = IDENTIFIER; |
|
} |
|
} break; |
|
|
|
case tok::CHAR: |
|
type = OTHER; |
|
switch (token[0]) |
|
{ |
|
case '#': |
|
{ |
|
|
|
|
|
if (tokenizer.peek_type() == tok::IDENTIFIER && |
|
keywords.is_member('#'+tokenizer.peek_token())) |
|
{ |
|
tokenizer.get_token(type,token); |
|
token = '#' + token; |
|
type = KEYWORD; |
|
} |
|
else |
|
{ |
|
token = '#'; |
|
type = OTHER; |
|
} |
|
} |
|
break; |
|
|
|
case '"': |
|
{ |
|
string temp; |
|
tokenizer.get_token(type,token); |
|
while (type != tok::END_OF_FILE) |
|
{ |
|
|
|
if (type == tok::CHAR && token[0] == '"' && |
|
(temp.size() == 0 || temp[temp.size()-1] != '\\' || |
|
(temp.size() > 1 && temp[temp.size()-2] == '\\') )) |
|
{ |
|
buffer_token(DOUBLE_QUOTED_TEXT,temp); |
|
buffer_token(OTHER,"\""); |
|
break; |
|
} |
|
else |
|
{ |
|
temp += token; |
|
} |
|
tokenizer.get_token(type,token); |
|
} |
|
|
|
|
|
type = OTHER; |
|
token = '"'; |
|
} break; |
|
|
|
case '\'': |
|
{ |
|
string temp; |
|
tokenizer.get_token(type,token); |
|
if (type == tok::CHAR && token[0] == '\\') |
|
{ |
|
temp += '\\'; |
|
tokenizer.get_token(type,token); |
|
} |
|
temp += token; |
|
buffer_token(SINGLE_QUOTED_TEXT,temp); |
|
|
|
|
|
|
|
tokenizer.get_token(type,token); |
|
buffer_token(OTHER,token); |
|
|
|
type = OTHER; |
|
token = '\''; |
|
} break; |
|
|
|
case '/': |
|
{ |
|
|
|
if (tokenizer.peek_type() == tok::CHAR) |
|
{ |
|
if (tokenizer.peek_token()[0] == '/') |
|
{ |
|
tokenizer.get_token(type,token); |
|
|
|
token = "//"; |
|
string temp; |
|
tokenizer.get_token(type,temp); |
|
while (type != tok::END_OF_FILE) |
|
{ |
|
|
|
if (type == tok::END_OF_LINE && |
|
token[token.size()-1] != '\\' ) |
|
{ |
|
token += '\n'; |
|
break; |
|
} |
|
else |
|
{ |
|
token += temp; |
|
} |
|
tokenizer.get_token(type,temp); |
|
} |
|
type = COMMENT; |
|
|
|
} |
|
else if (tokenizer.peek_token()[0] == '*') |
|
{ |
|
tokenizer.get_token(type,token); |
|
|
|
token = "/*"; |
|
string temp; |
|
tokenizer.get_token(type,temp); |
|
while (type != tok::END_OF_FILE) |
|
{ |
|
|
|
if (type == tok::CHAR && temp[0] == '/' && |
|
token[token.size()-1] == '*') |
|
{ |
|
token += '/'; |
|
break; |
|
} |
|
else |
|
{ |
|
token += temp; |
|
} |
|
tokenizer.get_token(type,temp); |
|
} |
|
type = COMMENT; |
|
} |
|
} |
|
} break; |
|
|
|
default: |
|
break; |
|
} |
|
} |
|
} |
|
else |
|
{ |
|
|
|
|
|
type = next_type; |
|
token = next_token; |
|
have_peeked = false; |
|
} |
|
} |
|
|
|
|
|
|
|
template < |
|
typename tok, |
|
typename queue, |
|
typename set |
|
> |
|
int cpp_tokenizer_kernel_1<tok,queue,set>:: |
|
peek_type ( |
|
) const |
|
{ |
|
const_cast<cpp_tokenizer_kernel_1<tok,queue,set>*>(this)->get_token(next_type,next_token); |
|
have_peeked = true; |
|
return next_type; |
|
} |
|
|
|
|
|
|
|
template < |
|
typename tok, |
|
typename queue, |
|
typename set |
|
> |
|
const std::string& cpp_tokenizer_kernel_1<tok,queue,set>:: |
|
peek_token ( |
|
) const |
|
{ |
|
const_cast<cpp_tokenizer_kernel_1<tok,queue,set>*>(this)->get_token(next_type,next_token); |
|
have_peeked = true; |
|
return next_token; |
|
} |
|
|
|
|
|
|
|
template < |
|
typename tok, |
|
typename queue, |
|
typename set |
|
> |
|
void cpp_tokenizer_kernel_1<tok,queue,set>:: |
|
swap ( |
|
cpp_tokenizer_kernel_1& item |
|
) |
|
{ |
|
tokenizer.swap(item.tokenizer); |
|
buffer.swap(item.buffer); |
|
} |
|
|
|
|
|
|
|
} |
|
|
|
#endif |
|
|
|
|