Backup-bdg's picture
Upload 964 files
51ff9e5 verified
import re
from pygments.lexers.python import PythonLexer
def tokenize_code(code):
lexer = PythonLexer()
tokens = process_pygments_tokens(lexer.get_tokens(code))
return tokens
def process_pygments_tokens(tokens):
new_tokens = []
for token in tokens:
if (
str(token[0]) == 'Token.Text'
and re.match(r'\s+', token[1])
or str(token[0]) == 'Token.Text.Whitespace'
):
continue
new_tokens.append(token[1])
new_tokens_final = []
i = 0
while i < len(new_tokens) - 2:
if (
new_tokens[i] == '"'
and new_tokens[i + 1] == 'STR'
and new_tokens[i + 2] == '"'
):
new_tokens_final.append('"STR"')
i = i + 3
else:
new_tokens_final.append(new_tokens[i])
i = i + 1
for i in range(len(new_tokens) - 2, len(new_tokens)):
if i >= 0:
new_tokens_final.append(new_tokens[i])
return new_tokens_final