# Mikobot/plugins/helper_funcs/string_handling.py
# <============================================== IMPORTS =========================================================>
import re
import time
from typing import Dict, List, Tuple
import bleach
import markdown2
from emoji import unicode_codes
from telegram import MessageEntity
from telegram.helpers import escape_markdown
# <=======================================================================================================>
MATCH_MD = re.compile(
r"\*(.*?)\*|"
r"_(.*?)_|"
r"`(.*?)`|"
r"(?<!\\)(\[.*?\])(\(.*?\))|"
r"(?P<esc>[*_`\[])",
)
LINK_REGEX = re.compile(r"(?<!\\)\[.+?\]\((.*?)\)")
BTN_URL_REGEX = re.compile(r"(\[([^\[]+?)\]\(buttonurl:(?:/{0,2})(.+?)(:same)?\))")
_EMOJI_REGEXP = None
# <================================================ FUNCTION =======================================================>
def get_emoji_regexp():
    global _EMOJI_REGEXP
    if _EMOJI_REGEXP is None:
        # Build the pattern once and cache it; sort longest-first so multi-codepoint
        # emoji are matched before any of their single-codepoint prefixes.
        emojis = sorted(unicode_codes.EMOJI_DATA, key=len, reverse=True)
        pattern = "(" + "|".join(re.escape(u) for u in emojis) + ")"
        _EMOJI_REGEXP = re.compile(pattern)
    return _EMOJI_REGEXP
def _selective_escape(to_parse: str) -> str:
"""
Escape all invalid markdown
:param to_parse: text to escape
:return: valid markdown string
"""
offset = 0 # offset to be used as adding a \ character causes the string to shift
for match in MATCH_MD.finditer(to_parse):
if match.group("esc"):
ent_start = match.start()
to_parse = (
to_parse[: ent_start + offset] + "\\" + to_parse[ent_start + offset :]
)
offset += 1
return to_parse
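# Illustrative usage (not part of the original module; the expected output below
# follows from the MATCH_MD rules above, so treat it as a sketch):
#
#     >>> _selective_escape("a *bold* pair and a stray ` tick")
#     'a *bold* pair and a stray \\` tick'
#
# Balanced pairs such as *bold* are left alone; only the unmatched backtick is
# escaped so Telegram does not reject the message as broken markdown.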
# This is a fun one.
def _calc_emoji_offset(to_calc) -> int:
    # Get all emoji in the text.
    emoticons = get_emoji_regexp().finditer(to_calc)
    # Telegram counts entity offsets in UTF-16 code units, while Python slices by
    # code points. Check the UTF-16 length of each emoji to see how far it shifted
    # the offsets: an emoji that fits in a single code unit adds nothing (hence the
    # subtracted 1), while longer ones (surrogate pairs, face + skin tone sequences,
    # ...) each contribute their extra code units to the offset.
    return sum(len(e.group(0).encode("utf-16-le")) // 2 - 1 for e in emoticons)
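# Illustrative usage (sketch, assuming the emoji package knows these emoji):
#
#     >>> _calc_emoji_offset("hi 😀 there 😀")
#     2
#
# Each of the two emoji is a surrogate pair (2 UTF-16 code units), so Telegram's
# entity offsets sit 2 units ahead of Python's code-point indices at that point.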
def markdown_parser(
txt: str,
entities: Dict[MessageEntity, str] = None,
offset: int = 0,
) -> str:
"""
Parse a string, escaping all invalid markdown entities.
    Escapes URLs so as to avoid URL mangling.
Re-adds any telegram code entities obtained from the entities object.
:param txt: text to parse
:param entities: dict of message entities in text
:param offset: message offset - command and notename length
:return: valid markdown string
"""
if not entities:
entities = {}
if not txt:
return ""
prev = 0
res = ""
# Loop over all message entities, and:
# reinsert code
# escape free-standing urls
for ent, ent_text in entities.items():
if ent.offset < -offset:
continue
start = ent.offset + offset # start of entity
end = ent.offset + offset + ent.length - 1 # end of entity
# we only care about code, url, text links
if ent.type in ("code", "url", "text_link", "spoiler"):
# count emoji to switch counter
count = _calc_emoji_offset(txt[:start])
start -= count
end -= count
# URL handling -> do not escape if in [](), escape otherwise.
if ent.type == "url":
if any(
match.start(1) <= start and end <= match.end(1)
for match in LINK_REGEX.finditer(txt)
):
continue
                # otherwise, escape everything since the previous entity and force-escape the url itself to avoid mangling
else:
# TODO: investigate possible offset bug when lots of emoji are present
res += _selective_escape(txt[prev:start] or "") + escape_markdown(
ent_text, 2
)
# code handling
elif ent.type == "code":
res += _selective_escape(txt[prev:start]) + "`" + ent_text + "`"
# handle markdown/html links
elif ent.type == "text_link":
res += _selective_escape(txt[prev:start]) + "[{}]({})".format(
ent_text,
ent.url,
)
# handle spoiler
elif ent.type == "spoiler":
res += _selective_escape(txt[prev:start]) + "||" + ent_text + "||"
end += 1
# anything else
else:
continue
prev = end
res += _selective_escape(txt[prev:]) # add the rest of the text
return res
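# Illustrative usage (sketch): with no entities the parser simply escapes
# free-standing markdown characters. In a handler, the entities dict would
# normally come from python-telegram-bot's Message.parse_entities().
#
#     >>> markdown_parser("_hello* world")
#     '\\_hello\\* world'
#
#     # inside a handler (hypothetical `msg` object):
#     # text = markdown_parser(msg.text, entities=msg.parse_entities(), offset=0)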
def button_markdown_parser(
txt: str,
entities: Dict[MessageEntity, str] = None,
offset: int = 0,
) -> Tuple[str, List]:
markdown_note = markdown_parser(txt, entities, offset)
prev = 0
note_data = ""
buttons = []
for match in BTN_URL_REGEX.finditer(markdown_note):
# Check if btnurl is escaped
n_escapes = 0
to_check = match.start(1) - 1
while to_check > 0 and markdown_note[to_check] == "\\":
n_escapes += 1
to_check -= 1
# if even, not escaped -> create button
if n_escapes % 2 == 0:
            # create a 3-tuple of (button label, url, same-row flag)
buttons.append((match.group(2), match.group(3), bool(match.group(4))))
note_data += markdown_note[prev : match.start(1)]
prev = match.end(1)
# if odd, escaped -> move along
else:
note_data += markdown_note[prev:to_check]
prev = match.start(1) - 1
else:
note_data += markdown_note[prev:]
return note_data, buttons
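# Illustrative usage (sketch; the output follows from BTN_URL_REGEX above):
#
#     >>> button_markdown_parser("Hi there [rules](buttonurl://example.com)")
#     ('Hi there ', [('rules', 'example.com', False)])
#
# The trailing False means the button starts its own row; appending `:same`
# inside the parentheses (buttonurl://example.com:same) flips it to True.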
def escape_invalid_curly_brackets(text: str, valids: List[str]) -> str:
new_text = ""
idx = 0
while idx < len(text):
if text[idx] == "{":
if idx + 1 < len(text) and text[idx + 1] == "{":
idx += 2
new_text += "{{{{"
continue
else:
success = False
for v in valids:
if text[idx:].startswith("{" + v + "}"):
success = True
break
if success:
new_text += text[idx : idx + len(v) + 2]
idx += len(v) + 2
continue
else:
new_text += "{{"
elif text[idx] == "}":
if idx + 1 < len(text) and text[idx + 1] == "}":
idx += 2
new_text += "}}}}"
continue
else:
new_text += "}}"
else:
new_text += text[idx]
idx += 1
return new_text
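# Illustrative usage (sketch): placeholders listed in `valids` survive, anything
# else is doubled so a later str.format() treats it as a literal brace:
#
#     >>> escape_invalid_curly_brackets("Hi {first}, id: {id}", ["first", "last", "fullname"])
#     'Hi {first}, id: {{id}}'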
SMART_OPEN = "“"
SMART_CLOSE = "”"
START_CHAR = ("'", '"', SMART_OPEN)
def split_quotes(text: str) -> List:
if not any(text.startswith(char) for char in START_CHAR):
return text.split(None, 1)
counter = 1 # ignore first char -> is some kind of quote
while counter < len(text):
if text[counter] == "\\":
counter += 1
elif text[counter] == text[0] or (
text[0] == SMART_OPEN and text[counter] == SMART_CLOSE
):
break
counter += 1
else:
return text.split(None, 1)
# 1 to avoid starting quote, and counter is exclusive so avoids ending
key = remove_escapes(text[1:counter].strip())
# index will be in range, or `else` would have been executed and returned
rest = text[counter + 1 :].strip()
if not key:
key = text[0] + text[0]
return list(filter(None, [key, rest]))
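# Illustrative usage (sketch):
#
#     >>> split_quotes('"two words" and the rest')
#     ['two words', 'and the rest']
#     >>> split_quotes("oneword and the rest")
#     ['oneword', 'and the rest']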
def remove_escapes(text: str) -> str:
res = ""
is_escaped = False
for counter in range(len(text)):
if is_escaped:
res += text[counter]
is_escaped = False
elif text[counter] == "\\":
is_escaped = True
else:
res += text[counter]
return res
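# Illustrative usage (sketch):
#
#     >>> remove_escapes(r"\*not bold\*")
#     '*not bold*'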
def escape_chars(text: str, to_escape: List[str]) -> str:
to_escape.append("\\")
new_text = ""
for x in text:
if x in to_escape:
new_text += "\\"
new_text += x
return new_text
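# Illustrative usage (sketch): note that backslashes are always escaped as well,
# and that the function appends "\\" to the list passed in by the caller.
#
#     >>> escape_chars("a_b*c", ["_", "*"])
#     'a\\_b\\*c'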
async def extract_time(message, time_val):
if any(time_val.endswith(unit) for unit in ("m", "h", "d")):
unit = time_val[-1]
time_num = time_val[:-1] # type: str
if not time_num.isdigit():
await message.reply_text("Invalid time amount specified.")
return ""
if unit == "m":
bantime = int(time.time() + int(time_num) * 60)
elif unit == "h":
bantime = int(time.time() + int(time_num) * 60 * 60)
elif unit == "d":
bantime = int(time.time() + int(time_num) * 24 * 60 * 60)
else:
# how even...?
return ""
return bantime
else:
await message.reply_text(
"Invalid time type specified. Expected m,h, or d, got: {}".format(
time_val[-1],
),
)
return ""
def markdown_to_html(text: str):
text = text.replace("*", "**")
text = text.replace("`", "```")
text = text.replace("~", "~~")
spoiler_pattern = re.compile(r"\|\|(?=\S)(.+?)(?<=\S)\|\|", re.S)
text = spoiler_pattern.sub(r"<tg-spoiler>\1</tg-spoiler>", text)
_html = markdown2.markdown(text, extras=["strike", "underline"])
return bleach.clean(
_html,
tags=["strong", "em", "a", "code", "pre", "strike", "u", "tg-spoiler"],
strip=True,
)[:-1]
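# Illustrative usage (sketch; the exact HTML depends on the installed markdown2
# and bleach versions, so treat the output as approximate):
#
#     >>> markdown_to_html("*bold* and `mono`")
#     '<strong>bold</strong> and <code>mono</code>'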
# <================================================ END =======================================================>