Spaces:
Runtime error
Runtime error
import html.entities | |
from typing import Dict, List, Optional | |
from . import config | |
unifiable_n = { | |
html.entities.name2codepoint[k]: v | |
for k, v in config.UNIFIABLE.items() | |
if k != "nbsp" | |
} | |
def hn(tag: str) -> int: | |
if tag[0] == "h" and len(tag) == 2: | |
n = tag[1] | |
if "0" < n <= "9": | |
return int(n) | |
return 0 | |
def dumb_property_dict(style: str) -> Dict[str, str]: | |
""" | |
:returns: A hash of css attributes | |
""" | |
return { | |
x.strip().lower(): y.strip().lower() | |
for x, y in [z.split(":", 1) for z in style.split(";") if ":" in z] | |
} | |
def dumb_css_parser(data: str) -> Dict[str, Dict[str, str]]: | |
""" | |
:type data: str | |
:returns: A hash of css selectors, each of which contains a hash of | |
css attributes. | |
:rtype: dict | |
""" | |
# remove @import sentences | |
data += ";" | |
importIndex = data.find("@import") | |
while importIndex != -1: | |
data = data[0:importIndex] + data[data.find(";", importIndex) + 1 :] | |
importIndex = data.find("@import") | |
# parse the css. reverted from dictionary comprehension in order to | |
# support older pythons | |
pairs = [x.split("{") for x in data.split("}") if "{" in x.strip()] | |
try: | |
elements = {a.strip(): dumb_property_dict(b) for a, b in pairs} | |
except ValueError: | |
elements = {} # not that important | |
return elements | |
def element_style( | |
attrs: Dict[str, Optional[str]], | |
style_def: Dict[str, Dict[str, str]], | |
parent_style: Dict[str, str], | |
) -> Dict[str, str]: | |
""" | |
:type attrs: dict | |
:type style_def: dict | |
:type style_def: dict | |
:returns: A hash of the 'final' style attributes of the element | |
:rtype: dict | |
""" | |
style = parent_style.copy() | |
if "class" in attrs: | |
assert attrs["class"] is not None | |
for css_class in attrs["class"].split(): | |
css_style = style_def.get("." + css_class, {}) | |
style.update(css_style) | |
if "style" in attrs: | |
assert attrs["style"] is not None | |
immediate_style = dumb_property_dict(attrs["style"]) | |
style.update(immediate_style) | |
return style | |
def google_list_style(style: Dict[str, str]) -> str: | |
""" | |
Finds out whether this is an ordered or unordered list | |
:type style: dict | |
:rtype: str | |
""" | |
if "list-style-type" in style: | |
list_style = style["list-style-type"] | |
if list_style in ["disc", "circle", "square", "none"]: | |
return "ul" | |
return "ol" | |
def google_has_height(style: Dict[str, str]) -> bool: | |
""" | |
Check if the style of the element has the 'height' attribute | |
explicitly defined | |
:type style: dict | |
:rtype: bool | |
""" | |
return "height" in style | |
def google_text_emphasis(style: Dict[str, str]) -> List[str]: | |
""" | |
:type style: dict | |
:returns: A list of all emphasis modifiers of the element | |
:rtype: list | |
""" | |
emphasis = [] | |
if "text-decoration" in style: | |
emphasis.append(style["text-decoration"]) | |
if "font-style" in style: | |
emphasis.append(style["font-style"]) | |
if "font-weight" in style: | |
emphasis.append(style["font-weight"]) | |
return emphasis | |
def google_fixed_width_font(style: Dict[str, str]) -> bool: | |
""" | |
Check if the css of the current element defines a fixed width font | |
:type style: dict | |
:rtype: bool | |
""" | |
font_family = "" | |
if "font-family" in style: | |
font_family = style["font-family"] | |
return "courier new" == font_family or "consolas" == font_family | |
def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int: | |
""" | |
Extract numbering from list element attributes | |
:type attrs: dict | |
:rtype: int or None | |
""" | |
if "start" in attrs: | |
assert attrs["start"] is not None | |
try: | |
return int(attrs["start"]) - 1 | |
except ValueError: | |
pass | |
return 0 | |
def skipwrap( | |
para: str, wrap_links: bool, wrap_list_items: bool, wrap_tables: bool | |
) -> bool: | |
# If it appears to contain a link | |
# don't wrap | |
if not wrap_links and config.RE_LINK.search(para): | |
return True | |
# If the text begins with four spaces or one tab, it's a code block; | |
# don't wrap | |
if para[0:4] == " " or para[0] == "\t": | |
return True | |
# If the text begins with only two "--", possibly preceded by | |
# whitespace, that's an emdash; so wrap. | |
stripped = para.lstrip() | |
if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-": | |
return False | |
# I'm not sure what this is for; I thought it was to detect lists, | |
# but there's a <br>-inside-<span> case in one of the tests that | |
# also depends upon it. | |
if stripped[0:1] in ("-", "*") and not stripped[0:2] == "**": | |
return not wrap_list_items | |
# If text contains a pipe character it is likely a table | |
if not wrap_tables and config.RE_TABLE.search(para): | |
return True | |
# If the text begins with a single -, *, or +, followed by a space, | |
# or an integer, followed by a ., followed by a space (in either | |
# case optionally proceeded by whitespace), it's a list; don't wrap. | |
return bool( | |
config.RE_ORDERED_LIST_MATCHER.match(stripped) | |
or config.RE_UNORDERED_LIST_MATCHER.match(stripped) | |
) | |
def escape_md(text: str) -> str: | |
""" | |
Escapes markdown-sensitive characters within other markdown | |
constructs. | |
""" | |
return config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text) | |
def escape_md_section( | |
text: str, | |
escape_backslash: bool = True, | |
snob: bool = False, | |
escape_dot: bool = True, | |
escape_plus: bool = True, | |
escape_dash: bool = True | |
) -> str: | |
""" | |
Escapes markdown-sensitive characters across whole document sections. | |
Each escaping operation can be controlled individually. | |
""" | |
if escape_backslash: | |
text = config.RE_MD_BACKSLASH_MATCHER.sub(r"\\\1", text) | |
if snob: | |
text = config.RE_MD_CHARS_MATCHER_ALL.sub(r"\\\1", text) | |
if escape_dot: | |
text = config.RE_MD_DOT_MATCHER.sub(r"\1\\\2", text) | |
if escape_plus: | |
text = config.RE_MD_PLUS_MATCHER.sub(r"\1\\\2", text) | |
if escape_dash: | |
text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text) | |
return text | |
def reformat_table(lines: List[str], right_margin: int) -> List[str]: | |
""" | |
Given the lines of a table | |
padds the cells and returns the new lines | |
""" | |
# find the maximum width of the columns | |
max_width = [len(x.rstrip()) + right_margin for x in lines[0].split("|")] | |
max_cols = len(max_width) | |
for line in lines: | |
cols = [x.rstrip() for x in line.split("|")] | |
num_cols = len(cols) | |
# don't drop any data if colspan attributes result in unequal lengths | |
if num_cols < max_cols: | |
cols += [""] * (max_cols - num_cols) | |
elif max_cols < num_cols: | |
max_width += [len(x) + right_margin for x in cols[-(num_cols - max_cols) :]] | |
max_cols = num_cols | |
max_width = [ | |
max(len(x) + right_margin, old_len) for x, old_len in zip(cols, max_width) | |
] | |
# reformat | |
new_lines = [] | |
for line in lines: | |
cols = [x.rstrip() for x in line.split("|")] | |
if set(line.strip()) == set("-|"): | |
filler = "-" | |
new_cols = [ | |
x.rstrip() + (filler * (M - len(x.rstrip()))) | |
for x, M in zip(cols, max_width) | |
] | |
new_lines.append("|-" + "|".join(new_cols) + "|") | |
else: | |
filler = " " | |
new_cols = [ | |
x.rstrip() + (filler * (M - len(x.rstrip()))) | |
for x, M in zip(cols, max_width) | |
] | |
new_lines.append("| " + "|".join(new_cols) + "|") | |
return new_lines | |
def pad_tables_in_text(text: str, right_margin: int = 1) -> str: | |
""" | |
Provide padding for tables in the text | |
""" | |
lines = text.split("\n") | |
table_buffer = [] # type: List[str] | |
table_started = False | |
new_lines = [] | |
for line in lines: | |
# Toggle table started | |
if config.TABLE_MARKER_FOR_PAD in line: | |
table_started = not table_started | |
if not table_started: | |
table = reformat_table(table_buffer, right_margin) | |
new_lines.extend(table) | |
table_buffer = [] | |
new_lines.append("") | |
continue | |
# Process lines | |
if table_started: | |
table_buffer.append(line) | |
else: | |
new_lines.append(line) | |
return "\n".join(new_lines) | |