Spaces:
Sleeping
Sleeping
File size: 3,856 Bytes
5caedb4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
import html
import re
from dataclasses import dataclass
from typing import List
PLOT_ENCODINGS = ["image", "html", "df"]
@dataclass
class PlotData:
"""
Data to plot.
Args:
data: the data to plot:
- a base64 encoded PNG if `encoding` is `png`.
- a string in HTML if `encoding` is `html`.
- a path to a parquet file if `encoding` is `df`.
encoding: the encoding of the data, one of PLOT_ENCODINGS.
"""
data: str
encoding: str
def __post_init__(self):
assert self.encoding in PLOT_ENCODINGS, f"Unknown plot encoding {self.encoding}"
def get_line_separator_html():
return (
"<div style='height: 1px; width: 100%; margin: 1em 0; "
"background-color: white; background-color: var(--text);'></div>"
)
def decode_bytes(chunks: List[bytes]):
"""Decodes bytes to string
Args:
chunks: byte chunks
Returns:
list of decoded strings
"""
decoded_tokens = []
buffer = b""
for chunk in chunks:
combined = buffer + chunk
try:
# Try to decode the combined bytes
decoded_tokens.append(combined.decode("utf-8"))
# If successful, clear the buffer
buffer = b""
except UnicodeDecodeError:
# If decoding failed, keep the current chunk in the buffer
# and attempt to combine it with the next chunk
buffer = chunk
# Attempt to decode any remaining bytes in the buffer
try:
decoded_tokens.append(buffer.decode("utf-8"))
except UnicodeDecodeError:
pass
return decoded_tokens
def format_for_markdown_visualization(text: str) -> str:
"""
Convert newlines to <br /> tags, except for those inside code blocks.
This is needed because the markdown_table_cell_type() function does not
convert newlines to <br /> tags, so we have to do it ourselves.
This function is rather simple and may fail on text that uses `
in some other context than marking code cells or uses ` within
the code itself (as this function).
"""
code_block_regex = r"(```.*?```|``.*?``)"
parts = re.split(code_block_regex, text, flags=re.DOTALL)
for i in range(len(parts)):
# Only substitute for text outside matched code blocks
if "`" not in parts[i]:
parts[i] = parts[i].replace("\n", "<br />").strip()
text = "".join(parts)
# Restore newlines around code blocks, needed for correct rendering
for x in ["```", "``", "`"]:
text = text.replace(f"<br />{x}", f"\n{x}")
text = text.replace(f"{x}<br />", f"{x}\n")
return html.escape(text.replace("<br />", "\n"))
def list_to_markdown_representation(
tokens: List[str], masks: List[bool], pad_token: int, num_chars: int = 65
):
"""
Creates a markdown representation string from a list of tokens,
with HTML line breaks after 'num_chars' characters.
Masked tokens will be emphasized in HTML representation.
"""
x = []
sublist: List[str] = []
raw_sublist: List[str] = []
for token, mask in zip(tokens, masks):
if len(token) + len(", ".join(raw_sublist)) > num_chars:
x.append(", ".join(sublist))
sublist = []
raw_sublist = []
raw_sublist.append(token)
token_formatted = html.escape(token)
if mask:
token_formatted = f"""***{token_formatted}***"""
elif token == pad_token:
token_formatted = f"""<span style="color: rgba(70, 70, 70, 0.5);">{
token_formatted
}</span>"""
sublist.append(token_formatted)
if sublist: # add any remaining items in sublist
x.append(", ".join(sublist))
list_representation = "\n[" + "<br />".join(x) + "]\n"
return list_representation
|