Spaces:

qinfeng722
/

llm-studio

Running

File size: 3,856 Bytes

5caedb4

import html
import re
from dataclasses import dataclass
from typing import List

PLOT_ENCODINGS = ["image", "html", "df"]


@dataclass
class PlotData:
    """
    Data to plot.

    Args:
        data: the data to plot:
            - a base64 encoded PNG if `encoding` is `png`.
            - a string in HTML if `encoding` is `html`.
            - a path to a parquet file if `encoding` is `df`.
        encoding: the encoding of the data, one of PLOT_ENCODINGS.
    """

    data: str
    encoding: str

    def __post_init__(self):
        assert self.encoding in PLOT_ENCODINGS, f"Unknown plot encoding {self.encoding}"


def get_line_separator_html():
    return (
        "<div style='height: 1px; width: 100%; margin: 1em 0; "
        "background-color: white; background-color: var(--text);'></div>"
    )


def decode_bytes(chunks: List[bytes]):
    """Decodes bytes to string

    Args:
        chunks: byte chunks

    Returns:
        list of decoded strings
    """
    decoded_tokens = []
    buffer = b""

    for chunk in chunks:
        combined = buffer + chunk
        try:
            # Try to decode the combined bytes
            decoded_tokens.append(combined.decode("utf-8"))
            # If successful, clear the buffer
            buffer = b""
        except UnicodeDecodeError:
            # If decoding failed, keep the current chunk in the buffer
            # and attempt to combine it with the next chunk
            buffer = chunk

    # Attempt to decode any remaining bytes in the buffer
    try:
        decoded_tokens.append(buffer.decode("utf-8"))
    except UnicodeDecodeError:
        pass

    return decoded_tokens


def format_for_markdown_visualization(text: str) -> str:
    """
    Convert newlines to <br /> tags, except for those inside code blocks.
    This is needed because the markdown_table_cell_type() function does not
    convert newlines to <br /> tags, so we have to do it ourselves.

    This function is rather simple and may fail on text that uses `
    in some other context than marking code cells or uses ` within
    the code itself (as this function).
    """
    code_block_regex = r"(```.*?```|``.*?``)"
    parts = re.split(code_block_regex, text, flags=re.DOTALL)
    for i in range(len(parts)):
        # Only substitute for text outside matched code blocks
        if "`" not in parts[i]:
            parts[i] = parts[i].replace("\n", "<br />").strip()
    text = "".join(parts)

    # Restore newlines around code blocks, needed for correct rendering
    for x in ["```", "``", "`"]:
        text = text.replace(f"<br />{x}", f"\n{x}")
        text = text.replace(f"{x}<br />", f"{x}\n")
    return html.escape(text.replace("<br />", "\n"))


def list_to_markdown_representation(
    tokens: List[str], masks: List[bool], pad_token: int, num_chars: int = 65
):
    """
    Creates a markdown representation string from a list of tokens,
    with HTML line breaks after 'num_chars' characters.
    Masked tokens will be emphasized in HTML representation.

    """
    x = []
    sublist: List[str] = []
    raw_sublist: List[str] = []
    for token, mask in zip(tokens, masks):
        if len(token) + len(", ".join(raw_sublist)) > num_chars:
            x.append(", ".join(sublist))
            sublist = []
            raw_sublist = []

        raw_sublist.append(token)
        token_formatted = html.escape(token)
        if mask:
            token_formatted = f"""***{token_formatted}***"""
        elif token == pad_token:
            token_formatted = f"""<span style="color: rgba(70, 70, 70, 0.5);">{
            token_formatted
            }</span>"""
        sublist.append(token_formatted)

    if sublist:  # add any remaining items in sublist
        x.append(", ".join(sublist))

    list_representation = "\n[" + "<br />".join(x) + "]\n"
    return list_representation