import gradio as gr
import markdown
from markdown.extensions.tables import TableExtension
from markdown.extensions.fenced_code import FencedCodeExtension
from markdown.extensions.toc import TocExtension
from markdown.extensions.attr_list import AttrListExtension
from markdown.extensions.codehilite import CodeHiliteExtension

# For ReaderLM-2
from transformers import pipeline

# For ReaderLM-1
from transformers import AutoTokenizer, AutoModelForCausalLM

import spaces
import re
from markdownify import markdownify

######################################
# 1) MARKDOWN-STUDIO FUNCTIONALITY   #
######################################

def render_markdown(md_text):
    """
    Render a string of Markdown text into HTML with a number of useful extensions.
    """
    return markdown.markdown(
        md_text,
        extensions=[
            TableExtension(),
            FencedCodeExtension(),
            TocExtension(baselevel=2),
            AttrListExtension(),
            CodeHiliteExtension(linenums=False, css_class="highlight"),
        ],
    )

######################################
# 2) READERLM-2 FUNCTIONALITY        #
######################################

# Load the JinaAI ReaderLM-v2 model
model_name = "jinaai/ReaderLM-v2"
html_converter = pipeline("text-generation", model=model_name)

def convert_html(html_input, output_format):
    """
    Use the JinaAI ReaderLM-v2 pipeline to convert HTML into Markdown or JSON.
    """
    prompt = f"Convert the following HTML into {output_format}:\n\n{html_input}"
    response = html_converter(prompt, max_length=500, num_return_sequences=1)
    converted_output = response[0]['generated_text']
    # Remove the prompt from the start of the generated text, if present
    converted_output = converted_output.replace(prompt, "").strip()
    return converted_output

######################################
# 3) READERLM-1 FUNCTIONALITY        #
######################################

# Prepare models and tokenizers
models = {
    "jinaai/reader-lm-0.5b": AutoModelForCausalLM.from_pretrained(
        "jinaai/reader-lm-0.5b", trust_remote_code=True
    ).eval().to("cuda"),
    "jinaai/reader-lm-1.5b": AutoModelForCausalLM.from_pretrained(
        "jinaai/reader-lm-1.5b", trust_remote_code=True
    ).eval().to("cuda"),
}
tokenizers = {
    "jinaai/reader-lm-0.5b": AutoTokenizer.from_pretrained(
        "jinaai/reader-lm-0.5b", trust_remote_code=True
    ),
    "jinaai/reader-lm-1.5b": AutoTokenizer.from_pretrained(
        "jinaai/reader-lm-1.5b", trust_remote_code=True
    ),
}

@spaces.GPU
def run_example(html_content, model_id="jinaai/reader-lm-1.5b"):
    """
    Use ReaderLM (0.5b or 1.5b) to generate model-based HTML-to-Markdown text,
    then also provide a rule-based 'markdownify' output for comparison.
    """
    model = models[model_id]
    tokenizer = tokenizers[model_id]

    # Construct the chat-based input
    messages = [{"role": "user", "content": html_content}]
    input_text = tokenizer.apply_chat_template(messages, tokenize=False)

    # Tokenize
    inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    # Generate (greedy decoding; temperature is irrelevant when do_sample=False)
    outputs = model.generate(
        inputs,
        max_new_tokens=1024,
        do_sample=False,
        repetition_penalty=1.08,
    )

    # Extract the model's text from the response
    pattern = r"<\|im_start\|>assistant(.*?)<\|im_end\|>"
    assistant_response = re.findall(pattern, tokenizer.decode(outputs[0]), re.DOTALL)

    # Also do a rule-based markdownify conversion for comparison
    markdownify_output = markdownify(html_content)

    # Return the two results (model-based, rule-based); guard against a
    # truncated generation that never emitted the closing <|im_end|> token
    model_output = assistant_response[0] if assistant_response else ""
    return model_output, markdownify_output

# Example HTML from ReaderLM-1
example_html = """
<div>This is a test.</div>
"""

# (html_input, output_format) example pairs for the ReaderLM-2 converter
examples = [
    [example_html, "Markdown"],
]