import gradio as gr
import markdown
from markdown.extensions.tables import TableExtension
from markdown.extensions.fenced_code import FencedCodeExtension
from markdown.extensions.toc import TocExtension
from markdown.extensions.attr_list import AttrListExtension
from markdown.extensions.codehilite import CodeHiliteExtension

# For ReaderLM-2
from transformers import pipeline

# For ReaderLM-1
from transformers import AutoTokenizer, AutoModelForCausalLM
import spaces
import re
from markdownify import markdownify

######################################
# 1) MARKDOWN-STUDIO FUNCTIONALITY   #
######################################

def render_markdown(md_text):
    """
    Render a string of Markdown text into HTML with a number of useful extensions.
    """
    return markdown.markdown(
        md_text,
        extensions=[
            TableExtension(),
            FencedCodeExtension(),
            TocExtension(baselevel=2),
            AttrListExtension(),
            CodeHiliteExtension(linenums=False, css_class="highlight"),
        ],
    )

######################################
# 2) READERLM-2 FUNCTIONALITY        #
######################################

# Load the JinaAI ReaderLM-v2 model
model_name = "jinaai/ReaderLM-v2"
html_converter = pipeline("text-generation", model=model_name)

def convert_html(html_input, output_format):
    """
    Use the JinaAI ReaderLM-v2 pipeline to convert HTML into Markdown or JSON.
    """
    prompt = f"Convert the following HTML into {output_format}:\n\n{html_input}"
    # Cap only the newly generated tokens so long HTML inputs are not truncated
    response = html_converter(prompt, max_new_tokens=1024, num_return_sequences=1)
    converted_output = response[0]['generated_text']
    # Remove the prompt from the start of the generated text, if present
    converted_output = converted_output.replace(prompt, "").strip()
    return converted_output

######################################
# 3) READERLM-1 FUNCTIONALITY        #
######################################

# Prepare models and tokenizers
# NOTE: both checkpoints are moved to "cuda" at import time, so a CUDA-capable
# GPU must be available when this module loads.
models = {
    "jinaai/reader-lm-0.5b": AutoModelForCausalLM.from_pretrained(
        "jinaai/reader-lm-0.5b", trust_remote_code=True
    ).eval().to("cuda"),
    "jinaai/reader-lm-1.5b": AutoModelForCausalLM.from_pretrained(
        "jinaai/reader-lm-1.5b", trust_remote_code=True
    ).eval().to("cuda"),
}

tokenizers = {
    "jinaai/reader-lm-0.5b": AutoTokenizer.from_pretrained(
        "jinaai/reader-lm-0.5b", trust_remote_code=True
    ),
    "jinaai/reader-lm-1.5b": AutoTokenizer.from_pretrained(
        "jinaai/reader-lm-1.5b", trust_remote_code=True
    ),
}

@spaces.GPU
def run_example(html_content, model_id="jinaai/reader-lm-1.5b"):
    """
    Use ReaderLM (0.5b or 1.5b) to generate model-based HTML-to-Markdown text,
    then also provide a rule-based 'markdownify' output.
    """
    model = models[model_id]
    tokenizer = tokenizers[model_id]

    # Construct the chat-based input
    messages = [{"role": "user", "content": html_content}]
    input_text = tokenizer.apply_chat_template(messages, tokenize=False)

    # Tokenize
    inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    # Generate
    outputs = model.generate(
        inputs,
        max_new_tokens=1024,
        temperature=0,
        do_sample=False,
        repetition_penalty=1.08,
    )

    # Extract the model's text from the response
    pattern = r"<\|im_start\|>assistant(.*?)<\|im_end\|>"
    assistant_response = re.findall(pattern, tokenizer.decode(outputs[0]), re.DOTALL)
    # If generation stopped before the closing <|im_end|> tag, fall back to
    # decoding only the newly generated tokens.
    if assistant_response:
        model_output = assistant_response[0].strip()
    else:
        model_output = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)

    # Also do a rule-based markdownify for comparison
    markdownify_output = markdownify(html_content)

    # Return the two results (model-based, rule-based)
    return model_output, markdownify_output

# Example HTML for the ReaderLM-1 tab
example_html = """<html>
<body>
<h1>My To Do List</h1>
<button>Add</button>
</body>
</html>"""
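# Usage note (illustrative, not wired into the UI): example_html above is what the
# "Try example HTML" entry in the ReaderLM-1 tab feeds to run_example(). On a
# machine with a CUDA GPU the same conversion can be triggered directly, e.g.
#   model_md, rule_md = run_example(example_html, "jinaai/reader-lm-0.5b")
# where model_md is the ReaderLM output and rule_md is the markdownify output.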
""" ######################################################## # Combine everything into a single Gradio Blocks app # ######################################################## # Optional extra CSS for the ReaderLM-1 tab css = """ #output { height: 500px; overflow: auto; border: 1px solid #ccc; } """ # We use the Nymbo/Nymbo_Theme from the original Markdown-Studio example with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo: ######################################################## # TAB 1: Markdown Suite (live preview) ######################################################## with gr.Tab("Live Preview"): gr.Markdown("# Markdown Suite") with gr.Row(): with gr.Column(): md_input = gr.Textbox( lines=20, placeholder="Write your markdown here...", label="Markdown Input", ) with gr.Column(): md_output = gr.HTML( label="Rendered Output" ) md_input.change(fn=render_markdown, inputs=md_input, outputs=md_output) ######################################################## # TAB 2: ReaderLM-2 Converter (HTML → Markdown/JSON) ######################################################## with gr.Tab("ReaderLM-2 Converter"): gr.Markdown("## HTML to Markdown/JSON Converter (ReaderLM-v2)") with gr.Row(): html_input_2 = gr.Textbox( lines=10, placeholder="Paste your raw HTML here...", label="Raw HTML Input" ) output_format_2 = gr.Radio( ["Markdown", "JSON"], label="Output Format", value="Markdown" ) convert_btn_2 = gr.Button("Convert") converted_output_2 = gr.Textbox( lines=10, label="Converted Output" ) # Provide usage details gr.Markdown( "Convert raw HTML into formatted Markdown or JSON using **JinaAI ReaderLM-v2**." ) # Button event: calls convert_html convert_btn_2.click( fn=convert_html, inputs=[html_input_2, output_format_2], outputs=converted_output_2 ) # Examples gr.Examples( examples=[ ["

Hello World

This is a test.

", "Markdown"], ["", "JSON"] ], inputs=[html_input_2, output_format_2], outputs=converted_output_2, fn=convert_html, cache_examples=False ) ######################################################## # TAB 3: ReaderLM-1 HTML-to-Markdown ######################################################## with gr.Tab("ReaderLM-1 Converter"): gr.Markdown(""" # HTML-to-Markdown with ReaderLM-1 Use either **jinaai/reader-lm-0.5b** or **jinaai/reader-lm-1.5b** to convert HTML to Markdown. Compare against rule-based `markdownify`. """) with gr.Row(): with gr.Column(): model_selector = gr.Dropdown( choices=list(models.keys()), label="Model", value="jinaai/reader-lm-1.5b" ) html_content = gr.Textbox( label="HTML" ) submit_btn = gr.Button(value="Submit") with gr.Column(): model_output_text = gr.Textbox(label="Reader LM Output") markdownify_output = gr.Textbox(label="Markdownify Output") # Example usage gr.Examples( examples=[ [example_html], ], inputs=[html_content], outputs=[model_output_text, markdownify_output], fn=run_example, cache_examples=True, label="Try example HTML" ) # Button event for custom input submit_btn.click( fn=run_example, inputs=[html_content, model_selector], outputs=[model_output_text, markdownify_output] ) # Finally, launch the combined demo demo.launch()