import re

import gradio as gr
import markdown
from markdown.extensions.tables import TableExtension
from markdown.extensions.fenced_code import FencedCodeExtension
from markdown.extensions.toc import TocExtension
from markdown.extensions.attr_list import AttrListExtension
from markdown.extensions.codehilite import CodeHiliteExtension
# For ReaderLM-2 functionality
from transformers import pipeline
# For ReaderLM-1 functionality
from transformers import AutoTokenizer, AutoModelForCausalLM
import spaces
from markdownify import markdownify


######################################
# 1) MARKDOWN-STUDIO FUNCTIONALITY   #
######################################

def render_markdown(md_text):
    """Render Markdown text to an HTML string.

    Enables the tables, fenced-code, table-of-contents (base heading
    level 2), attribute-list and codehilite extensions.

    Args:
        md_text: Markdown source text.

    Returns:
        The HTML produced by ``markdown.markdown``.
    """
    print("Rendering markdown input to HTML...")  # Debug log
    return markdown.markdown(
        md_text,
        extensions=[
            TableExtension(),            # Markdown tables
            FencedCodeExtension(),       # ``` fenced code blocks
            TocExtension(baselevel=2),   # TOC, headings start at level 2
            AttrListExtension(),         # {: .class #id } attribute lists
            CodeHiliteExtension(linenums=False, css_class="highlight"),
        ],
    )


######################################
# 2) READERLM-2 FUNCTIONALITY        #
######################################

# Load the JinaAI ReaderLM-v2 model
model_name = "jinaai/ReaderLM-v2"
print(f"Loading model: {model_name}...")  # Debug log
html_converter = pipeline("text-generation", model=model_name)


def convert_html(html_input, output_format, custom_prompt=None):
    """Convert HTML to Markdown or JSON with the ReaderLM-v2 pipeline.

    Args:
        html_input: Raw HTML text to convert.
        output_format: Target format name inserted into the default
            instruction (the UI offers "Markdown" and "JSON").
        custom_prompt: Optional instruction. When truthy it replaces the
            default "Convert the following HTML into ..." instruction.

    Returns:
        The generated text with surrounding whitespace stripped.
    """
    if custom_prompt:
        prompt = f"{custom_prompt}\n\n{html_input}"
    else:
        prompt = f"Convert the following HTML into {output_format}:\n\n{html_input}"

    print(f"Converting HTML to {output_format} with prompt: {custom_prompt or 'default'}...")  # Debug log
    print(f"HTML input: {html_input[:100]}...")  # Debug log, preview first 100 characters of input

    # return_full_text=False makes the pipeline return only the newly
    # generated text. The previous str.replace(prompt, "") approach silently
    # failed whenever the decoded text did not reproduce the prompt
    # byte-for-byte, and also deleted any later repeat of the prompt.
    response = html_converter(
        prompt,
        max_length=9999,
        num_return_sequences=1,
        return_full_text=False,
    )
    converted_output = response[0]["generated_text"].strip()

    print("Conversion completed.")  # Debug log
    return converted_output


######################################
# 3) READERLM-1 FUNCTIONALITY        #
######################################

# Prepare models and tokenizers for ReaderLM-1
print("Loading ReaderLM-1 models and tokenizers...")  # Debug log
models = {
    # Smaller 0.5b model, moved to the GPU
    "jinaai/reader-lm-0.5b": AutoModelForCausalLM.from_pretrained(
        "jinaai/reader-lm-0.5b", trust_remote_code=True
    ).eval().to("cuda"),
    # Larger 1.5b model, moved to the GPU
    "jinaai/reader-lm-1.5b": AutoModelForCausalLM.from_pretrained(
        "jinaai/reader-lm-1.5b", trust_remote_code=True
    ).eval().to("cuda"),
}
tokenizers = {
    "jinaai/reader-lm-0.5b": AutoTokenizer.from_pretrained(
        "jinaai/reader-lm-0.5b", trust_remote_code=True
    ),
    "jinaai/reader-lm-1.5b": AutoTokenizer.from_pretrained(
        "jinaai/reader-lm-1.5b", trust_remote_code=True
    ),
}

# Markers that delimit the assistant turn in the decoded model output.
_ASSISTANT_MARKER = "<|im_start|>assistant"
_ASSISTANT_TURN = re.compile(r"<\|im_start\|>assistant(.*?)<\|im_end\|>", re.DOTALL)


def _extract_assistant_text(decoded):
    """Return the assistant turn from decoded output, tolerating truncation."""
    match = _ASSISTANT_TURN.search(decoded)
    if match:
        return match.group(1)
    # Generation can stop at max_new_tokens before "<|im_end|>" is emitted;
    # return whatever follows the assistant marker instead of raising.
    # partition() yields an empty tail when the marker is absent.
    return decoded.partition(_ASSISTANT_MARKER)[2]


@spaces.GPU
def run_example(html_content, model_id="jinaai/reader-lm-1.5b"):
    """Convert HTML to Markdown with a ReaderLM-1 model and with markdownify.

    Args:
        html_content: Raw HTML text.
        model_id: Key into ``models`` / ``tokenizers``.

    Returns:
        A ``(model_markdown, markdownify_markdown)`` tuple. The first item
        is the assistant turn extracted from the decoded generation (the
        partial text if generation was cut off, or ``""`` if no assistant
        turn is present); the second is ``markdownify(html_content)``.

    Raises:
        KeyError: If ``model_id`` is not a key of ``models``.
    """
    print(f"Running example with model: {model_id}...")  # Debug log
    model = models[model_id]
    tokenizer = tokenizers[model_id]

    # Construct the chat-based input for the model
    messages = [{"role": "user", "content": html_content}]
    input_text = tokenizer.apply_chat_template(messages, tokenize=False)
    print(f"Generated input text for model: {input_text[:100]}...")  # Debug log, preview input text

    inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    # Greedy decoding. temperature is not passed: it is only used when
    # sampling, so with do_sample=False it had no effect on the output.
    outputs = model.generate(
        inputs,
        max_new_tokens=1024,
        do_sample=False,
        repetition_penalty=1.08,
    )

    assistant_response = _extract_assistant_text(tokenizer.decode(outputs[0]))
    print("Model generation completed.")  # Debug log

    # Rule-based conversion, shown next to the model output for comparison
    markdownify_output = markdownify(html_content)
    print("Rule-based markdownify output generated.")  # Debug log

    return assistant_response, markdownify_output


# Example HTML for ReaderLM-1
# NOTE(review): this sample contains no HTML tags as it stands; the markup
# looks like it was lost at some point -- confirm and restore the intended sample.
example_html = """

My To Do List

Add
"""

########################################################
# Combine everything into a single Gradio Blocks app   #
########################################################

# Optional extra CSS for styling the ReaderLM-1 tab.
# Comments must use /* ... */: CSS has no "#" comment syntax, and the
# previous "#" comments invalidated the overflow and border declarations.
# NOTE(review): no component visible in this file sets elem_id="output", so
# this rule currently matches nothing -- confirm the intended target.
css = """
#output {
    height: 500px;          /* Set the height of the output box */
    overflow: auto;         /* Enable scrolling for large content */
    border: 1px solid #ccc; /* Add a border around the box */
}
"""

# Initialize the Gradio app with the Nymbo/Nymbo_Theme for styling
print("Initializing Gradio app...")  # Debug log
with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:

    ########################################################
    # TAB 1: Markdown Suite (live preview)
    ########################################################
    with gr.Tab("Live Preview"):
        gr.Markdown("# Markdown Suite")

        with gr.Row():
            with gr.Column():
                md_input = gr.Textbox(
                    lines=20,
                    placeholder="Write your markdown here...",
                    label="Markdown Input",
                )
            with gr.Column():
                md_output = gr.HTML(
                    label="Rendered Output"
                )

        # Re-render the preview whenever the input changes
        md_input.change(fn=render_markdown, inputs=md_input, outputs=md_output)

    ########################################################
    # TAB 2: ReaderLM-2 Converter (HTML → Markdown/JSON)
    ########################################################
    with gr.Tab("ReaderLM-2 Converter"):
        gr.Markdown("## HTML to Markdown/JSON Converter (ReaderLM-v2)")

        # NOTE(review): row membership below was reconstructed from a
        # flattened source -- confirm the intended layout.
        with gr.Row():
            html_input_2 = gr.Textbox(
                lines=10,
                placeholder="Paste your raw HTML here...",
                label="Raw HTML Input"
            )
            output_format_2 = gr.Radio(
                ["Markdown", "JSON"],
                label="Output Format",
                value="Markdown"  # Default to Markdown output
            )

        custom_prompt_2 = gr.Textbox(
            lines=2,
            placeholder="Optional: Enter a custom prompt...",
            label="Custom System Prompt"
        )
        convert_btn_2 = gr.Button("Convert")
        converted_output_2 = gr.Textbox(
            lines=20,
            label="Converted Output"
        )

        # Provide usage details for the converter
        gr.Markdown(
            "Convert raw HTML into formatted Markdown or JSON using **JinaAI ReaderLM-v2**."
        )

        convert_btn_2.click(
            fn=convert_html,
            inputs=[html_input_2, output_format_2, custom_prompt_2],
            outputs=converted_output_2
        )

        # Example inputs. The custom-prompt slot is left empty on purpose:
        # any non-empty text there replaces the default conversion
        # instruction in convert_html.
        # NOTE(review): the example HTML strings contain no markup as they
        # stand -- confirm and restore the intended samples.
        gr.Examples(
            examples=[
                ["\n\nHello World\n\nThis is a test.\n\n", "Markdown", ""],
                ["", "JSON", ""]
            ],
            inputs=[html_input_2, output_format_2, custom_prompt_2],
            outputs=converted_output_2,
            fn=convert_html,
            cache_examples=False  # Disable caching for dynamic examples
        )

    ########################################################
    # TAB 3: ReaderLM-1 HTML-to-Markdown
    ########################################################
    with gr.Tab("ReaderLM-1 Converter"):
        gr.Markdown("""
        # HTML-to-Markdown with ReaderLM-1
        Use either **jinaai/reader-lm-0.5b** or **jinaai/reader-lm-1.5b** to convert HTML to Markdown.
        Compare against rule-based `markdownify`.
        """)

        with gr.Row():
            with gr.Column():
                model_selector = gr.Dropdown(
                    choices=list(models.keys()),
                    label="Model",
                    value="jinaai/reader-lm-1.5b"  # Default to the larger model
                )
                html_content = gr.Textbox(
                    label="HTML"
                )
                submit_btn = gr.Button(value="Submit")

            with gr.Column():
                model_output_text = gr.Textbox(label="Reader LM Output")
                markdownify_output = gr.Textbox(label="Markdownify Output")

        # Example HTML input; run_example falls back to its default model_id
        # because only the HTML is passed.
        gr.Examples(
            examples=[
                [example_html],
            ],
            inputs=[html_content],
            outputs=[model_output_text, markdownify_output],
            fn=run_example,
            cache_examples=True,  # Cache example outputs
            label="Try example HTML"
        )

        submit_btn.click(
            fn=run_example,
            inputs=[html_content, model_selector],
            outputs=[model_output_text, markdownify_output]
        )

# Finally, launch the combined demo app
print("Launching the demo...")  # Debug log
demo.launch()