import gradio as gr
import markdown
from markdown.extensions.tables import TableExtension
from markdown.extensions.fenced_code import FencedCodeExtension
from markdown.extensions.toc import TocExtension
from markdown.extensions.attr_list import AttrListExtension
from markdown.extensions.codehilite import CodeHiliteExtension

# For ReaderLM-2 functionality
from transformers import pipeline

# For ReaderLM-1 functionality
from transformers import AutoTokenizer, AutoModelForCausalLM
import spaces
import re
from markdownify import markdownify

######################################
# 1) MARKDOWN-STUDIO FUNCTIONALITY  #
######################################

def render_markdown(md_text):
    """
    Render a string of Markdown text into HTML using various Markdown extensions.
    - Supports tables, fenced code blocks, TOC, attribute lists, and syntax highlighting.
    """
    print("Rendering markdown input to HTML...")  # Debug log
    return markdown.markdown(
        md_text,
        extensions=[
            TableExtension(),  # Adds support for Markdown tables
            FencedCodeExtension(),  # Allows fenced code blocks
            TocExtension(baselevel=2),  # Table-of-contents support; headings are shifted to start at <h2>
            AttrListExtension(),  # Enables attribute lists on elements
            CodeHiliteExtension(linenums=False, css_class="highlight"),  # Syntax highlighting for code blocks
        ],
    )

######################################
# 2) READERLM-2 FUNCTIONALITY       #
######################################

# Load the JinaAI ReaderLM-v2 model
model_name = "jinaai/ReaderLM-v2"
print(f"Loading model: {model_name}...")  # Debug log
html_converter = pipeline("text-generation", model=model_name)

def convert_html(html_input, output_format, custom_prompt=None):
    """
    Use the JinaAI ReaderLM-v2 pipeline to convert HTML into Markdown or JSON.
    - Takes raw HTML as input and converts it to the specified output format.
    - Accepts an optional custom prompt that overrides the default instruction.
    """
    if custom_prompt:
        prompt = f"{custom_prompt}\n\n{html_input}"
    else:
        prompt = f"Convert the following HTML into {output_format}:\n\n{html_input}"

    print(f"Converting HTML to {output_format} with prompt: {custom_prompt or 'default'}...")  # Debug log
    print(f"HTML input: {html_input[:100]}...")  # Debug log: preview the first 100 characters of input

    # Use the pipeline to generate the conversion
    response = html_converter(prompt, max_length=9999, num_return_sequences=1)
    converted_output = response[0]["generated_text"]

    # The pipeline echoes the prompt; strip it so only the conversion remains
    converted_output = converted_output.replace(prompt, "").strip()
    print("Conversion completed.")  # Debug log
    return converted_output
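# Example usage of the two converters above (a minimal sketch; the expected
# HTML string assumes python-markdown's default TOC slugification, and the
# convert_html result is illustrative rather than a recorded model output):
#
#   render_markdown("# Title\n\nHello **world**")
#   # -> '<h2 id="title">Title</h2>\n<p>Hello <strong>world</strong></p>'
#
#   convert_html("<h1>Docs</h1><p>See the <a href='/guide'>guide</a>.</p>", "Markdown")
#   # -> model-generated Markdown, e.g. a heading plus a [guide](/guide) link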
######################################
# 3) READERLM-1 FUNCTIONALITY       #
######################################

# Prepare models and tokenizers for ReaderLM-1
print("Loading ReaderLM-1 models and tokenizers...")  # Debug log
models = {
    "jinaai/reader-lm-0.5b": AutoModelForCausalLM.from_pretrained(
        "jinaai/reader-lm-0.5b", trust_remote_code=True
    ).eval().to("cuda"),  # Load the smaller 0.5b model onto the GPU
    "jinaai/reader-lm-1.5b": AutoModelForCausalLM.from_pretrained(
        "jinaai/reader-lm-1.5b", trust_remote_code=True
    ).eval().to("cuda"),  # Load the larger 1.5b model onto the GPU
}
tokenizers = {
    "jinaai/reader-lm-0.5b": AutoTokenizer.from_pretrained(
        "jinaai/reader-lm-0.5b", trust_remote_code=True
    ),
    "jinaai/reader-lm-1.5b": AutoTokenizer.from_pretrained(
        "jinaai/reader-lm-1.5b", trust_remote_code=True
    ),
}

@spaces.GPU
def run_example(html_content, model_id="jinaai/reader-lm-1.5b"):
    """
    Generate Markdown from HTML using the ReaderLM (0.5b or 1.5b) models.
    - Returns both the model-based conversion and a rule-based markdownify output.
    """
    print(f"Running example with model: {model_id}...")  # Debug log
    model = models[model_id]  # Select the model based on the input ID
    tokenizer = tokenizers[model_id]  # Retrieve the corresponding tokenizer

    # Construct the chat-based input for the model
    messages = [{"role": "user", "content": html_content}]
    input_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True  # Append the assistant header so generation starts cleanly
    )
    print(f"Generated input text for model: {input_text[:100]}...")  # Debug log: preview input text

    # Tokenize the input text
    inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    # Generate output using greedy decoding (do_sample=False, so temperature is irrelevant)
    outputs = model.generate(
        inputs,
        max_new_tokens=1024,
        do_sample=False,
        repetition_penalty=1.08,
    )

    # Extract the assistant's response from the generated output
    pattern = r"<\|im_start\|>assistant(.*?)<\|im_end\|>"
    decoded = tokenizer.decode(outputs[0])
    matches = re.findall(pattern, decoded, re.DOTALL)
    assistant_response = matches[0] if matches else decoded  # Fall back to the raw decode if the markers are missing
    print("Model generation completed.")  # Debug log

    # Use markdownify as a rule-based fallback for comparison
    markdownify_output = markdownify(html_content)
    print("Rule-based markdownify output generated.")  # Debug log

    # Return both model-based and rule-based outputs
    return assistant_response, markdownify_output

# Example HTML for ReaderLM-1 (a minimal sample; the surrounding tags are illustrative)
example_html = """<html><body><p>This is a test.</p></body></html>"""
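# Example usage (a sketch; requires a CUDA device since both models live on the GPU):
#
#   model_md, rule_md = run_example(example_html, model_id="jinaai/reader-lm-0.5b")
#   print(model_md)  # Markdown produced by the language model
#   print(rule_md)   # Markdown produced by the rule-based markdownify fallback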
", "Markdown", "Optional custom prompt"], ["