import gradio as gr
import markdown
from markdown.extensions.tables import TableExtension
from markdown.extensions.fenced_code import FencedCodeExtension
from markdown.extensions.toc import TocExtension
from markdown.extensions.attr_list import AttrListExtension
from markdown.extensions.codehilite import CodeHiliteExtension

# For ReaderLM-2
from transformers import pipeline

# For ReaderLM-1
from transformers import AutoTokenizer, AutoModelForCausalLM

import spaces
import re
from markdownify import markdownify

######################################
# 1) MARKDOWN-STUDIO FUNCTIONALITY   #
######################################

def render_markdown(md_text):
    """
    Render a string of Markdown text into HTML with a number of useful extensions.
    """
    return markdown.markdown(
        md_text,
        extensions=[
            TableExtension(),
            FencedCodeExtension(),
            TocExtension(baselevel=2),
            AttrListExtension(),
            CodeHiliteExtension(linenums=False, css_class="highlight"),
        ],
    )

######################################
# 2) READERLM-2 FUNCTIONALITY        #
######################################

# Load the JinaAI ReaderLM-v2 model
model_name = "jinaai/ReaderLM-v2"
html_converter = pipeline("text-generation", model=model_name)

def convert_html(html_input, output_format):
    """
    Use the JinaAI ReaderLM-v2 pipeline to convert HTML into Markdown or JSON.
    """
    prompt = f"Convert the following HTML into {output_format}:\n\n{html_input}"
    response = html_converter(prompt, max_length=500, num_return_sequences=1)
    converted_output = response[0]['generated_text']
    # Remove the prompt from the start of the generated text, if present
    converted_output = converted_output.replace(prompt, "").strip()
    return converted_output

######################################
# 3) READERLM-1 FUNCTIONALITY        #
######################################

# Prepare models and tokenizers
models = {
    "jinaai/reader-lm-0.5b": AutoModelForCausalLM.from_pretrained(
        "jinaai/reader-lm-0.5b", trust_remote_code=True
    ).eval().to("cuda"),
    "jinaai/reader-lm-1.5b": AutoModelForCausalLM.from_pretrained(
        "jinaai/reader-lm-1.5b", trust_remote_code=True
    ).eval().to("cuda"),
}
tokenizers = {
    "jinaai/reader-lm-0.5b": AutoTokenizer.from_pretrained(
        "jinaai/reader-lm-0.5b", trust_remote_code=True
    ),
    "jinaai/reader-lm-1.5b": AutoTokenizer.from_pretrained(
        "jinaai/reader-lm-1.5b", trust_remote_code=True
    ),
}

@spaces.GPU
def run_example(html_content, model_id="jinaai/reader-lm-1.5b"):
    """
    Use ReaderLM (0.5b or 1.5b) to generate model-based HTML-to-Markdown text,
    then also provide a rule-based 'markdownify' output for comparison.
    """
    model = models[model_id]
    tokenizer = tokenizers[model_id]

    # Construct the chat-based input
    messages = [{"role": "user", "content": html_content}]
    input_text = tokenizer.apply_chat_template(messages, tokenize=False)

    # Tokenize
    inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    # Generate (greedy decoding; temperature is irrelevant when do_sample=False)
    outputs = model.generate(
        inputs,
        max_new_tokens=1024,
        do_sample=False,
        repetition_penalty=1.08,
    )

    # Extract the model's text from the response
    pattern = r"<\|im_start\|>assistant(.*?)<\|im_end\|>"
    assistant_response = re.findall(pattern, tokenizer.decode(outputs[0]), re.DOTALL)

    # Also do a rule-based markdownify conversion for comparison
    markdownify_output = markdownify(html_content)

    # Return the two results (model-based, rule-based); guard against a
    # truncated generation that never emitted the closing <|im_end|> token
    model_output = assistant_response[0] if assistant_response else ""
    return model_output, markdownify_output

# Example HTML from ReaderLM-1
example_html = """
<div>This is a test.</div>
"""

# (html_input, output_format) example pairs for the ReaderLM-2 converter
examples = [
    [example_html, "Markdown"],
]