import gradio as gr
import markdown
from markdown.extensions.tables import TableExtension
from markdown.extensions.fenced_code import FencedCodeExtension
from markdown.extensions.toc import TocExtension
from markdown.extensions.attr_list import AttrListExtension
from markdown.extensions.codehilite import CodeHiliteExtension

# For ReaderLM-2
from transformers import pipeline

# For ReaderLM-1
from transformers import AutoTokenizer, AutoModelForCausalLM

import spaces
import re
from markdownify import markdownify
######################################
# 1) MARKDOWN-STUDIO FUNCTIONALITY   #
######################################

def render_markdown(md_text):
    """
    Render a string of Markdown text into HTML with a number of useful extensions.
    """
    return markdown.markdown(
        md_text,
        extensions=[
            TableExtension(),
            FencedCodeExtension(),
            TocExtension(baselevel=2),
            AttrListExtension(),
            CodeHiliteExtension(linenums=False, css_class="highlight"),
        ],
    )
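
# Example (illustrative): render_markdown("# Title\n\nSome *text*") returns HTML
# such as '<h2 id="title">Title</h2>\n<p>Some <em>text</em></p>', because
# TocExtension(baselevel=2) demotes the top-level heading to <h2> and adds ids.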
######################################
# 2) READERLM-2 FUNCTIONALITY        #
######################################

# Load the JinaAI ReaderLM-v2 model
model_name = "jinaai/ReaderLM-v2"
html_converter = pipeline("text-generation", model=model_name)
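# Note: with no device argument the pipeline runs on CPU; to use a GPU you could
# pass device=0 (or device_map="auto" when accelerate is installed) to pipeline().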
def convert_html(html_input, output_format): | |
""" | |
Use the JinaAI ReaderLM-v2 pipeline to convert HTML into Markdown or JSON. | |
""" | |
prompt = f"Convert the following HTML into {output_format}:\n\n{html_input}" | |
response = html_converter(prompt, max_length=500, num_return_sequences=1) | |
converted_output = response[0]['generated_text'] | |
# Remove the prompt from the start of the generated text, if present | |
converted_output = converted_output.replace(prompt, "").strip() | |
return converted_output | |
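
# Example (illustrative):
#   convert_html("<h1>Hello</h1><p>World</p>", "Markdown")
#   -> roughly "# Hello\n\nWorld", depending on what the model generates.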
######################################
# 3) READERLM-1 FUNCTIONALITY        #
######################################

# Prepare models and tokenizers
models = {
    "jinaai/reader-lm-0.5b": AutoModelForCausalLM.from_pretrained(
        "jinaai/reader-lm-0.5b", trust_remote_code=True
    ).eval().to("cuda"),
    "jinaai/reader-lm-1.5b": AutoModelForCausalLM.from_pretrained(
        "jinaai/reader-lm-1.5b", trust_remote_code=True
    ).eval().to("cuda"),
}

tokenizers = {
    "jinaai/reader-lm-0.5b": AutoTokenizer.from_pretrained(
        "jinaai/reader-lm-0.5b", trust_remote_code=True
    ),
    "jinaai/reader-lm-1.5b": AutoTokenizer.from_pretrained(
        "jinaai/reader-lm-1.5b", trust_remote_code=True
    ),
}
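# Both checkpoints are loaded to the GPU up front so requests can switch models
# without reloading; the trade-off is that both occupy GPU memory at the same time.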
# On ZeroGPU Spaces, @spaces.GPU requests a GPU for the duration of this call
# (this is what the `spaces` import is for).
@spaces.GPU
def run_example(html_content, model_id="jinaai/reader-lm-1.5b"):
    """
    Use ReaderLM (0.5b or 1.5b) to generate model-based HTML-to-Markdown text,
    then also provide a rule-based 'markdownify' output.
    """
    model = models[model_id]
    tokenizer = tokenizers[model_id]

    # Construct the chat-based input; add_generation_prompt=True opens the
    # assistant turn so the model writes its Markdown there
    messages = [{"role": "user", "content": html_content}]
    input_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Tokenize
    inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    # Generate (greedy decoding; temperature is unused when do_sample=False)
    outputs = model.generate(
        inputs, max_new_tokens=1024, temperature=0, do_sample=False, repetition_penalty=1.08
    )

    # Extract the model's text from the response
    pattern = r"<\|im_start\|>assistant(.*?)<\|im_end\|>"
    assistant_response = re.findall(pattern, tokenizer.decode(outputs[0]), re.DOTALL)

    # Also do a rule-based markdownify for comparison
    markdownify_output = markdownify(html_content)

    # Return the two results (model-based, rule-based)
    return assistant_response[0], markdownify_output
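
# Example (illustrative):
#   model_md, rule_md = run_example("<h1>Hi</h1>", "jinaai/reader-lm-0.5b")
#   `model_md` is ReaderLM's Markdown; `rule_md` is markdownify's rule-based version.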
# Example HTML from ReaderLM-1
example_html = """<div id="myDIV" class="header">
  <h2>My To Do List</h2>
  <input type="text" id="myInput" placeholder="Title...">
  <span onclick="newElement()" class="addBtn">Add</span>
</div>
<ul id="myUL">
  <li>Hit the gym</li>
  <li class="checked">Pay bills</li>
  <li>Meet George</li>
  <li>Buy eggs</li>
  <li>Read a book</li>
  <li>Organize office</li>
</ul>"""
########################################################
#  Combine everything into a single Gradio Blocks app  #
########################################################

# Optional extra CSS for the ReaderLM-1 tab
css = """
#output {
    height: 500px;
    overflow: auto;
    border: 1px solid #ccc;
}
"""
# We use the Nymbo/Nymbo_Theme from the original Markdown-Studio example
with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:

    ########################################################
    # TAB 1: Markdown Suite (live preview)
    ########################################################
    with gr.Tab("Live Preview"):
        gr.Markdown("# Markdown Suite")

        with gr.Row():
            with gr.Column():
                md_input = gr.Textbox(
                    lines=20,
                    placeholder="Write your markdown here...",
                    label="Markdown Input",
                )
            with gr.Column():
                md_output = gr.HTML(
                    label="Rendered Output"
                )

        md_input.change(fn=render_markdown, inputs=md_input, outputs=md_output)
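        # The .change event fires on every edit, so the HTML preview updates
        # live as the user types.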
    ########################################################
    # TAB 2: ReaderLM-2 Converter (HTML → Markdown/JSON)
    ########################################################
    with gr.Tab("ReaderLM-2 Converter"):
        gr.Markdown("## HTML to Markdown/JSON Converter (ReaderLM-v2)")

        with gr.Row():
            html_input_2 = gr.Textbox(
                lines=10,
                placeholder="Paste your raw HTML here...",
                label="Raw HTML Input"
            )
            output_format_2 = gr.Radio(
                ["Markdown", "JSON"],
                label="Output Format",
                value="Markdown"
            )
            convert_btn_2 = gr.Button("Convert")
            converted_output_2 = gr.Textbox(
                lines=10,
                label="Converted Output"
            )

        # Provide usage details
        gr.Markdown(
            "Convert raw HTML into formatted Markdown or JSON using **JinaAI ReaderLM-v2**."
        )

        # Button event: calls convert_html
        convert_btn_2.click(
            fn=convert_html,
            inputs=[html_input_2, output_format_2],
            outputs=converted_output_2
        )

        # Examples
        gr.Examples(
            examples=[
                ["<h1>Hello World</h1><p>This is a <strong>test</strong>.</p>", "Markdown"],
                ["<ul><li>Item 1</li><li>Item 2</li></ul>", "JSON"]
            ],
            inputs=[html_input_2, output_format_2],
            outputs=converted_output_2,
            fn=convert_html,
            cache_examples=False
        )
    ########################################################
    # TAB 3: ReaderLM-1 HTML-to-Markdown
    ########################################################
    with gr.Tab("ReaderLM-1 Converter"):
        gr.Markdown("""
# HTML-to-Markdown with ReaderLM-1
Use either **jinaai/reader-lm-0.5b** or **jinaai/reader-lm-1.5b**
to convert HTML to Markdown. Compare against rule-based `markdownify`.
""")

        with gr.Row():
            with gr.Column():
                model_selector = gr.Dropdown(
                    choices=list(models.keys()),
                    label="Model",
                    value="jinaai/reader-lm-1.5b"
                )
                html_content = gr.Textbox(
                    label="HTML"
                )
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                model_output_text = gr.Textbox(label="Reader LM Output")
                markdownify_output = gr.Textbox(label="Markdownify Output")

        # Example usage
        gr.Examples(
            examples=[
                [example_html],
            ],
            inputs=[html_content],
            outputs=[model_output_text, markdownify_output],
            fn=run_example,
            cache_examples=True,
            label="Try example HTML"
        )

        # Button event for custom input
        submit_btn.click(
            fn=run_example,
            inputs=[html_content, model_selector],
            outputs=[model_output_text, markdownify_output]
        )
# Finally, launch the combined demo
demo.launch()