Markdown-Studio / app.py
Nymbo's picture
restarting and adding readerlm-1, and markdownify inputs
33d554a verified
raw
history blame
8.41 kB
import gradio as gr
import markdown
from markdown.extensions.tables import TableExtension
from markdown.extensions.fenced_code import FencedCodeExtension
from markdown.extensions.toc import TocExtension
from markdown.extensions.attr_list import AttrListExtension
from markdown.extensions.codehilite import CodeHiliteExtension
# For ReaderLM-2
from transformers import pipeline
# For ReaderLM-1
from transformers import AutoTokenizer, AutoModelForCausalLM
import spaces
import re
from markdownify import markdownify
######################################
# 1) MARKDOWN-STUDIO FUNCTIONALITY #
######################################
def render_markdown(md_text):
    """
    Convert Markdown source text into an HTML string.

    The conversion runs with the table, fenced-code, table-of-contents
    (configured with baselevel=2), attribute-list and code-highlighting
    extensions enabled. Highlighted code uses the "highlight" CSS class
    with line numbers turned off.
    """
    enabled_extensions = [
        TableExtension(),
        FencedCodeExtension(),
        TocExtension(baselevel=2),
        AttrListExtension(),
        CodeHiliteExtension(linenums=False, css_class="highlight"),
    ]
    rendered_html = markdown.markdown(md_text, extensions=enabled_extensions)
    return rendered_html
######################################
# 2) READERLM-2 FUNCTIONALITY #
######################################
# JinaAI ReaderLM-v2, wrapped in a text-generation pipeline. The pipeline is
# built once when the module is loaded and shared by all convert_html() calls.
model_name = "jinaai/ReaderLM-v2"
html_converter = pipeline(task="text-generation", model=model_name)
def convert_html(html_input, output_format):
    """
    Use the JinaAI ReaderLM-v2 pipeline to convert HTML into Markdown or JSON.

    Args:
        html_input: Raw HTML text to convert.
        output_format: Name of the target format; it is interpolated into the
            prompt as-is (the UI offers "Markdown" and "JSON").

    Returns:
        The generated conversion with surrounding whitespace stripped, or an
        empty string when ``html_input`` is empty or whitespace-only.
    """
    # Nothing to convert: skip the model call entirely.
    if not html_input or not html_input.strip():
        return ""

    prompt = f"Convert the following HTML into {output_format}:\n\n{html_input}"
    response = html_converter(
        prompt,
        # `max_length` also counts the prompt tokens, so a sizeable HTML input
        # left little or no room for the answer. `max_new_tokens` budgets only
        # the generated continuation.
        max_new_tokens=500,
        num_return_sequences=1,
        # Ask the pipeline for the continuation only, instead of deleting the
        # prompt afterwards with str.replace (which would also remove any
        # later occurrence of the prompt text inside the output).
        return_full_text=False,
    )
    return response[0]["generated_text"].strip()
######################################
# 3) READERLM-1 FUNCTIONALITY #
######################################
# ReaderLM-1 checkpoints selectable in the UI. Every model is loaded at import
# time, put into eval mode and moved to "cuda"; the tokenizers are loaded
# afterwards in the same order. Both dicts are keyed by checkpoint id.
_reader_lm_checkpoints = ("jinaai/reader-lm-0.5b", "jinaai/reader-lm-1.5b")
models = {
    checkpoint: (
        AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True)
        .eval()
        .to("cuda")
    )
    for checkpoint in _reader_lm_checkpoints
}
tokenizers = {
    checkpoint: AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    for checkpoint in _reader_lm_checkpoints
}
def _extract_assistant_reply(decoded_text):
    """
    Return the assistant turn from a decoded chat transcript.

    The reply is the text between ``<|im_start|>assistant`` and the next
    ``<|im_end|>``. When the end marker is missing (for example because
    generation stopped at the token limit) everything after the assistant
    marker is returned. When there is no assistant marker at all, an empty
    string is returned.
    """
    match = re.search(
        r"<\|im_start\|>assistant(.*?)(?:<\|im_end\|>|\Z)",
        decoded_text,
        re.DOTALL,
    )
    return match.group(1) if match else ""


@spaces.GPU
def run_example(html_content, model_id="jinaai/reader-lm-1.5b"):
    """
    Use ReaderLM (0.5b or 1.5b) to generate model-based HTML-to-Markdown text,
    then also provide a rule-based 'markdownify' output.

    Args:
        html_content: Raw HTML text to convert.
        model_id: Key into the module-level ``models`` / ``tokenizers`` dicts.

    Returns:
        A ``(model_output, markdownify_output)`` tuple of strings. Both are
        empty when ``html_content`` is empty or whitespace-only.

    Raises:
        KeyError: If ``model_id`` is not one of the loaded checkpoints.
    """
    # Nothing to convert: avoid a pointless GPU generation.
    if not html_content or not html_content.strip():
        return "", ""

    model = models[model_id]
    tokenizer = tokenizers[model_id]

    # Construct the chat-based input
    messages = [{"role": "user", "content": html_content}]
    input_text = tokenizer.apply_chat_template(messages, tokenize=False)

    # Tokenize
    inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    # Generate greedily. `temperature` is only used when sampling, so it is
    # not passed alongside do_sample=False.
    outputs = model.generate(
        inputs, max_new_tokens=1024, do_sample=False, repetition_penalty=1.08
    )

    # Extract the model's text from the response. Indexing the first regex
    # match directly raised IndexError whenever the output was cut off before
    # the closing <|im_end|>; the helper tolerates that case.
    assistant_response = _extract_assistant_reply(tokenizer.decode(outputs[0]))

    # Also do a rule-based markdownify for comparison
    markdownify_output = markdownify(html_content)

    # Return the two results (model-based, rule-based)
    return assistant_response, markdownify_output
# Example HTML from ReaderLM-1: a small to-do list page (header with an input
# and an "Add" button, followed by a list of items). It is offered as the
# clickable example in the ReaderLM-1 tab.
example_html = """<div id="myDIV" class="header">
<h2>My To Do List</h2>
<input type="text" id="myInput" placeholder="Title...">
<span onclick="newElement()" class="addBtn">Add</span>
</div>
<ul id="myUL">
<li>Hit the gym</li>
<li class="checked">Pay bills</li>
<li>Meet George</li>
<li>Buy eggs</li>
<li>Read a book</li>
<li>Organize office</li>
</ul>"""
########################################################
# Combine everything into a single Gradio Blocks app #
########################################################
# Optional extra CSS for the ReaderLM-1 tab, passed to gr.Blocks below. It
# gives the element with id "output" a fixed 500px height, scrollable
# overflow and a light grey border.
# NOTE(review): no component in the visible code sets elem_id="output", so
# this rule may currently match nothing — confirm.
css = """
#output {
height: 500px;
overflow: auto;
border: 1px solid #ccc;
}
"""
# One Blocks app, styled with the Nymbo/Nymbo_Theme from the original
# Markdown-Studio example. Each tool gets its own tab.
with gr.Blocks(css=css, theme="Nymbo/Nymbo_Theme") as demo:
    # ------------------------------------------------------------------
    # Tab 1 - Markdown Suite: editor on the left, rendered HTML on the right
    # ------------------------------------------------------------------
    with gr.Tab("Live Preview"):
        gr.Markdown("# Markdown Suite")
        with gr.Row():
            with gr.Column():
                md_input = gr.Textbox(
                    label="Markdown Input",
                    placeholder="Write your markdown here...",
                    lines=20,
                )
            with gr.Column():
                md_output = gr.HTML(label="Rendered Output")
        # Re-render the preview every time the editor content changes.
        md_input.change(fn=render_markdown, inputs=md_input, outputs=md_output)

    # ------------------------------------------------------------------
    # Tab 2 - ReaderLM-2 Converter (HTML -> Markdown/JSON)
    # ------------------------------------------------------------------
    with gr.Tab("ReaderLM-2 Converter"):
        gr.Markdown("## HTML to Markdown/JSON Converter (ReaderLM-v2)")
        with gr.Row():
            html_input_2 = gr.Textbox(
                label="Raw HTML Input",
                placeholder="Paste your raw HTML here...",
                lines=10,
            )
            output_format_2 = gr.Radio(
                ["Markdown", "JSON"],
                value="Markdown",
                label="Output Format",
            )
        convert_btn_2 = gr.Button("Convert")
        converted_output_2 = gr.Textbox(label="Converted Output", lines=10)

        # Short usage note shown under the output box.
        gr.Markdown(
            "Convert raw HTML into formatted Markdown or JSON using **JinaAI ReaderLM-v2**."
        )

        # The button runs convert_html on the HTML and the chosen format.
        convert_btn_2.click(
            fn=convert_html,
            inputs=[html_input_2, output_format_2],
            outputs=converted_output_2,
        )

        # Clickable sample inputs; results are computed on demand, not cached.
        reader_v2_examples = [
            ["<h1>Hello World</h1><p>This is a <strong>test</strong>.</p>", "Markdown"],
            ["<ul><li>Item 1</li><li>Item 2</li></ul>", "JSON"],
        ]
        gr.Examples(
            examples=reader_v2_examples,
            fn=convert_html,
            inputs=[html_input_2, output_format_2],
            outputs=converted_output_2,
            cache_examples=False,
        )

    # ------------------------------------------------------------------
    # Tab 3 - ReaderLM-1 HTML-to-Markdown, compared with markdownify
    # ------------------------------------------------------------------
    with gr.Tab("ReaderLM-1 Converter"):
        gr.Markdown("""
# HTML-to-Markdown with ReaderLM-1
Use either **jinaai/reader-lm-0.5b** or **jinaai/reader-lm-1.5b**
to convert HTML to Markdown. Compare against rule-based `markdownify`.
""")
        with gr.Row():
            with gr.Column():
                model_selector = gr.Dropdown(
                    label="Model",
                    choices=list(models.keys()),
                    value="jinaai/reader-lm-1.5b",
                )
                html_content = gr.Textbox(label="HTML")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                model_output_text = gr.Textbox(label="Reader LM Output")
                markdownify_output = gr.Textbox(label="Markdownify Output")

        # Sample input. Only the HTML is supplied, so run_example falls back
        # to its default model_id; the result is cached.
        reader_v1_examples = [[example_html]]
        gr.Examples(
            examples=reader_v1_examples,
            fn=run_example,
            inputs=[html_content],
            outputs=[model_output_text, markdownify_output],
            cache_examples=True,
            label="Try example HTML",
        )

        # Custom input: the button passes both the HTML and the selected model.
        submit_btn.click(
            fn=run_example,
            inputs=[html_content, model_selector],
            outputs=[model_output_text, markdownify_output],
        )
# Finally, launch the combined demo. The guard keeps the server from starting
# when this module is imported rather than executed as a script.
if __name__ == "__main__":
    demo.launch()