# Hugging Face Space file view -- Space status: Paused. File size: 8,409 bytes.
# Blame commits shown in the page gutter: 3cf27bd, 33d554a, 571f7e3, ca58a74
# (the page's line-number gutter listed lines 1-253 of the original file).
import gradio as gr
import markdown
from markdown.extensions.tables import TableExtension
from markdown.extensions.fenced_code import FencedCodeExtension
from markdown.extensions.toc import TocExtension
from markdown.extensions.attr_list import AttrListExtension
from markdown.extensions.codehilite import CodeHiliteExtension
# For ReaderLM-2
from transformers import pipeline
# For ReaderLM-1
from transformers import AutoTokenizer, AutoModelForCausalLM
import spaces
import re
from markdownify import markdownify
######################################
# 1) MARKDOWN-STUDIO FUNCTIONALITY #
######################################
def render_markdown(md_text):
    """
    Convert a Markdown string to HTML.

    The conversion enables these python-markdown extensions: tables,
    fenced code blocks, table of contents (with ``baselevel=2``),
    attribute lists, and CodeHilite (no line numbers, CSS class
    ``"highlight"``).

    Args:
        md_text: Markdown source text.

    Returns:
        The HTML produced by ``markdown.markdown`` for ``md_text``.
    """
    enabled_extensions = [
        TableExtension(),
        FencedCodeExtension(),
        TocExtension(baselevel=2),
        AttrListExtension(),
        CodeHiliteExtension(linenums=False, css_class="highlight"),
    ]
    rendered_html = markdown.markdown(md_text, extensions=enabled_extensions)
    return rendered_html
######################################
# 2) READERLM-2 FUNCTIONALITY #
######################################
# ReaderLM-v2 is wrapped in a text-generation pipeline that is built once,
# at import time, and then reused by every convert_html() call.
model_name = "jinaai/ReaderLM-v2"
html_converter = pipeline(task="text-generation", model=model_name)
def convert_html(html_input, output_format, max_new_tokens=500):
    """
    Use the JinaAI ReaderLM-v2 pipeline to convert HTML into Markdown or JSON.

    Args:
        html_input: Raw HTML text to convert.
        output_format: Name of the target format inserted into the prompt
            (the UI offers "Markdown" and "JSON").
        max_new_tokens: Upper bound on the number of tokens generated *after*
            the prompt.

    Returns:
        The generated text with the echoed prompt removed and surrounding
        whitespace stripped. Returns an empty string when ``html_input`` is
        empty, whitespace-only, or ``None``.
    """
    # Nothing to convert: skip the model call instead of letting it
    # free-run on a prompt that has no HTML in it.
    if not html_input or not html_input.strip():
        return ""

    prompt = f"Convert the following HTML into {output_format}:\n\n{html_input}"

    # Budget the generated tokens only. A total-length cap (max_length) also
    # counts the prompt, so any HTML longer than the cap would leave no room
    # for output.
    response = html_converter(
        prompt,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
    )
    generated_text = response[0]["generated_text"]

    # The pipeline echoes the prompt in front of the completion. Drop only
    # that leading copy so identical text inside the completion survives.
    if generated_text.startswith(prompt):
        generated_text = generated_text[len(prompt):]
    return generated_text.strip()
######################################
# 3) READERLM-1 FUNCTIONALITY #
######################################
# ReaderLM v1 checkpoints offered in the UI, in dropdown order.
_READER_LM_V1_CHECKPOINTS = ("jinaai/reader-lm-0.5b", "jinaai/reader-lm-1.5b")

# Checkpoint id -> causal LM, put in eval mode and moved to CUDA at import
# time. trust_remote_code=True allows code shipped in the checkpoint repo to
# run during loading.
models = {
    checkpoint: AutoModelForCausalLM.from_pretrained(
        checkpoint, trust_remote_code=True
    ).eval().to("cuda")
    for checkpoint in _READER_LM_V1_CHECKPOINTS
}

# Checkpoint id -> matching tokenizer (same keys as `models`).
tokenizers = {
    checkpoint: AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    for checkpoint in _READER_LM_V1_CHECKPOINTS
}
@spaces.GPU
def run_example(html_content, model_id="jinaai/reader-lm-1.5b"):
    """
    Use ReaderLM (0.5b or 1.5b) to generate model-based HTML-to-Markdown text,
    then also provide a rule-based 'markdownify' output.

    Args:
        html_content: HTML text to convert; sent to the model as a single
            user chat message.
        model_id: Key into the module-level ``models`` / ``tokenizers`` dicts.

    Returns:
        A ``(model_output, markdownify_output)`` tuple of strings.

    Raises:
        KeyError: If ``model_id`` is not a key of ``models``.
    """
    model = models[model_id]
    tokenizer = tokenizers[model_id]

    # Construct the chat-based input and tokenize it
    messages = [{"role": "user", "content": html_content}]
    input_text = tokenizer.apply_chat_template(messages, tokenize=False)
    inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    # Greedy decoding. `temperature` is not passed: it only affects sampling,
    # and supplying it together with do_sample=False just triggers a
    # generation-config warning without changing the output.
    outputs = model.generate(
        inputs, max_new_tokens=1024, do_sample=False, repetition_penalty=1.08
    )
    decoded = tokenizer.decode(outputs[0])

    # Take the assistant turn. The turn normally ends with <|im_end|>, but
    # when generation stops at the max_new_tokens budget that marker is
    # missing, so end-of-text is accepted as a terminator too.
    assistant_turn = re.search(
        r"<\|im_start\|>assistant(.*?)(?:<\|im_end\|>|\Z)", decoded, re.DOTALL
    )
    if assistant_turn is not None:
        model_output = assistant_turn.group(1)
    else:
        # No assistant marker at all: return whatever was generated after
        # the prompt rather than failing.
        prompt_length = inputs.shape[-1]
        model_output = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        )

    # Also do a rule-based markdownify for comparison
    markdownify_output = markdownify(html_content)

    # Return the two results (model-based, rule-based)
    return model_output, markdownify_output
# Example HTML from ReaderLM-1: a small to-do list page (a header <div> with a
# text input and an "Add" button, followed by a <ul> of items). It is the
# single example row offered in the ReaderLM-1 tab.
example_html = """<div id="myDIV" class="header">
<h2>My To Do List</h2>
<input type="text" id="myInput" placeholder="Title...">
<span onclick="newElement()" class="addBtn">Add</span>
</div>
<ul id="myUL">
<li>Hit the gym</li>
<li class="checked">Pay bills</li>
<li>Meet George</li>
<li>Buy eggs</li>
<li>Read a book</li>
<li>Organize office</li>
</ul>"""
########################################################
# Combine everything into a single Gradio Blocks app #
########################################################
# Optional extra CSS for the ReaderLM-1 tab, passed to gr.Blocks(css=...).
# It gives the element with id "output" a fixed 500px height, scrolling on
# overflow, and a thin grey border.
# NOTE(review): no component visible in this file sets elem_id="output", so
# this rule may currently match nothing -- confirm before relying on it.
css = """
#output {
height: 500px;
overflow: auto;
border: 1px solid #ccc;
}
"""
# Build the three-tab Gradio UI and start the server.
# We use the Nymbo/Nymbo_Theme from the original Markdown-Studio example
with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
    ########################################################
    # TAB 1: Markdown Suite (live preview)
    ########################################################
    with gr.Tab("Live Preview"):
        gr.Markdown("# Markdown Suite")
        with gr.Row():
            with gr.Column():
                md_input = gr.Textbox(
                    lines=20,
                    placeholder="Write your markdown here...",
                    label="Markdown Input",
                )
            with gr.Column():
                md_output = gr.HTML(
                    label="Rendered Output"
                )
        # Re-render the preview on every change of the input box
        md_input.change(fn=render_markdown, inputs=md_input, outputs=md_output)

    ########################################################
    # TAB 2: ReaderLM-2 Converter (HTML → Markdown/JSON)
    ########################################################
    with gr.Tab("ReaderLM-2 Converter"):
        gr.Markdown("## HTML to Markdown/JSON Converter (ReaderLM-v2)")
        with gr.Row():
            html_input_2 = gr.Textbox(
                lines=10,
                placeholder="Paste your raw HTML here...",
                label="Raw HTML Input"
            )
            output_format_2 = gr.Radio(
                ["Markdown", "JSON"],
                label="Output Format",
                value="Markdown"
            )
        convert_btn_2 = gr.Button("Convert")
        converted_output_2 = gr.Textbox(
            lines=10,
            label="Converted Output"
        )
        # Provide usage details
        gr.Markdown(
            "Convert raw HTML into formatted Markdown or JSON using **JinaAI ReaderLM-v2**."
        )
        # Button event: calls convert_html
        convert_btn_2.click(
            fn=convert_html,
            inputs=[html_input_2, output_format_2],
            outputs=converted_output_2
        )
        # Examples (not cached, so the model only runs when one is submitted)
        gr.Examples(
            examples=[
                ["<h1>Hello World</h1><p>This is a <strong>test</strong>.</p>", "Markdown"],
                ["<ul><li>Item 1</li><li>Item 2</li></ul>", "JSON"]
            ],
            inputs=[html_input_2, output_format_2],
            outputs=converted_output_2,
            fn=convert_html,
            cache_examples=False
        )

    ########################################################
    # TAB 3: ReaderLM-1 HTML-to-Markdown
    ########################################################
    with gr.Tab("ReaderLM-1 Converter"):
        gr.Markdown("""
# HTML-to-Markdown with ReaderLM-1
Use either **jinaai/reader-lm-0.5b** or **jinaai/reader-lm-1.5b**
to convert HTML to Markdown. Compare against rule-based `markdownify`.
""")
        with gr.Row():
            with gr.Column():
                model_selector = gr.Dropdown(
                    choices=list(models.keys()),
                    label="Model",
                    value="jinaai/reader-lm-1.5b"
                )
                html_content = gr.Textbox(
                    label="HTML"
                )
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                model_output_text = gr.Textbox(label="Reader LM Output")
                markdownify_output = gr.Textbox(label="Markdownify Output")
        # Example usage. Only the HTML box is wired as an input here, so
        # run_example falls back to its default model_id for this example.
        gr.Examples(
            examples=[
                [example_html],
            ],
            inputs=[html_content],
            outputs=[model_output_text, markdownify_output],
            fn=run_example,
            cache_examples=True,
            label="Try example HTML"
        )
        # Button event for custom input
        submit_btn.click(
            fn=run_example,
            inputs=[html_content, model_selector],
            outputs=[model_output_text, markdownify_output]
        )

# Finally, launch the combined demo
demo.launch()