restarting and adding readerlm-1, and markdownify inputs
app.py
CHANGED
@@ -5,102 +5,249 @@ from markdown.extensions.fenced_code import FencedCodeExtension
Before (old app.py, lines 5-106; removed lines are marked "-", and collapsed removed lines show no content):

  5      from markdown.extensions.toc import TocExtension
  6      from markdown.extensions.attr_list import AttrListExtension
  7      from markdown.extensions.codehilite import CodeHiliteExtension
  8      from transformers import pipeline
  9
 10  -   #
 11      def render_markdown(md_text):
 12  -
 13  -
 14  -
 15          md_text,
 16          extensions=[
 17  -           TableExtension(),
 18  -           FencedCodeExtension(),
 19  -           TocExtension(baselevel=2),
 20  -           AttrListExtension(),
 21  -           CodeHiliteExtension(linenums=False, css_class="highlight"),
 22          ],
 23      )
 24  -       print("[DEBUG] Rendered HTML output:", rendered_html)  # Debug log for output
 25  -       return rendered_html
 26
 27      # Load the JinaAI ReaderLM-v2 model
 28      model_name = "jinaai/ReaderLM-v2"
 29  -
 30  -   html_converter = pipeline("text-generation", model=model_name)  # Initialize the text-generation pipeline with the specified model
 31
 32  -   # Function to convert HTML to Markdown or JSON
 33      def convert_html(html_input, output_format):
 34  -
 35  -
 36          prompt = f"Convert the following HTML into {output_format}:\n\n{html_input}"
 37  -
 38  -
 39  -       # Use the model to generate the conversion output
 40  -       response = html_converter(prompt, max_length=99999, num_return_sequences=1)
 41  -       print("[DEBUG] Model response:", response)  # Debug log for model response
 42  -       converted_output = response[0]['generated_text']  # Extract the generated text from the model response
 43
 44  -       # Remove the prompt
 45          converted_output = converted_output.replace(prompt, "").strip()
 46  -       print("[DEBUG] Converted output:", converted_output)  # Debug log for the final output
 47          return converted_output
 48
 49  -
 50  -
 51
 52  -   #
 53      with gr.Tab("Live Preview"):
 54  -       gr.Markdown("# Markdown Suite")
 55
 56          with gr.Row():
 57              with gr.Column():
 58  -               # Input textbox for entering Markdown text
 59                  md_input = gr.Textbox(
 60  -                   lines=20,
 61  -                   placeholder="Write your markdown here...",
 62                      label="Markdown Input",
 63  -                   elem_classes=["gr-textbox"]
 64                  )
 65              with gr.Column():
 66  -
 67  -
 68
 69  -
 70  -       md_input.change(render_markdown, inputs=md_input, outputs=md_output)
 71
 72  -
 73  -
 74  -
 75
 76          with gr.Row():
 77  -
 78  -
 79  -
 80  -               placeholder="Paste your raw HTML here...",
 81                  label="Raw HTML Input"
 82              )
 83
 84  -
 85  -
 86  -
 87  -               "JSON"
 88  -           ], label="Output Format", value="Markdown")
 89  -
 90  -           # Output textbox to display the converted Markdown or JSON
 91  -           converted_output = gr.Textbox(
 92  -               lines=10,
 93                  label="Converted Output"
 94              )
 95
 96  -       #
 97  -
 98  -
 99  -
100  -
101  -
102      )
103
104  -   #
105  -   print("[DEBUG] Launching the app")  # Debug log for app launch
106      demo.launch()
After (new app.py, lines 5-253; added lines are marked "+"):

  5      from markdown.extensions.toc import TocExtension
  6      from markdown.extensions.attr_list import AttrListExtension
  7      from markdown.extensions.codehilite import CodeHiliteExtension
  8  +
  9  +   # For ReaderLM-2
 10      from transformers import pipeline
 11
 12  +   # For ReaderLM-1
 13  +   from transformers import AutoTokenizer, AutoModelForCausalLM
 14  +   import spaces
 15  +   import re
 16  +   from markdownify import markdownify
 17  +
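Beyond the packages the Markdown suite already needed (gradio, markdown, and Pygments for CodeHilite), the new imports imply that the Space now also depends on transformers, torch, markdownify, and the Hugging Face spaces package. The commit does not show requirements.txt, so treat that list as an inference from the imports rather than the repository's pinned dependencies.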
 18  +   ######################################
 19  +   # 1) MARKDOWN-STUDIO FUNCTIONALITY #
 20  +   ######################################
 21      def render_markdown(md_text):
 22  +       """
 23  +       Render a string of Markdown text into HTML with a number of useful extensions.
 24  +       """
 25  +       return markdown.markdown(
 26              md_text,
 27              extensions=[
 28  +               TableExtension(),
 29  +               FencedCodeExtension(),
 30  +               TocExtension(baselevel=2),
 31  +               AttrListExtension(),
 32  +               CodeHiliteExtension(linenums=False, css_class="highlight"),
 33              ],
 34          )
 35
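As a quick check of what this extension stack produces, here is a small usage sketch (not part of the committed file) that renders a heading and a pipe table through render_markdown:

    # Usage sketch only; assumes app.py's imports and render_markdown() are in scope,
    # and that Pygments is installed for CodeHiliteExtension.
    sample = "# Demo\n\n| Name | Value |\n|------|-------|\n| pi   | 3.14  |\n"
    html = render_markdown(sample)
    # TocExtension(baselevel=2) demotes the heading to <h2>, and TableExtension
    # turns the pipe table into a <table> element.
    print(html)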
 36  +   ######################################
 37  +   # 2) READERLM-2 FUNCTIONALITY #
 38  +   ######################################
 39      # Load the JinaAI ReaderLM-v2 model
 40      model_name = "jinaai/ReaderLM-v2"
 41  +   html_converter = pipeline("text-generation", model=model_name)
 42
 43      def convert_html(html_input, output_format):
 44  +       """
 45  +       Use the JinaAI ReaderLM-v2 pipeline to convert HTML into Markdown or JSON.
 46  +       """
 47          prompt = f"Convert the following HTML into {output_format}:\n\n{html_input}"
 48  +       response = html_converter(prompt, max_length=500, num_return_sequences=1)
 49  +       converted_output = response[0]['generated_text']
 50
 51  +       # Remove the prompt from the start of the generated text, if present
 52          converted_output = converted_output.replace(prompt, "").strip()
 53          return converted_output
 54
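One behavioural note on the change from max_length=99999 to max_length=500: in a transformers text-generation pipeline, max_length counts the prompt tokens as well as the generated ones, so a long HTML input can leave little or no room for output (max_new_tokens bounds only the generated portion). A short usage sketch, assuming the pipeline above has loaded:

    # Usage sketch only; assumes html_converter and convert_html() from app.py.
    snippet = "<h1>Hello World</h1><p>This is a <strong>test</strong>.</p>"
    print(convert_html(snippet, "Markdown"))
    # For large pages, a max_new_tokens cap would be safer than max_length=500.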
 55  +   ######################################
 56  +   # 3) READERLM-1 FUNCTIONALITY #
 57  +   ######################################
 58  +   # Prepare models and tokenizers
 59  +   models = {
 60  +       "jinaai/reader-lm-0.5b": AutoModelForCausalLM.from_pretrained(
 61  +           "jinaai/reader-lm-0.5b", trust_remote_code=True
 62  +       ).eval().to("cuda"),
 63  +       "jinaai/reader-lm-1.5b": AutoModelForCausalLM.from_pretrained(
 64  +           "jinaai/reader-lm-1.5b", trust_remote_code=True
 65  +       ).eval().to("cuda"),
 66  +   }
 67  +   tokenizers = {
 68  +       "jinaai/reader-lm-0.5b": AutoTokenizer.from_pretrained(
 69  +           "jinaai/reader-lm-0.5b", trust_remote_code=True
 70  +       ),
 71  +       "jinaai/reader-lm-1.5b": AutoTokenizer.from_pretrained(
 72  +           "jinaai/reader-lm-1.5b", trust_remote_code=True
 73  +       ),
 74  +   }
 75  +
 76  +   @spaces.GPU
 77  +   def run_example(html_content, model_id="jinaai/reader-lm-1.5b"):
 78  +       """
 79  +       Use ReaderLM (0.5b or 1.5b) to generate model-based HTML-to-Markdown text,
 80  +       then also provide a rule-based 'markdownify' output.
 81  +       """
 82  +       model = models[model_id]
 83  +       tokenizer = tokenizers[model_id]
 84  +
 85  +       # Construct the chat-based input
 86  +       messages = [{"role": "user", "content": html_content}]
 87  +       input_text = tokenizer.apply_chat_template(messages, tokenize=False)
 88  +
 89  +       # Tokenize
 90  +       inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")
 91  +
 92  +       # Generate
 93  +       outputs = model.generate(
 94  +           inputs, max_new_tokens=1024, temperature=0, do_sample=False, repetition_penalty=1.08
 95  +       )
 96  +
 97  +       # Extract the model's text from the response
 98  +       pattern = r"<\|im_start\|>assistant(.*?)<\|im_end\|>"
 99  +       assistant_response = re.findall(pattern, tokenizer.decode(outputs[0]), re.DOTALL)
100
101  +       # Also do a rule-based markdownify for comparison
102  +       markdownify_output = markdownify(html_content)
103  +
104  +       # Return the two results (model-based, rule-based)
105  +       return assistant_response[0], markdownify_output
106  +
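The assistant text is recovered from the decoded ChatML-style transcript with the regex on line 98. A self-contained sketch of just that extraction step, run against a made-up decoded string rather than real model output:

    import re

    # Hypothetical decoded transcript in the ChatML-style format ReaderLM emits.
    decoded = (
        "<|im_start|>user\n<h2>My To Do List</h2><|im_end|>\n"
        "<|im_start|>assistant\n## My To Do List\n<|im_end|>"
    )
    pattern = r"<\|im_start\|>assistant(.*?)<\|im_end\|>"
    matches = re.findall(pattern, decoded, re.DOTALL)
    print(matches[0].strip())  # -> ## My To Do List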
107  +   # Example HTML from ReaderLM-1
108  +   example_html = """<div id="myDIV" class="header">
109  +   <h2>My To Do List</h2>
110  +   <input type="text" id="myInput" placeholder="Title...">
111  +   <span onclick="newElement()" class="addBtn">Add</span>
112  +   </div>
113  +
114  +   <ul id="myUL">
115  +   <li>Hit the gym</li>
116  +   <li class="checked">Pay bills</li>
117  +   <li>Meet George</li>
118  +   <li>Buy eggs</li>
119  +   <li>Read a book</li>
120  +   <li>Organize office</li>
121  +   </ul>"""
122  +
123  +   ########################################################
124  +   # Combine everything into a single Gradio Blocks app #
125  +   ########################################################
126  +
127  +   # Optional extra CSS for the ReaderLM-1 tab
128  +   css = """
129  +   #output {
130  +   height: 500px;
131  +   overflow: auto;
132  +   border: 1px solid #ccc;
133  +   }
134  +   """
135  +
136  +   # We use the Nymbo/Nymbo_Theme from the original Markdown-Studio example
+
with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
|
138 |
+
|
139 |
+
########################################################
|
140 |
+
# TAB 1: Markdown Suite (live preview)
|
141 |
+
########################################################
|
142 |
with gr.Tab("Live Preview"):
|
143 |
+
gr.Markdown("# Markdown Suite")
|
144 |
|
145 |
with gr.Row():
|
146 |
with gr.Column():
|
|
|
147 |
md_input = gr.Textbox(
|
148 |
+
lines=20,
|
149 |
+
placeholder="Write your markdown here...",
|
150 |
label="Markdown Input",
|
|
|
151 |
)
|
152 |
with gr.Column():
|
153 |
+
md_output = gr.HTML(
|
154 |
+
label="Rendered Output"
|
155 |
+
)
|
156 |
|
157 |
+
md_input.change(fn=render_markdown, inputs=md_input, outputs=md_output)
|
|
|
158 |
|
159 |
+
########################################################
|
160 |
+
# TAB 2: ReaderLM-2 Converter (HTML → Markdown/JSON)
|
161 |
+
########################################################
|
162 |
+
with gr.Tab("ReaderLM-2 Converter"):
|
163 |
+
gr.Markdown("## HTML to Markdown/JSON Converter (ReaderLM-v2)")
|
164 |
|
165 |
with gr.Row():
|
166 |
+
html_input_2 = gr.Textbox(
|
167 |
+
lines=10,
|
168 |
+
placeholder="Paste your raw HTML here...",
|
|
|
169 |
label="Raw HTML Input"
|
170 |
)
|
171 |
+
output_format_2 = gr.Radio(
|
172 |
+
["Markdown", "JSON"],
|
173 |
+
label="Output Format",
|
174 |
+
value="Markdown"
|
175 |
+
)
|
176 |
|
177 |
+
convert_btn_2 = gr.Button("Convert")
|
178 |
+
converted_output_2 = gr.Textbox(
|
179 |
+
lines=10,
|
|
|
|
|
|
|
|
|
|
|
|
|
180 |
label="Converted Output"
|
181 |
)
|
182 |
|
183 |
+
# Provide usage details
|
184 |
+
gr.Markdown(
|
185 |
+
"Convert raw HTML into formatted Markdown or JSON using **JinaAI ReaderLM-v2**."
|
186 |
+
)
|
187 |
+
|
188 |
+
# Button event: calls convert_html
|
189 |
+
convert_btn_2.click(
|
190 |
+
fn=convert_html,
|
191 |
+
inputs=[html_input_2, output_format_2],
|
192 |
+
outputs=converted_output_2
|
193 |
+
)
|
194 |
+
|
195 |
+
# Examples
|
196 |
+
gr.Examples(
|
197 |
+
examples=[
|
198 |
+
["<h1>Hello World</h1><p>This is a <strong>test</strong>.</p>", "Markdown"],
|
199 |
+
["<ul><li>Item 1</li><li>Item 2</li></ul>", "JSON"]
|
200 |
+
],
|
201 |
+
inputs=[html_input_2, output_format_2],
|
202 |
+
outputs=converted_output_2,
|
203 |
+
fn=convert_html,
|
204 |
+
cache_examples=False
|
205 |
+
)
|
206 |
+
|
207 |
+
########################################################
|
208 |
+
# TAB 3: ReaderLM-1 HTML-to-Markdown
|
209 |
+
########################################################
|
210 |
+
with gr.Tab("ReaderLM-1 Converter"):
|
211 |
+
gr.Markdown("""
|
212 |
+
# HTML-to-Markdown with ReaderLM-1
|
213 |
+
Use either **jinaai/reader-lm-0.5b** or **jinaai/reader-lm-1.5b**
|
214 |
+
to convert HTML to Markdown. Compare against rule-based `markdownify`.
|
215 |
+
""")
|
216 |
+
|
217 |
+
with gr.Row():
|
218 |
+
with gr.Column():
|
219 |
+
model_selector = gr.Dropdown(
|
220 |
+
choices=list(models.keys()),
|
221 |
+
label="Model",
|
222 |
+
value="jinaai/reader-lm-1.5b"
|
223 |
+
)
|
224 |
+
html_content = gr.Textbox(
|
225 |
+
label="HTML"
|
226 |
+
)
|
227 |
+
submit_btn = gr.Button(value="Submit")
|
228 |
+
|
229 |
+
with gr.Column():
|
230 |
+
model_output_text = gr.Textbox(label="Reader LM Output")
|
231 |
+
markdownify_output = gr.Textbox(label="Markdownify Output")
|
232 |
+
|
233 |
+
# Example usage
|
234 |
+
gr.Examples(
|
235 |
+
examples=[
|
236 |
+
[example_html],
|
237 |
+
],
|
238 |
+
inputs=[html_content],
|
239 |
+
outputs=[model_output_text, markdownify_output],
|
240 |
+
fn=run_example,
|
241 |
+
cache_examples=True,
|
242 |
+
label="Try example HTML"
|
243 |
+
)
|
244 |
+
|
245 |
+
# Button event for custom input
|
246 |
+
submit_btn.click(
|
247 |
+
fn=run_example,
|
248 |
+
inputs=[html_content, model_selector],
|
249 |
+
outputs=[model_output_text, markdownify_output]
|
250 |
)
|
251 |
|
252 |
+
# Finally, launch the combined demo
|
|
|
253 |
demo.launch()
|
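The ReaderLM-1 models are moved to "cuda" unconditionally, which matches the @spaces.GPU setup on the Space but will fail on a machine without a GPU. A device-selection variant for local runs might look like the sketch below; this is an assumption-labeled alternative, not part of the committed file:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Sketch of a CPU fallback for local runs; app.py as committed always uses "cuda".
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = AutoModelForCausalLM.from_pretrained(
        "jinaai/reader-lm-1.5b", trust_remote_code=True
    ).eval().to(device)
    tokenizer = AutoTokenizer.from_pretrained("jinaai/reader-lm-1.5b", trust_remote_code=True)
    # Inside run_example(), inputs would then need .to(device) instead of .to("cuda").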