Spaces:
Paused
Paused
debug logs and adding system prompt override textbox for ReaderLM-2
Browse files
app.py
CHANGED
@@ -6,10 +6,10 @@ from markdown.extensions.toc import TocExtension
|
|
6 |
from markdown.extensions.attr_list import AttrListExtension
|
7 |
from markdown.extensions.codehilite import CodeHiliteExtension
|
8 |
|
9 |
-
# For ReaderLM-2
|
10 |
from transformers import pipeline
|
11 |
|
12 |
-
# For ReaderLM-1
|
13 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
14 |
import spaces
|
15 |
import re
|
@@ -20,16 +20,18 @@ from markdownify import markdownify
|
|
20 |
######################################
|
21 |
def render_markdown(md_text):
|
22 |
"""
|
23 |
-
Render a string of Markdown text into HTML
|
|
|
24 |
"""
|
|
|
25 |
return markdown.markdown(
|
26 |
md_text,
|
27 |
extensions=[
|
28 |
-
TableExtension(),
|
29 |
-
FencedCodeExtension(),
|
30 |
-
TocExtension(baselevel=2),
|
31 |
-
AttrListExtension(),
|
32 |
-
CodeHiliteExtension(linenums=False, css_class="highlight"),
|
33 |
],
|
34 |
)
|
35 |
|
@@ -38,34 +40,44 @@ def render_markdown(md_text):
|
|
38 |
######################################
|
39 |
# Load the JinaAI ReaderLM-v2 model
|
40 |
model_name = "jinaai/ReaderLM-v2"
|
|
|
41 |
html_converter = pipeline("text-generation", model=model_name)
|
42 |
|
43 |
-
def convert_html(html_input, output_format):
|
44 |
"""
|
45 |
Use the JinaAI ReaderLM-v2 pipeline to convert HTML into Markdown or JSON.
|
|
|
|
|
46 |
"""
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
|
|
|
|
|
|
|
|
|
|
51 |
response = html_converter(prompt, max_length=9999, num_return_sequences=1)
|
52 |
converted_output = response[0]['generated_text']
|
53 |
-
|
54 |
-
# Remove the prompt from the
|
55 |
converted_output = converted_output.replace(prompt, "").strip()
|
|
|
56 |
return converted_output
|
57 |
|
58 |
######################################
|
59 |
# 3) READERLM-1 FUNCTIONALITY #
|
60 |
######################################
|
61 |
-
# Prepare models and tokenizers
|
|
|
62 |
models = {
|
63 |
"jinaai/reader-lm-0.5b": AutoModelForCausalLM.from_pretrained(
|
64 |
"jinaai/reader-lm-0.5b", trust_remote_code=True
|
65 |
-
).eval().to("cuda"),
|
66 |
"jinaai/reader-lm-1.5b": AutoModelForCausalLM.from_pretrained(
|
67 |
"jinaai/reader-lm-1.5b", trust_remote_code=True
|
68 |
-
).eval().to("cuda"),
|
69 |
}
|
70 |
tokenizers = {
|
71 |
"jinaai/reader-lm-0.5b": AutoTokenizer.from_pretrained(
|
@@ -79,35 +91,39 @@ tokenizers = {
|
|
79 |
@spaces.GPU
|
80 |
def run_example(html_content, model_id="jinaai/reader-lm-1.5b"):
|
81 |
"""
|
82 |
-
|
83 |
-
|
84 |
"""
|
85 |
-
model
|
86 |
-
|
|
|
87 |
|
88 |
-
# Construct the chat-based input
|
89 |
messages = [{"role": "user", "content": html_content}]
|
90 |
-
input_text = tokenizer.apply_chat_template(messages, tokenize=False)
|
|
|
91 |
|
92 |
-
# Tokenize
|
93 |
inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")
|
94 |
-
|
95 |
-
# Generate
|
96 |
outputs = model.generate(
|
97 |
inputs, max_new_tokens=1024, temperature=0, do_sample=False, repetition_penalty=1.08
|
98 |
)
|
99 |
|
100 |
-
# Extract the
|
101 |
pattern = r"<\|im_start\|>assistant(.*?)<\|im_end\|>"
|
102 |
assistant_response = re.findall(pattern, tokenizer.decode(outputs[0]), re.DOTALL)
|
|
|
103 |
|
104 |
-
#
|
105 |
markdownify_output = markdownify(html_content)
|
|
|
106 |
|
107 |
-
# Return
|
108 |
return assistant_response[0], markdownify_output
|
109 |
|
110 |
-
# Example HTML
|
111 |
example_html = """<div id="myDIV" class="header">
|
112 |
<h2>My To Do List</h2>
|
113 |
<input type="text" id="myInput" placeholder="Title...">
|
@@ -127,84 +143,91 @@ example_html = """<div id="myDIV" class="header">
|
|
127 |
# Combine everything into a single Gradio Blocks app #
|
128 |
########################################################
|
129 |
|
130 |
-
# Optional extra CSS for the ReaderLM-1 tab
|
131 |
css = """
|
132 |
#output {
|
133 |
-
height: 500px;
|
134 |
-
overflow: auto;
|
135 |
-
border: 1px solid #ccc;
|
136 |
}
|
137 |
"""
|
138 |
|
139 |
-
#
|
|
|
140 |
with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
|
141 |
|
142 |
########################################################
|
143 |
# TAB 1: Markdown Suite (live preview)
|
144 |
########################################################
|
145 |
with gr.Tab("Live Preview"):
|
146 |
-
gr.Markdown("# Markdown Suite")
|
147 |
|
148 |
with gr.Row():
|
149 |
with gr.Column():
|
150 |
md_input = gr.Textbox(
|
151 |
lines=20,
|
152 |
placeholder="Write your markdown here...",
|
153 |
-
label="Markdown Input",
|
154 |
)
|
155 |
with gr.Column():
|
156 |
md_output = gr.HTML(
|
157 |
-
label="Rendered Output"
|
158 |
)
|
159 |
|
|
|
160 |
md_input.change(fn=render_markdown, inputs=md_input, outputs=md_output)
|
161 |
|
162 |
########################################################
|
163 |
# TAB 2: ReaderLM-2 Converter (HTML → Markdown/JSON)
|
164 |
########################################################
|
165 |
with gr.Tab("ReaderLM-2 Converter"):
|
166 |
-
gr.Markdown("## HTML to Markdown/JSON Converter (ReaderLM-v2)")
|
167 |
|
168 |
with gr.Row():
|
169 |
html_input_2 = gr.Textbox(
|
170 |
lines=10,
|
171 |
placeholder="Paste your raw HTML here...",
|
172 |
-
label="Raw HTML Input"
|
173 |
)
|
174 |
output_format_2 = gr.Radio(
|
175 |
-
["Markdown", "JSON"],
|
176 |
label="Output Format",
|
177 |
-
value="Markdown"
|
|
|
|
|
|
|
|
|
|
|
178 |
)
|
179 |
|
180 |
-
convert_btn_2 = gr.Button("Convert")
|
181 |
converted_output_2 = gr.Textbox(
|
182 |
lines=20,
|
183 |
-
label="Converted Output"
|
184 |
)
|
185 |
|
186 |
-
# Provide usage details
|
187 |
gr.Markdown(
|
188 |
"Convert raw HTML into formatted Markdown or JSON using **JinaAI ReaderLM-v2**."
|
189 |
)
|
190 |
|
191 |
-
#
|
192 |
convert_btn_2.click(
|
193 |
fn=convert_html,
|
194 |
-
inputs=[html_input_2, output_format_2],
|
195 |
outputs=converted_output_2
|
196 |
)
|
197 |
|
198 |
-
#
|
199 |
gr.Examples(
|
200 |
examples=[
|
201 |
-
["<h1>Hello World</h1><p>This is a <strong>test</strong>.</p>", "Markdown"],
|
202 |
-
["<ul><li>Item 1</li><li>Item 2</li></ul>", "JSON"]
|
203 |
],
|
204 |
-
inputs=[html_input_2, output_format_2],
|
205 |
outputs=converted_output_2,
|
206 |
fn=convert_html,
|
207 |
-
cache_examples=False
|
208 |
)
|
209 |
|
210 |
########################################################
|
@@ -220,20 +243,20 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
|
|
220 |
with gr.Row():
|
221 |
with gr.Column():
|
222 |
model_selector = gr.Dropdown(
|
223 |
-
choices=list(models.keys()),
|
224 |
label="Model",
|
225 |
-
value="jinaai/reader-lm-1.5b"
|
226 |
)
|
227 |
html_content = gr.Textbox(
|
228 |
-
label="HTML"
|
229 |
)
|
230 |
-
submit_btn = gr.Button(value="Submit")
|
231 |
|
232 |
with gr.Column():
|
233 |
-
model_output_text = gr.Textbox(label="Reader LM Output")
|
234 |
-
markdownify_output = gr.Textbox(label="Markdownify Output")
|
235 |
|
236 |
-
#
|
237 |
gr.Examples(
|
238 |
examples=[
|
239 |
[example_html],
|
@@ -241,16 +264,17 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
|
|
241 |
inputs=[html_content],
|
242 |
outputs=[model_output_text, markdownify_output],
|
243 |
fn=run_example,
|
244 |
-
cache_examples=True,
|
245 |
label="Try example HTML"
|
246 |
)
|
247 |
|
248 |
-
#
|
249 |
submit_btn.click(
|
250 |
fn=run_example,
|
251 |
inputs=[html_content, model_selector],
|
252 |
outputs=[model_output_text, markdownify_output]
|
253 |
)
|
254 |
|
255 |
-
# Finally, launch the combined demo
|
|
|
256 |
demo.launch()
|
|
|
6 |
from markdown.extensions.attr_list import AttrListExtension
|
7 |
from markdown.extensions.codehilite import CodeHiliteExtension
|
8 |
|
9 |
+
# For ReaderLM-2 functionality
|
10 |
from transformers import pipeline
|
11 |
|
12 |
+
# For ReaderLM-1 functionality
|
13 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
14 |
import spaces
|
15 |
import re
|
|
|
20 |
######################################
|
21 |
def render_markdown(md_text):
    """
    Convert Markdown source text into an HTML string.

    The conversion runs Python-Markdown with five extensions enabled:
    tables, fenced code blocks, a table of contents (baselevel=2),
    attribute lists, and code highlighting (no line numbers, CSS class
    "highlight").
    """
    print("Rendering markdown input to HTML...")  # Debug log

    # Build the extension list first so the conversion call stays short.
    enabled_extensions = [
        TableExtension(),
        FencedCodeExtension(),
        TocExtension(baselevel=2),
        AttrListExtension(),
        CodeHiliteExtension(linenums=False, css_class="highlight"),
    ]
    rendered_html = markdown.markdown(md_text, extensions=enabled_extensions)
    return rendered_html
|
37 |
|
|
|
40 |
######################################
|
41 |
# Build the JinaAI ReaderLM-v2 text-generation pipeline once, at import time;
# convert_html() reuses this module-level object for every request.
model_name = "jinaai/ReaderLM-v2"
print(f"Loading model: {model_name}...")  # Debug log
html_converter = pipeline(task="text-generation", model=model_name)
|
45 |
|
46 |
+
def convert_html(html_input, output_format, custom_prompt=None):
    """
    Use the JinaAI ReaderLM-v2 pipeline to convert HTML into Markdown or JSON.

    Args:
        html_input: Raw HTML text to convert.
        output_format: Target format name inserted into the default
            instruction (the UI offers "Markdown" and "JSON").
        custom_prompt: Optional instruction that replaces the default
            "Convert the following HTML into ..." line. Blank or
            whitespace-only values are treated as not provided.

    Returns:
        The text generated by the model with surrounding whitespace removed,
        or an empty string when ``html_input`` is empty.
    """
    # Nothing to convert: skip the model call instead of generating text for
    # an instruction that has no HTML attached (also avoids slicing None).
    if not html_input or not html_input.strip():
        print("No HTML input provided; skipping conversion.")  # Debug log
        return ""

    # A whitespace-only prompt box must not override the default instruction.
    instruction = (custom_prompt or "").strip()
    if instruction:
        prompt = f"{instruction}\n\n{html_input}"
    else:
        prompt = f"Convert the following HTML into {output_format}:\n\n{html_input}"

    print(f"Converting HTML to {output_format} with prompt: {instruction or 'default'}...")  # Debug log
    print(f"HTML input: {html_input[:100]}...")  # Debug log, preview first 100 characters of input

    # return_full_text=False makes the pipeline return only the newly
    # generated text. Removing the prompt afterwards with str.replace would
    # also delete any repetition of the prompt inside the model's answer.
    # NOTE(review): max_length bounds prompt + generated tokens together, so
    # very long HTML leaves little room for output -- consider max_new_tokens.
    response = html_converter(
        prompt,
        max_length=9999,
        num_return_sequences=1,
        return_full_text=False,
    )
    converted_output = response[0]['generated_text'].strip()
    print("Conversion completed.")  # Debug log
    return converted_output
|
68 |
|
69 |
######################################
|
70 |
# 3) READERLM-1 FUNCTIONALITY #
|
71 |
######################################
|
72 |
+
# Load both ReaderLM-1 checkpoints up front, keyed by their Hugging Face model
# id; each one is switched to eval mode and moved to the GPU.
print("Loading ReaderLM-1 models and tokenizers...")  # Debug log
models = {
    checkpoint: AutoModelForCausalLM.from_pretrained(
        checkpoint, trust_remote_code=True
    ).eval().to("cuda")
    for checkpoint in ("jinaai/reader-lm-0.5b", "jinaai/reader-lm-1.5b")
}
|
82 |
tokenizers = {
|
83 |
"jinaai/reader-lm-0.5b": AutoTokenizer.from_pretrained(
|
|
|
91 |
@spaces.GPU
def run_example(html_content, model_id="jinaai/reader-lm-1.5b"):
    """
    Generate Markdown from HTML using a ReaderLM (0.5b or 1.5b) model.

    Args:
        html_content: Raw HTML text to convert.
        model_id: Key into the module-level ``models`` / ``tokenizers``
            dictionaries selecting which checkpoint to run.

    Returns:
        A ``(model_markdown, markdownify_markdown)`` tuple: the text produced
        by the selected model, and the output of ``markdownify`` for the same
        HTML so the two conversions can be compared.

    Raises:
        KeyError: If ``model_id`` is not a key of ``models`` / ``tokenizers``.
    """
    print(f"Running example with model: {model_id}...")  # Debug log
    model = models[model_id]
    tokenizer = tokenizers[model_id]

    # Construct the chat-based input for the model
    messages = [{"role": "user", "content": html_content}]
    input_text = tokenizer.apply_chat_template(messages, tokenize=False)
    print(f"Generated input text for model: {input_text[:100]}...")  # Debug log, preview input text

    # Tokenize the input text
    inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    # Greedy decoding (do_sample=False). temperature only affects sampling,
    # so it is not passed here.
    outputs = model.generate(
        inputs, max_new_tokens=1024, do_sample=False, repetition_penalty=1.08
    )

    # Extract the assistant's turn from the decoded sequence.
    decoded = tokenizer.decode(outputs[0])
    pattern = r"<\|im_start\|>assistant(.*?)<\|im_end\|>"
    closed_turns = re.findall(pattern, decoded, re.DOTALL)
    if closed_turns:
        model_output = closed_turns[0]
    else:
        # No terminated assistant turn (for example, generation stopped at
        # max_new_tokens before <|im_end|>). Indexing the empty match list
        # would raise IndexError, so return whatever follows the assistant
        # marker, or the whole decoded text if the marker is absent.
        _, marker, tail = decoded.partition("<|im_start|>assistant")
        model_output = tail if marker else decoded
    print("Model generation completed.")  # Debug log

    # Rule-based conversion of the same HTML, shown next to the model output
    markdownify_output = markdownify(html_content)
    print("Rule-based markdownify output generated.")  # Debug log

    # Return both model-based and rule-based outputs
    return model_output, markdownify_output
|
125 |
|
126 |
+
# Example HTML for ReaderLM-1
|
127 |
example_html = """<div id="myDIV" class="header">
|
128 |
<h2>My To Do List</h2>
|
129 |
<input type="text" id="myInput" placeholder="Title...">
|
|
|
143 |
# Combine everything into a single Gradio Blocks app #
|
144 |
########################################################
|
145 |
|
146 |
+
# Optional extra CSS for styling the ReaderLM-1 tab.
# Comments inside the string must use CSS syntax (/* ... */). A trailing
# "# ..." is not a CSS comment: the parser treats it as the start of the next
# declaration and discards it together with the declaration that follows.
css = """
#output {
    height: 500px;            /* Set the height of the output box */
    overflow: auto;           /* Enable scrolling for large content */
    border: 1px solid #ccc;   /* Add a border around the box */
}
"""
|
154 |
|
155 |
+
# Initialize the Gradio app with the Nymbo/Nymbo_Theme for styling
|
156 |
+
print("Initializing Gradio app...") # Debug log
|
157 |
with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
|
158 |
|
159 |
########################################################
|
160 |
# TAB 1: Markdown Suite (live preview)
|
161 |
########################################################
|
162 |
with gr.Tab("Live Preview"):
|
163 |
+
gr.Markdown("# Markdown Suite") # Add a title for the tab
|
164 |
|
165 |
with gr.Row():
|
166 |
with gr.Column():
|
167 |
md_input = gr.Textbox(
|
168 |
lines=20,
|
169 |
placeholder="Write your markdown here...",
|
170 |
+
label="Markdown Input", # Input for Markdown text
|
171 |
)
|
172 |
with gr.Column():
|
173 |
md_output = gr.HTML(
|
174 |
+
label="Rendered Output" # Display the rendered HTML output
|
175 |
)
|
176 |
|
177 |
+
# Update the output whenever the input changes
|
178 |
md_input.change(fn=render_markdown, inputs=md_input, outputs=md_output)
|
179 |
|
180 |
########################################################
|
181 |
# TAB 2: ReaderLM-2 Converter (HTML → Markdown/JSON)
|
182 |
########################################################
|
183 |
with gr.Tab("ReaderLM-2 Converter"):
|
184 |
+
gr.Markdown("## HTML to Markdown/JSON Converter (ReaderLM-v2)") # Tab description
|
185 |
|
186 |
with gr.Row():
|
187 |
html_input_2 = gr.Textbox(
|
188 |
lines=10,
|
189 |
placeholder="Paste your raw HTML here...",
|
190 |
+
label="Raw HTML Input" # Input for raw HTML
|
191 |
)
|
192 |
output_format_2 = gr.Radio(
|
193 |
+
["Markdown", "JSON"], # Choose the output format
|
194 |
label="Output Format",
|
195 |
+
value="Markdown" # Default to Markdown output
|
196 |
+
)
|
197 |
+
custom_prompt_2 = gr.Textbox(
|
198 |
+
lines=2,
|
199 |
+
placeholder="Optional: Enter a custom prompt...",
|
200 |
+
label="Custom System Prompt"
|
201 |
)
|
202 |
|
203 |
+
convert_btn_2 = gr.Button("Convert") # Button to trigger conversion
|
204 |
converted_output_2 = gr.Textbox(
|
205 |
lines=20,
|
206 |
+
label="Converted Output" # Display the converted output
|
207 |
)
|
208 |
|
209 |
+
# Provide usage details for the converter
|
210 |
gr.Markdown(
|
211 |
"Convert raw HTML into formatted Markdown or JSON using **JinaAI ReaderLM-v2**."
|
212 |
)
|
213 |
|
214 |
+
# Connect the button click event to the conversion function
|
215 |
convert_btn_2.click(
|
216 |
fn=convert_html,
|
217 |
+
inputs=[html_input_2, output_format_2, custom_prompt_2],
|
218 |
outputs=converted_output_2
|
219 |
)
|
220 |
|
221 |
+
# Example inputs for the converter. The third column feeds the custom prompt
# box and is left empty on purpose: convert_html uses any non-empty custom
# prompt in place of the default "Convert the following HTML into <format>"
# instruction, so placeholder text here would be sent to the model as the
# instruction.
gr.Examples(
    examples=[
        ["<h1>Hello World</h1><p>This is a <strong>test</strong>.</p>", "Markdown", ""],
        ["<ul><li>Item 1</li><li>Item 2</li></ul>", "JSON", ""],
    ],
    inputs=[html_input_2, output_format_2, custom_prompt_2],
    outputs=converted_output_2,
    fn=convert_html,
    cache_examples=False,  # Do not precompute example outputs at startup
)
|
232 |
|
233 |
########################################################
|
|
|
243 |
with gr.Row():
|
244 |
with gr.Column():
|
245 |
model_selector = gr.Dropdown(
|
246 |
+
choices=list(models.keys()), # Allow selection between the two models
|
247 |
label="Model",
|
248 |
+
value="jinaai/reader-lm-1.5b" # Default to the larger model
|
249 |
)
|
250 |
html_content = gr.Textbox(
|
251 |
+
label="HTML" # Input for raw HTML
|
252 |
)
|
253 |
+
submit_btn = gr.Button(value="Submit") # Button to trigger the model
|
254 |
|
255 |
with gr.Column():
|
256 |
+
model_output_text = gr.Textbox(label="Reader LM Output") # Model-generated Markdown
|
257 |
+
markdownify_output = gr.Textbox(label="Markdownify Output") # Rule-based Markdown
|
258 |
|
259 |
+
# Add example HTML input for demonstration
|
260 |
gr.Examples(
|
261 |
examples=[
|
262 |
[example_html],
|
|
|
264 |
inputs=[html_content],
|
265 |
outputs=[model_output_text, markdownify_output],
|
266 |
fn=run_example,
|
267 |
+
cache_examples=True, # Cache example outputs
|
268 |
label="Try example HTML"
|
269 |
)
|
270 |
|
271 |
+
# Connect the submit button to the run_example function
|
272 |
submit_btn.click(
|
273 |
fn=run_example,
|
274 |
inputs=[html_content, model_selector],
|
275 |
outputs=[model_output_text, markdownify_output]
|
276 |
)
|
277 |
|
278 |
+
# Finally, launch the combined demo app
|
279 |
+
print("Launching the demo...") # Debug log
|
280 |
demo.launch()
|