Nymbo committed on
Commit
36709a2
·
verified ·
1 Parent(s): 52d93e9

debug logs and adding system prompt override textbox for ReaderLM-2

Browse files
Files changed (1) hide show
  1. app.py +87 -63
app.py CHANGED
@@ -6,10 +6,10 @@ from markdown.extensions.toc import TocExtension
6
  from markdown.extensions.attr_list import AttrListExtension
7
  from markdown.extensions.codehilite import CodeHiliteExtension
8
 
9
- # For ReaderLM-2
10
  from transformers import pipeline
11
 
12
- # For ReaderLM-1
13
  from transformers import AutoTokenizer, AutoModelForCausalLM
14
  import spaces
15
  import re
@@ -20,16 +20,18 @@ from markdownify import markdownify
20
  ######################################
21
  def render_markdown(md_text):
22
  """
23
- Render a string of Markdown text into HTML with a number of useful extensions.
 
24
  """
 
25
  return markdown.markdown(
26
  md_text,
27
  extensions=[
28
- TableExtension(),
29
- FencedCodeExtension(),
30
- TocExtension(baselevel=2),
31
- AttrListExtension(),
32
- CodeHiliteExtension(linenums=False, css_class="highlight"),
33
  ],
34
  )
35
 
@@ -38,34 +40,44 @@ def render_markdown(md_text):
38
  ######################################
39
  # Load the JinaAI ReaderLM-v2 model
40
  model_name = "jinaai/ReaderLM-v2"
 
41
  html_converter = pipeline("text-generation", model=model_name)
42
 
43
- def convert_html(html_input, output_format):
44
  """
45
  Use the JinaAI ReaderLM-v2 pipeline to convert HTML into Markdown or JSON.
 
 
46
  """
47
-
48
- # ReaderLM-2 System Prompt
49
-
50
- prompt = f"Convert the following HTML into {output_format}:\n\n{html_input}"
 
 
 
 
 
51
  response = html_converter(prompt, max_length=9999, num_return_sequences=1)
52
  converted_output = response[0]['generated_text']
53
-
54
- # Remove the prompt from the start of the generated text, if present
55
  converted_output = converted_output.replace(prompt, "").strip()
 
56
  return converted_output
57
 
58
  ######################################
59
  # 3) READERLM-1 FUNCTIONALITY #
60
  ######################################
61
- # Prepare models and tokenizers
 
62
  models = {
63
  "jinaai/reader-lm-0.5b": AutoModelForCausalLM.from_pretrained(
64
  "jinaai/reader-lm-0.5b", trust_remote_code=True
65
- ).eval().to("cuda"),
66
  "jinaai/reader-lm-1.5b": AutoModelForCausalLM.from_pretrained(
67
  "jinaai/reader-lm-1.5b", trust_remote_code=True
68
- ).eval().to("cuda"),
69
  }
70
  tokenizers = {
71
  "jinaai/reader-lm-0.5b": AutoTokenizer.from_pretrained(
@@ -79,35 +91,39 @@ tokenizers = {
79
  @spaces.GPU
80
  def run_example(html_content, model_id="jinaai/reader-lm-1.5b"):
81
  """
82
- Use ReaderLM (0.5b or 1.5b) to generate model-based HTML-to-Markdown text,
83
- then also provide a rule-based 'markdownify' output.
84
  """
85
- model = models[model_id]
86
- tokenizer = tokenizers[model_id]
 
87
 
88
- # Construct the chat-based input
89
  messages = [{"role": "user", "content": html_content}]
90
- input_text = tokenizer.apply_chat_template(messages, tokenize=False)
 
91
 
92
- # Tokenize
93
  inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")
94
-
95
- # Generate
96
  outputs = model.generate(
97
  inputs, max_new_tokens=1024, temperature=0, do_sample=False, repetition_penalty=1.08
98
  )
99
 
100
- # Extract the model's text from the response
101
  pattern = r"<\|im_start\|>assistant(.*?)<\|im_end\|>"
102
  assistant_response = re.findall(pattern, tokenizer.decode(outputs[0]), re.DOTALL)
 
103
 
104
- # Also do a rule-based markdownify for comparison
105
  markdownify_output = markdownify(html_content)
 
106
 
107
- # Return the two results (model-based, rule-based)
108
  return assistant_response[0], markdownify_output
109
 
110
- # Example HTML from ReaderLM-1
111
  example_html = """<div id="myDIV" class="header">
112
  <h2>My To Do List</h2>
113
  <input type="text" id="myInput" placeholder="Title...">
@@ -127,84 +143,91 @@ example_html = """<div id="myDIV" class="header">
127
  # Combine everything into a single Gradio Blocks app #
128
  ########################################################
129
 
130
- # Optional extra CSS for the ReaderLM-1 tab
131
  css = """
132
  #output {
133
- height: 500px;
134
- overflow: auto;
135
- border: 1px solid #ccc;
136
  }
137
  """
138
 
139
- # We use the Nymbo/Nymbo_Theme from the original Markdown-Studio example
 
140
  with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
141
 
142
  ########################################################
143
  # TAB 1: Markdown Suite (live preview)
144
  ########################################################
145
  with gr.Tab("Live Preview"):
146
- gr.Markdown("# Markdown Suite")
147
 
148
  with gr.Row():
149
  with gr.Column():
150
  md_input = gr.Textbox(
151
  lines=20,
152
  placeholder="Write your markdown here...",
153
- label="Markdown Input",
154
  )
155
  with gr.Column():
156
  md_output = gr.HTML(
157
- label="Rendered Output"
158
  )
159
 
 
160
  md_input.change(fn=render_markdown, inputs=md_input, outputs=md_output)
161
 
162
  ########################################################
163
  # TAB 2: ReaderLM-2 Converter (HTML → Markdown/JSON)
164
  ########################################################
165
  with gr.Tab("ReaderLM-2 Converter"):
166
- gr.Markdown("## HTML to Markdown/JSON Converter (ReaderLM-v2)")
167
 
168
  with gr.Row():
169
  html_input_2 = gr.Textbox(
170
  lines=10,
171
  placeholder="Paste your raw HTML here...",
172
- label="Raw HTML Input"
173
  )
174
  output_format_2 = gr.Radio(
175
- ["Markdown", "JSON"],
176
  label="Output Format",
177
- value="Markdown"
 
 
 
 
 
178
  )
179
 
180
- convert_btn_2 = gr.Button("Convert")
181
  converted_output_2 = gr.Textbox(
182
  lines=20,
183
- label="Converted Output"
184
  )
185
 
186
- # Provide usage details
187
  gr.Markdown(
188
  "Convert raw HTML into formatted Markdown or JSON using **JinaAI ReaderLM-v2**."
189
  )
190
 
191
- # Button event: calls convert_html
192
  convert_btn_2.click(
193
  fn=convert_html,
194
- inputs=[html_input_2, output_format_2],
195
  outputs=converted_output_2
196
  )
197
 
198
- # Examples
199
  gr.Examples(
200
  examples=[
201
- ["<h1>Hello World</h1><p>This is a <strong>test</strong>.</p>", "Markdown"],
202
- ["<ul><li>Item 1</li><li>Item 2</li></ul>", "JSON"]
203
  ],
204
- inputs=[html_input_2, output_format_2],
205
  outputs=converted_output_2,
206
  fn=convert_html,
207
- cache_examples=False
208
  )
209
 
210
  ########################################################
@@ -220,20 +243,20 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
220
  with gr.Row():
221
  with gr.Column():
222
  model_selector = gr.Dropdown(
223
- choices=list(models.keys()),
224
  label="Model",
225
- value="jinaai/reader-lm-1.5b"
226
  )
227
  html_content = gr.Textbox(
228
- label="HTML"
229
  )
230
- submit_btn = gr.Button(value="Submit")
231
 
232
  with gr.Column():
233
- model_output_text = gr.Textbox(label="Reader LM Output")
234
- markdownify_output = gr.Textbox(label="Markdownify Output")
235
 
236
- # Example usage
237
  gr.Examples(
238
  examples=[
239
  [example_html],
@@ -241,16 +264,17 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
241
  inputs=[html_content],
242
  outputs=[model_output_text, markdownify_output],
243
  fn=run_example,
244
- cache_examples=True,
245
  label="Try example HTML"
246
  )
247
 
248
- # Button event for custom input
249
  submit_btn.click(
250
  fn=run_example,
251
  inputs=[html_content, model_selector],
252
  outputs=[model_output_text, markdownify_output]
253
  )
254
 
255
- # Finally, launch the combined demo
 
256
  demo.launch()
 
6
  from markdown.extensions.attr_list import AttrListExtension
7
  from markdown.extensions.codehilite import CodeHiliteExtension
8
 
9
+ # For ReaderLM-2 functionality
10
  from transformers import pipeline
11
 
12
+ # For ReaderLM-1 functionality
13
  from transformers import AutoTokenizer, AutoModelForCausalLM
14
  import spaces
15
  import re
 
20
  ######################################
21
def render_markdown(md_text):
    """Convert a Markdown string into rendered HTML.

    Enabled extensions: tables, fenced code blocks, a table of contents
    (headings start at level 2), attribute lists, and CodeHilite syntax
    highlighting (no line numbers, CSS class "highlight").
    """
    print("Rendering markdown input to HTML...")  # Debug log
    enabled_extensions = [
        TableExtension(),                                         # Markdown tables
        FencedCodeExtension(),                                    # ``` fenced code blocks
        TocExtension(baselevel=2),                                # TOC starting at h2
        AttrListExtension(),                                      # {: attr } lists on elements
        CodeHiliteExtension(linenums=False, css_class="highlight"),  # code highlighting
    ]
    return markdown.markdown(md_text, extensions=enabled_extensions)
 
 
40
  ######################################
41
# Load the JinaAI ReaderLM-v2 model as a text-generation pipeline.
# NOTE(review): this runs at import time, so the checkpoint is downloaded/
# loaded before the UI starts.
model_name = "jinaai/ReaderLM-v2"
print(f"Loading model: {model_name}...")  # Debug log
# Module-level pipeline reused by convert_html() below.
html_converter = pipeline("text-generation", model=model_name)
45
 
46
def convert_html(html_input, output_format, custom_prompt=None):
    """
    Use the JinaAI ReaderLM-v2 pipeline to convert HTML into Markdown or JSON.

    Args:
        html_input: Raw HTML string to convert.
        output_format: Target format name ("Markdown" or "JSON"); used in the
            default instruction when no custom prompt is given.
        custom_prompt: Optional system-prompt override. Empty or
            whitespace-only values (e.g. an untouched UI textbox) fall back
            to the default instruction.

    Returns:
        The generated text with the echoed prompt stripped from the start.
    """
    # Whitespace-only overrides are treated as absent so the default prompt
    # is still used when the textbox contains only stray spaces/newlines.
    if custom_prompt and custom_prompt.strip():
        prompt = f"{custom_prompt}\n\n{html_input}"
    else:
        prompt = f"Convert the following HTML into {output_format}:\n\n{html_input}"

    print(f"Converting HTML to {output_format} with prompt: {custom_prompt or 'default'}...")  # Debug log
    print(f"HTML input: {html_input[:100]}...")  # Debug log, preview first 100 characters of input

    # Use the pipeline to generate the conversion
    response = html_converter(prompt, max_length=9999, num_return_sequences=1)
    converted_output = response[0]['generated_text']

    # Strip the echoed prompt from the *start* only. The previous
    # `replace(prompt, "")` removed every occurrence of the prompt text,
    # which could also delete legitimate matches inside the generated body.
    if converted_output.startswith(prompt):
        converted_output = converted_output[len(prompt):]
    converted_output = converted_output.strip()
    print("Conversion completed.")  # Debug log
    return converted_output
68
 
69
  ######################################
70
  # 3) READERLM-1 FUNCTIONALITY #
71
  ######################################
72
# Prepare models and tokenizers for ReaderLM-1
print("Loading ReaderLM-1 models and tokenizers...")  # Debug log
# Both ReaderLM-1 checkpoints, loaded in eval mode and placed on the GPU.
models = {
    repo_id: AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True).eval().to("cuda")
    for repo_id in ("jinaai/reader-lm-0.5b", "jinaai/reader-lm-1.5b")
}
82
  tokenizers = {
83
  "jinaai/reader-lm-0.5b": AutoTokenizer.from_pretrained(
 
91
@spaces.GPU
def run_example(html_content, model_id="jinaai/reader-lm-1.5b"):
    """
    Generate Markdown from HTML using ReaderLM (0.5b or 1.5b) models.

    Args:
        html_content: Raw HTML string to convert.
        model_id: Key into the module-level ``models``/``tokenizers`` dicts.

    Returns:
        Tuple of (model-generated text, rule-based markdownify output).
    """
    print(f"Running example with model: {model_id}...")  # Debug log
    model = models[model_id]  # Select the model based on the input ID
    tokenizer = tokenizers[model_id]  # Retrieve the corresponding tokenizer

    # Construct the chat-based input for the model
    messages = [{"role": "user", "content": html_content}]
    input_text = tokenizer.apply_chat_template(messages, tokenize=False)  # Format input text for the model
    print(f"Generated input text for model: {input_text[:100]}...")  # Debug log, preview input text

    # Tokenize the input text
    inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    # Generate output using the model
    outputs = model.generate(
        inputs, max_new_tokens=1024, temperature=0, do_sample=False, repetition_penalty=1.08
    )

    # Extract the assistant's response from the generated output
    pattern = r"<\|im_start\|>assistant(.*?)<\|im_end\|>"
    decoded = tokenizer.decode(outputs[0])
    assistant_response = re.findall(pattern, decoded, re.DOTALL)
    print("Model generation completed.")  # Debug log

    # Fall back to the full decoded text when the assistant markers are
    # missing (e.g. generation truncated before <|im_end|>), instead of
    # raising IndexError on an empty findall() result.
    model_text = assistant_response[0] if assistant_response else decoded

    # Use markdownify as a rule-based fallback for comparison
    markdownify_output = markdownify(html_content)
    print("Rule-based markdownify output generated.")  # Debug log

    # Return both model-based and rule-based outputs
    return model_text, markdownify_output
125
 
126
+ # Example HTML for ReaderLM-1
127
  example_html = """<div id="myDIV" class="header">
128
  <h2>My To Do List</h2>
129
  <input type="text" id="myInput" placeholder="Title...">
 
143
  # Combine everything into a single Gradio Blocks app #
144
  ########################################################
145
 
146
# Optional extra CSS for styling the ReaderLM-1 tab.
# NOTE: CSS has no `#` line comments — `/* ... */` is the only comment
# syntax. The previous `# ...` annotations were parsed as part of the
# declarations, silently disabling the `overflow` and `border` rules.
css = """
#output {
    height: 500px;           /* fixed height for the output box */
    overflow: auto;          /* scroll when content overflows */
    border: 1px solid #ccc;  /* light border around the box */
}
"""
154
 
155
# Initialize the Gradio app with the Nymbo/Nymbo_Theme for styling
print("Initializing Gradio app...")  # Debug log
with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:

    ########################################################
    # TAB 1: Markdown Suite (live preview)
    ########################################################
    with gr.Tab("Live Preview"):
        gr.Markdown("# Markdown Suite")  # Add a title for the tab

        with gr.Row():
            with gr.Column():
                # Left column: raw Markdown source entered by the user.
                md_input = gr.Textbox(
                    lines=20,
                    placeholder="Write your markdown here...",
                    label="Markdown Input",  # Input for Markdown text
                )
            with gr.Column():
                # Right column: live-rendered HTML preview.
                md_output = gr.HTML(
                    label="Rendered Output"  # Display the rendered HTML output
                )

        # Re-render on every edit: update the preview whenever the input changes.
        md_input.change(fn=render_markdown, inputs=md_input, outputs=md_output)
179
 
180
    ########################################################
    # TAB 2: ReaderLM-2 Converter (HTML → Markdown/JSON)
    ########################################################
    with gr.Tab("ReaderLM-2 Converter"):
        gr.Markdown("## HTML to Markdown/JSON Converter (ReaderLM-v2)")  # Tab description

        with gr.Row():
            html_input_2 = gr.Textbox(
                lines=10,
                placeholder="Paste your raw HTML here...",
                label="Raw HTML Input"  # Input for raw HTML
            )
            output_format_2 = gr.Radio(
                ["Markdown", "JSON"],  # Choose the output format
                label="Output Format",
                value="Markdown"  # Default to Markdown output
            )
            # Optional system-prompt override; an empty textbox makes
            # convert_html fall back to its default instruction prompt.
            custom_prompt_2 = gr.Textbox(
                lines=2,
                placeholder="Optional: Enter a custom prompt...",
                label="Custom System Prompt"
            )

        convert_btn_2 = gr.Button("Convert")  # Button to trigger conversion
        converted_output_2 = gr.Textbox(
            lines=20,
            label="Converted Output"  # Display the converted output
        )

        # Provide usage details for the converter
        gr.Markdown(
            "Convert raw HTML into formatted Markdown or JSON using **JinaAI ReaderLM-v2**."
        )

        # Connect the button click event to the conversion function
        convert_btn_2.click(
            fn=convert_html,
            inputs=[html_input_2, output_format_2, custom_prompt_2],
            outputs=converted_output_2
        )
220
 
221
+ # Add example inputs for demonstration
222
  gr.Examples(
223
  examples=[
224
+ ["<h1>Hello World</h1><p>This is a <strong>test</strong>.</p>", "Markdown", "Optional custom prompt"],
225
+ ["<ul><li>Item 1</li><li>Item 2</li></ul>", "JSON", "Optional custom prompt"]
226
  ],
227
+ inputs=[html_input_2, output_format_2, custom_prompt_2],
228
  outputs=converted_output_2,
229
  fn=convert_html,
230
+ cache_examples=False # Disable caching for dynamic examples
231
  )
232
 
233
  ########################################################
 
243
        with gr.Row():
            with gr.Column():
                # Choose which ReaderLM-1 checkpoint to run.
                model_selector = gr.Dropdown(
                    choices=list(models.keys()),  # Allow selection between the two models
                    label="Model",
                    value="jinaai/reader-lm-1.5b"  # Default to the larger model
                )
                html_content = gr.Textbox(
                    label="HTML"  # Input for raw HTML
                )
                submit_btn = gr.Button(value="Submit")  # Button to trigger the model

            with gr.Column():
                model_output_text = gr.Textbox(label="Reader LM Output")  # Model-generated Markdown
                markdownify_output = gr.Textbox(label="Markdownify Output")  # Rule-based Markdown

        # Add example HTML input for demonstration
        gr.Examples(
            examples=[
                [example_html],
            ],
            inputs=[html_content],
            outputs=[model_output_text, markdownify_output],
            fn=run_example,
            # NOTE(review): caching runs run_example once at startup, which
            # requires the GPU-backed models to be loaded — confirm intended.
            cache_examples=True,  # Cache example outputs
            label="Try example HTML"
        )

        # Connect the submit button to the run_example function
        submit_btn.click(
            fn=run_example,
            inputs=[html_content, model_selector],
            outputs=[model_output_text, markdownify_output]
        )
277
 
278
# Finally, launch the combined demo app (starts the Gradio server).
print("Launching the demo...")  # Debug log
demo.launch()