Nymbo committed on
Commit
33d554a
·
verified ·
1 Parent(s): ee88838

restarting and adding readerlm-1, and markdownify inputs

Browse files
Files changed (1) hide show
  1. app.py +207 -60
app.py CHANGED
@@ -5,102 +5,249 @@ from markdown.extensions.fenced_code import FencedCodeExtension
5
  from markdown.extensions.toc import TocExtension
6
  from markdown.extensions.attr_list import AttrListExtension
7
  from markdown.extensions.codehilite import CodeHiliteExtension
 
 
8
  from transformers import pipeline
9
 
10
- # Function to render markdown to HTML with extensions
 
 
 
 
 
 
 
 
11
  def render_markdown(md_text):
12
- print("[DEBUG] render_markdown called with input:", md_text) # Debug log for input
13
- # Convert the input markdown text to HTML using various extensions for additional functionality
14
- rendered_html = markdown.markdown(
 
15
  md_text,
16
  extensions=[
17
- TableExtension(), # Enables rendering of tables in markdown
18
- FencedCodeExtension(), # Supports fenced code blocks
19
- TocExtension(baselevel=2), # Generates a table of contents starting at level 2
20
- AttrListExtension(), # Allows adding attributes to markdown elements
21
- CodeHiliteExtension(linenums=False, css_class="highlight"), # Syntax highlighting for code blocks
22
  ],
23
  )
24
- print("[DEBUG] Rendered HTML output:", rendered_html) # Debug log for output
25
- return rendered_html
26
 
 
 
 
27
  # Load the JinaAI ReaderLM-v2 model
28
  model_name = "jinaai/ReaderLM-v2"
29
- print("[DEBUG] Loading model:", model_name) # Debug log for model loading
30
- html_converter = pipeline("text-generation", model=model_name) # Initialize the text-generation pipeline with the specified model
31
 
32
- # Function to convert HTML to Markdown or JSON
33
  def convert_html(html_input, output_format):
34
- print("[DEBUG] convert_html called with inputs:", html_input, output_format) # Debug log for inputs
35
- # Prepare the prompt for the model, specifying the desired output format (Markdown or JSON)
 
36
  prompt = f"Convert the following HTML into {output_format}:\n\n{html_input}"
37
- print("[DEBUG] Generated prompt:", prompt) # Debug log for the prompt
38
-
39
- # Use the model to generate the conversion output
40
- response = html_converter(prompt, max_length=99999, num_return_sequences=1)
41
- print("[DEBUG] Model response:", response) # Debug log for model response
42
- converted_output = response[0]['generated_text'] # Extract the generated text from the model response
43
 
44
- # Remove the prompt text from the generated output and clean up the result
45
  converted_output = converted_output.replace(prompt, "").strip()
46
- print("[DEBUG] Converted output:", converted_output) # Debug log for the final output
47
  return converted_output
48
 
49
- # Creating the Gradio Interface
50
- with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
- # Tab for the Markdown live preview feature
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  with gr.Tab("Live Preview"):
54
- gr.Markdown("# Markdown Suite") # Header for the tab
55
 
56
  with gr.Row():
57
  with gr.Column():
58
- # Input textbox for entering Markdown text
59
  md_input = gr.Textbox(
60
- lines=20,
61
- placeholder="Write your markdown here...",
62
  label="Markdown Input",
63
- elem_classes=["gr-textbox"]
64
  )
65
  with gr.Column():
66
- # Output area to display the rendered HTML from the Markdown input
67
- md_output = gr.HTML(label="Rendered Output", elem_classes=["gr-html"])
 
68
 
69
- # Define the interaction: Update the HTML preview whenever the Markdown input changes
70
- md_input.change(render_markdown, inputs=md_input, outputs=md_output)
71
 
72
- # Tab for HTML to Markdown/JSON conversion feature
73
- with gr.Tab("HTML to Markdown/JSON"):
74
- gr.Markdown("# HTML to Markdown/JSON Converter") # Header for the tab
 
 
75
 
76
  with gr.Row():
77
- # Input textbox for raw HTML input
78
- html_input = gr.Textbox(
79
- lines=10,
80
- placeholder="Paste your raw HTML here...",
81
  label="Raw HTML Input"
82
  )
 
 
 
 
 
83
 
84
- # Radio buttons to select the output format (Markdown or JSON)
85
- output_format = gr.Radio([
86
- "Markdown",
87
- "JSON"
88
- ], label="Output Format", value="Markdown")
89
-
90
- # Output textbox to display the converted Markdown or JSON
91
- converted_output = gr.Textbox(
92
- lines=10,
93
  label="Converted Output"
94
  )
95
 
96
- # Define the interaction: Convert HTML when the "Convert" button is clicked
97
- convert_button = gr.Button("Convert")
98
- convert_button.click(
99
- convert_html, # Function to handle conversion
100
- inputs=[html_input, output_format], # Inputs: Raw HTML and desired output format
101
- outputs=converted_output # Output: Converted text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  )
103
 
104
- # Launch the app
105
- print("[DEBUG] Launching the app") # Debug log for app launch
106
  demo.launch()
 
5
  from markdown.extensions.toc import TocExtension
6
  from markdown.extensions.attr_list import AttrListExtension
7
  from markdown.extensions.codehilite import CodeHiliteExtension
8
+
9
+ # For ReaderLM-2
10
  from transformers import pipeline
11
 
12
+ # For ReaderLM-1
13
+ from transformers import AutoTokenizer, AutoModelForCausalLM
14
+ import spaces
15
+ import re
16
+ from markdownify import markdownify
17
+
18
+ ######################################
19
+ # 1) MARKDOWN-STUDIO FUNCTIONALITY #
20
+ ######################################
21
def render_markdown(md_text):
    """
    Convert Markdown source text into an HTML string.

    The conversion enables tables, fenced code blocks, a table of contents
    (heading levels start at 2), attribute lists, and code highlighting under
    the "highlight" CSS class without line numbers.
    """
    enabled_extensions = [
        TableExtension(),
        FencedCodeExtension(),
        TocExtension(baselevel=2),
        AttrListExtension(),
        CodeHiliteExtension(linenums=False, css_class="highlight"),
    ]
    html = markdown.markdown(md_text, extensions=enabled_extensions)
    return html
 
 
35
 
36
+ ######################################
37
+ # 2) READERLM-2 FUNCTIONALITY #
38
+ ######################################
39
# Load the JinaAI ReaderLM-v2 model.
# The text-generation pipeline is built once at import time; convert_html()
# below calls this shared `html_converter` for every conversion request.
model_name = "jinaai/ReaderLM-v2"
html_converter = pipeline("text-generation", model=model_name)
 
42
 
 
43
def convert_html(html_input, output_format, max_new_tokens=1024):
    """
    Use the JinaAI ReaderLM-v2 pipeline to convert HTML into Markdown or JSON.

    Args:
        html_input: Raw HTML text to convert.
        output_format: Target format name interpolated into the prompt
            (the UI offers "Markdown" and "JSON").
        max_new_tokens: Upper bound on the number of tokens generated in
            addition to the prompt.

    Returns:
        The generated text with the echoed prompt removed and surrounding
        whitespace stripped, or an empty string when there is no HTML to
        convert.
    """
    # Nothing to convert: skip the model call entirely.
    if not html_input or not html_input.strip():
        return ""

    prompt = f"Convert the following HTML into {output_format}:\n\n{html_input}"

    # Budget only the *generated* tokens. `max_length` also counts the prompt,
    # so a fixed `max_length` left little or no room for output once the
    # pasted HTML grew beyond a few hundred tokens.
    response = html_converter(
        prompt, max_new_tokens=max_new_tokens, num_return_sequences=1
    )
    converted_output = response[0]["generated_text"]

    # Remove the prompt only where it is echoed at the start of the output;
    # a blanket str.replace() would also delete matching text that the model
    # legitimately produced later in its answer.
    if converted_output.startswith(prompt):
        converted_output = converted_output[len(prompt):]

    return converted_output.strip()
54
 
55
+ ######################################
56
+ # 3) READERLM-1 FUNCTIONALITY #
57
+ ######################################
58
# Load every ReaderLM-1 checkpoint and its tokenizer up front, keyed by the
# checkpoint id so the UI dropdown value can be used directly as a lookup key.
_READER_LM_V1_IDS = ("jinaai/reader-lm-0.5b", "jinaai/reader-lm-1.5b")

# Models are switched to eval mode and moved to the GPU as they are loaded.
models = {
    checkpoint: AutoModelForCausalLM.from_pretrained(
        checkpoint, trust_remote_code=True
    ).eval().to("cuda")
    for checkpoint in _READER_LM_V1_IDS
}

# One tokenizer per checkpoint, under the same keys as `models`.
tokenizers = {
    checkpoint: AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    for checkpoint in _READER_LM_V1_IDS
}
75
+
76
# Chat-template markers that delimit the assistant turn in decoded output.
_ASSISTANT_OPEN = "<|im_start|>assistant"
_TURN_CLOSE = "<|im_end|>"


def _closed_assistant_turn(transcript):
    """Return the first complete assistant turn in `transcript`, or None."""
    found = re.search(
        r"<\|im_start\|>assistant(.*?)<\|im_end\|>", transcript, re.DOTALL
    )
    return found.group(1) if found else None


def _open_assistant_turn(generated):
    """Return the assistant text from newly generated output lacking a clean close."""
    after_marker = generated.split(_ASSISTANT_OPEN, 1)[-1]
    return after_marker.split(_TURN_CLOSE, 1)[0]


@spaces.GPU
def run_example(html_content, model_id="jinaai/reader-lm-1.5b"):
    """
    Convert HTML to Markdown with a ReaderLM-1 model and with `markdownify`.

    Args:
        html_content: Raw HTML text to convert.
        model_id: Key into the module-level `models` / `tokenizers` dicts.

    Returns:
        A `(model_markdown, markdownify_markdown)` tuple: the text generated
        by the selected model and the rule-based `markdownify` conversion of
        the same HTML.

    Raises:
        KeyError: If `model_id` is not one of the loaded models.
    """
    model = models[model_id]
    tokenizer = tokenizers[model_id]

    # Construct the chat-based input
    messages = [{"role": "user", "content": html_content}]
    input_text = tokenizer.apply_chat_template(messages, tokenize=False)

    # Tokenize
    inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    # Generate greedily. `temperature` is not passed because it has no effect
    # when sampling is disabled.
    outputs = model.generate(
        inputs, max_new_tokens=1024, do_sample=False, repetition_penalty=1.08
    )

    # Extract the model's text from the response
    model_markdown = _closed_assistant_turn(tokenizer.decode(outputs[0]))
    if model_markdown is None:
        # Generation can stop at max_new_tokens before the closing marker is
        # emitted; indexing an empty findall() result used to raise IndexError
        # here. Fall back to whatever was generated after the prompt.
        prompt_length = inputs.shape[-1]
        model_markdown = _open_assistant_turn(
            tokenizer.decode(outputs[0][prompt_length:])
        )

    # Also do a rule-based markdownify for comparison
    markdownify_output = markdownify(html_content)

    # Return the two results (model-based, rule-based)
    return model_markdown, markdownify_output
106
+
107
# Example HTML from ReaderLM-1.
# Offered as the one-click example input in the ReaderLM-1 tab.
example_html = """<div id="myDIV" class="header">
<h2>My To Do List</h2>
<input type="text" id="myInput" placeholder="Title...">
<span onclick="newElement()" class="addBtn">Add</span>
</div>

<ul id="myUL">
<li>Hit the gym</li>
<li class="checked">Pay bills</li>
<li>Meet George</li>
<li>Buy eggs</li>
<li>Read a book</li>
<li>Organize office</li>
</ul>"""

########################################################
# Combine everything into a single Gradio Blocks app   #
########################################################

# Optional extra CSS for the ReaderLM-1 tab.
# Gives the element with id "output" a fixed height, scrolling and a border.
# NOTE(review): no component in the visible UI code sets elem_id="output",
# so this rule may currently match nothing - confirm.
css = """
#output {
height: 500px;
overflow: auto;
border: 1px solid #ccc;
}
"""
135
+
136
# We use the Nymbo/Nymbo_Theme from the original Markdown-Studio example.
# The app is a single Blocks layout with three tabs; component creation order
# inside each `with` block determines the on-screen layout.
with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:

    ########################################################
    # TAB 1: Markdown Suite (live preview)
    ########################################################
    with gr.Tab("Live Preview"):
        gr.Markdown("# Markdown Suite")

        # Editor and rendered preview, one column each.
        with gr.Row():
            with gr.Column():
                md_input = gr.Textbox(
                    lines=20,
                    placeholder="Write your markdown here...",
                    label="Markdown Input",
                )
            with gr.Column():
                md_output = gr.HTML(
                    label="Rendered Output"
                )

        # Re-render the preview every time the Markdown text changes.
        md_input.change(fn=render_markdown, inputs=md_input, outputs=md_output)

    ########################################################
    # TAB 2: ReaderLM-2 Converter (HTML to Markdown/JSON)
    ########################################################
    with gr.Tab("ReaderLM-2 Converter"):
        gr.Markdown("## HTML to Markdown/JSON Converter (ReaderLM-v2)")

        # NOTE(review): nesting of the widgets below is assumed from their
        # grouping; confirm the format selector belongs inside this row.
        with gr.Row():
            html_input_2 = gr.Textbox(
                lines=10,
                placeholder="Paste your raw HTML here...",
                label="Raw HTML Input"
            )
            output_format_2 = gr.Radio(
                ["Markdown", "JSON"],
                label="Output Format",
                value="Markdown"
            )

        convert_btn_2 = gr.Button("Convert")
        converted_output_2 = gr.Textbox(
            lines=10,
            label="Converted Output"
        )

        # Provide usage details
        gr.Markdown(
            "Convert raw HTML into formatted Markdown or JSON using **JinaAI ReaderLM-v2**."
        )

        # Button event: calls convert_html with the HTML and the chosen format
        convert_btn_2.click(
            fn=convert_html,
            inputs=[html_input_2, output_format_2],
            outputs=converted_output_2
        )

        # Examples: each row is [html, output format]. Results are not
        # pre-computed (cache_examples=False).
        gr.Examples(
            examples=[
                ["<h1>Hello World</h1><p>This is a <strong>test</strong>.</p>", "Markdown"],
                ["<ul><li>Item 1</li><li>Item 2</li></ul>", "JSON"]
            ],
            inputs=[html_input_2, output_format_2],
            outputs=converted_output_2,
            fn=convert_html,
            cache_examples=False
        )

    ########################################################
    # TAB 3: ReaderLM-1 HTML-to-Markdown
    ########################################################
    with gr.Tab("ReaderLM-1 Converter"):
        gr.Markdown("""
        # HTML-to-Markdown with ReaderLM-1
        Use either **jinaai/reader-lm-0.5b** or **jinaai/reader-lm-1.5b**
        to convert HTML to Markdown. Compare against rule-based `markdownify`.
        """)

        with gr.Row():
            # Left column: model choice, HTML input and the submit button.
            with gr.Column():
                model_selector = gr.Dropdown(
                    choices=list(models.keys()),
                    label="Model",
                    value="jinaai/reader-lm-1.5b"
                )
                html_content = gr.Textbox(
                    label="HTML"
                )
                submit_btn = gr.Button(value="Submit")

            # Right column: model output next to the rule-based output.
            with gr.Column():
                model_output_text = gr.Textbox(label="Reader LM Output")
                markdownify_output = gr.Textbox(label="Markdownify Output")

        # Example usage. Only the HTML is supplied as an input here, so
        # run_example falls back to its default model_id for the example.
        gr.Examples(
            examples=[
                [example_html],
            ],
            inputs=[html_content],
            outputs=[model_output_text, markdownify_output],
            fn=run_example,
            cache_examples=True,
            label="Try example HTML"
        )

        # Button event for custom input: passes both the HTML and the
        # selected model id to run_example.
        submit_btn.click(
            fn=run_example,
            inputs=[html_content, model_selector],
            outputs=[model_output_text, markdownify_output]
        )

# Finally, launch the combined demo
demo.launch()