Nymbo committed on
Commit
36709a2
·
verified ·
1 Parent(s): 52d93e9

debug logs and adding system prompt override textbox for ReaderLM-2

Browse files
Files changed (1) hide show
  1. app.py +87 -63
app.py CHANGED
@@ -6,10 +6,10 @@ from markdown.extensions.toc import TocExtension
6
  from markdown.extensions.attr_list import AttrListExtension
7
  from markdown.extensions.codehilite import CodeHiliteExtension
8
 
9
- # For ReaderLM-2
10
  from transformers import pipeline
11
 
12
- # For ReaderLM-1
13
  from transformers import AutoTokenizer, AutoModelForCausalLM
14
  import spaces
15
  import re
@@ -20,16 +20,18 @@ from markdownify import markdownify
20
  ######################################
21
  def render_markdown(md_text):
22
  """
23
- Render a string of Markdown text into HTML with a number of useful extensions.
 
24
  """
 
25
  return markdown.markdown(
26
  md_text,
27
  extensions=[
28
- TableExtension(),
29
- FencedCodeExtension(),
30
- TocExtension(baselevel=2),
31
- AttrListExtension(),
32
- CodeHiliteExtension(linenums=False, css_class="highlight"),
33
  ],
34
  )
35
 
@@ -38,34 +40,44 @@ def render_markdown(md_text):
38
  ######################################
39
  # Load the JinaAI ReaderLM-v2 model
40
  model_name = "jinaai/ReaderLM-v2"
 
41
  html_converter = pipeline("text-generation", model=model_name)
42
 
43
- def convert_html(html_input, output_format):
44
  """
45
  Use the JinaAI ReaderLM-v2 pipeline to convert HTML into Markdown or JSON.
 
 
46
  """
47
-
48
- # ReaderLM-2 System Prompt
49
-
50
- prompt = f"Convert the following HTML into {output_format}:\n\n{html_input}"
 
 
 
 
 
51
  response = html_converter(prompt, max_length=9999, num_return_sequences=1)
52
  converted_output = response[0]['generated_text']
53
-
54
- # Remove the prompt from the start of the generated text, if present
55
  converted_output = converted_output.replace(prompt, "").strip()
 
56
  return converted_output
57
 
58
  ######################################
59
  # 3) READERLM-1 FUNCTIONALITY #
60
  ######################################
61
- # Prepare models and tokenizers
 
62
  models = {
63
  "jinaai/reader-lm-0.5b": AutoModelForCausalLM.from_pretrained(
64
  "jinaai/reader-lm-0.5b", trust_remote_code=True
65
- ).eval().to("cuda"),
66
  "jinaai/reader-lm-1.5b": AutoModelForCausalLM.from_pretrained(
67
  "jinaai/reader-lm-1.5b", trust_remote_code=True
68
- ).eval().to("cuda"),
69
  }
70
  tokenizers = {
71
  "jinaai/reader-lm-0.5b": AutoTokenizer.from_pretrained(
@@ -79,35 +91,39 @@ tokenizers = {
79
  @spaces.GPU
80
  def run_example(html_content, model_id="jinaai/reader-lm-1.5b"):
81
  """
82
- Use ReaderLM (0.5b or 1.5b) to generate model-based HTML-to-Markdown text,
83
- then also provide a rule-based 'markdownify' output.
84
  """
85
- model = models[model_id]
86
- tokenizer = tokenizers[model_id]
 
87
 
88
- # Construct the chat-based input
89
  messages = [{"role": "user", "content": html_content}]
90
- input_text = tokenizer.apply_chat_template(messages, tokenize=False)
 
91
 
92
- # Tokenize
93
  inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")
94
-
95
- # Generate
96
  outputs = model.generate(
97
  inputs, max_new_tokens=1024, temperature=0, do_sample=False, repetition_penalty=1.08
98
  )
99
 
100
- # Extract the model's text from the response
101
  pattern = r"<\|im_start\|>assistant(.*?)<\|im_end\|>"
102
  assistant_response = re.findall(pattern, tokenizer.decode(outputs[0]), re.DOTALL)
 
103
 
104
- # Also do a rule-based markdownify for comparison
105
  markdownify_output = markdownify(html_content)
 
106
 
107
- # Return the two results (model-based, rule-based)
108
  return assistant_response[0], markdownify_output
109
 
110
- # Example HTML from ReaderLM-1
111
  example_html = """<div id="myDIV" class="header">
112
  <h2>My To Do List</h2>
113
  <input type="text" id="myInput" placeholder="Title...">
@@ -127,84 +143,91 @@ example_html = """<div id="myDIV" class="header">
127
  # Combine everything into a single Gradio Blocks app #
128
  ########################################################
129
 
130
- # Optional extra CSS for the ReaderLM-1 tab
131
  css = """
132
  #output {
133
- height: 500px;
134
- overflow: auto;
135
- border: 1px solid #ccc;
136
  }
137
  """
138
 
139
- # We use the Nymbo/Nymbo_Theme from the original Markdown-Studio example
 
140
  with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
141
 
142
  ########################################################
143
  # TAB 1: Markdown Suite (live preview)
144
  ########################################################
145
  with gr.Tab("Live Preview"):
146
- gr.Markdown("# Markdown Suite")
147
 
148
  with gr.Row():
149
  with gr.Column():
150
  md_input = gr.Textbox(
151
  lines=20,
152
  placeholder="Write your markdown here...",
153
- label="Markdown Input",
154
  )
155
  with gr.Column():
156
  md_output = gr.HTML(
157
- label="Rendered Output"
158
  )
159
 
 
160
  md_input.change(fn=render_markdown, inputs=md_input, outputs=md_output)
161
 
162
  ########################################################
163
  # TAB 2: ReaderLM-2 Converter (HTML → Markdown/JSON)
164
  ########################################################
165
  with gr.Tab("ReaderLM-2 Converter"):
166
- gr.Markdown("## HTML to Markdown/JSON Converter (ReaderLM-v2)")
167
 
168
  with gr.Row():
169
  html_input_2 = gr.Textbox(
170
  lines=10,
171
  placeholder="Paste your raw HTML here...",
172
- label="Raw HTML Input"
173
  )
174
  output_format_2 = gr.Radio(
175
- ["Markdown", "JSON"],
176
  label="Output Format",
177
- value="Markdown"
 
 
 
 
 
178
  )
179
 
180
- convert_btn_2 = gr.Button("Convert")
181
  converted_output_2 = gr.Textbox(
182
  lines=20,
183
- label="Converted Output"
184
  )
185
 
186
- # Provide usage details
187
  gr.Markdown(
188
  "Convert raw HTML into formatted Markdown or JSON using **JinaAI ReaderLM-v2**."
189
  )
190
 
191
- # Button event: calls convert_html
192
  convert_btn_2.click(
193
  fn=convert_html,
194
- inputs=[html_input_2, output_format_2],
195
  outputs=converted_output_2
196
  )
197
 
198
- # Examples
199
  gr.Examples(
200
  examples=[
201
- ["<h1>Hello World</h1><p>This is a <strong>test</strong>.</p>", "Markdown"],
202
- ["<ul><li>Item 1</li><li>Item 2</li></ul>", "JSON"]
203
  ],
204
- inputs=[html_input_2, output_format_2],
205
  outputs=converted_output_2,
206
  fn=convert_html,
207
- cache_examples=False
208
  )
209
 
210
  ########################################################
@@ -220,20 +243,20 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
220
  with gr.Row():
221
  with gr.Column():
222
  model_selector = gr.Dropdown(
223
- choices=list(models.keys()),
224
  label="Model",
225
- value="jinaai/reader-lm-1.5b"
226
  )
227
  html_content = gr.Textbox(
228
- label="HTML"
229
  )
230
- submit_btn = gr.Button(value="Submit")
231
 
232
  with gr.Column():
233
- model_output_text = gr.Textbox(label="Reader LM Output")
234
- markdownify_output = gr.Textbox(label="Markdownify Output")
235
 
236
- # Example usage
237
  gr.Examples(
238
  examples=[
239
  [example_html],
@@ -241,16 +264,17 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
241
  inputs=[html_content],
242
  outputs=[model_output_text, markdownify_output],
243
  fn=run_example,
244
- cache_examples=True,
245
  label="Try example HTML"
246
  )
247
 
248
- # Button event for custom input
249
  submit_btn.click(
250
  fn=run_example,
251
  inputs=[html_content, model_selector],
252
  outputs=[model_output_text, markdownify_output]
253
  )
254
 
255
- # Finally, launch the combined demo
 
256
  demo.launch()
 
6
  from markdown.extensions.attr_list import AttrListExtension
7
  from markdown.extensions.codehilite import CodeHiliteExtension
8
 
9
+ # For ReaderLM-2 functionality
10
  from transformers import pipeline
11
 
12
+ # For ReaderLM-1 functionality
13
  from transformers import AutoTokenizer, AutoModelForCausalLM
14
  import spaces
15
  import re
 
20
  ######################################
21
def render_markdown(md_text):
    """Convert a Markdown string into rendered HTML.

    Enabled extensions: tables, fenced code blocks, a table of contents
    (headings start at level 2), attribute lists, and CodeHilite syntax
    highlighting (no line numbers, CSS class "highlight").
    """
    print("Rendering markdown input to HTML...")  # Debug log
    enabled_extensions = [
        TableExtension(),                                         # Markdown tables
        FencedCodeExtension(),                                    # ``` fenced code blocks
        TocExtension(baselevel=2),                                # TOC starting at h2
        AttrListExtension(),                                      # {: attr } lists on elements
        CodeHiliteExtension(linenums=False, css_class="highlight"),  # code highlighting
    ]
    return markdown.markdown(md_text, extensions=enabled_extensions)
 
 
40
  ######################################
41
# Load the JinaAI ReaderLM-v2 model as a text-generation pipeline.
# NOTE(review): this runs at import time, so the checkpoint is downloaded/
# loaded before the UI starts.
model_name = "jinaai/ReaderLM-v2"
print(f"Loading model: {model_name}...")  # Debug log
# Module-level pipeline reused by convert_html() below.
html_converter = pipeline("text-generation", model=model_name)
45
 
46
def convert_html(html_input, output_format, custom_prompt=None):
    """
    Use the JinaAI ReaderLM-v2 pipeline to convert HTML into Markdown or JSON.

    Args:
        html_input: Raw HTML string to convert.
        output_format: Target format name ("Markdown" or "JSON"); used in the
            default instruction when no custom prompt is given.
        custom_prompt: Optional system-prompt override. Empty or
            whitespace-only values (e.g. an untouched UI textbox) fall back
            to the default instruction.

    Returns:
        The generated text with the echoed prompt stripped from the start.
    """
    # Whitespace-only overrides are treated as absent so the default prompt
    # is still used when the textbox contains only stray spaces/newlines.
    if custom_prompt and custom_prompt.strip():
        prompt = f"{custom_prompt}\n\n{html_input}"
    else:
        prompt = f"Convert the following HTML into {output_format}:\n\n{html_input}"

    print(f"Converting HTML to {output_format} with prompt: {custom_prompt or 'default'}...")  # Debug log
    print(f"HTML input: {html_input[:100]}...")  # Debug log, preview first 100 characters of input

    # Use the pipeline to generate the conversion
    response = html_converter(prompt, max_length=9999, num_return_sequences=1)
    converted_output = response[0]['generated_text']

    # Strip the echoed prompt from the *start* only. The previous
    # `replace(prompt, "")` removed every occurrence of the prompt text,
    # which could also delete legitimate matches inside the generated body.
    if converted_output.startswith(prompt):
        converted_output = converted_output[len(prompt):]
    converted_output = converted_output.strip()
    print("Conversion completed.")  # Debug log
    return converted_output
68
 
69
  ######################################
70
  # 3) READERLM-1 FUNCTIONALITY #
71
  ######################################
72
# Prepare models and tokenizers for ReaderLM-1
print("Loading ReaderLM-1 models and tokenizers...")  # Debug log
# Both ReaderLM-1 checkpoints, loaded in eval mode and placed on the GPU.
models = {
    repo_id: AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True).eval().to("cuda")
    for repo_id in ("jinaai/reader-lm-0.5b", "jinaai/reader-lm-1.5b")
}
82
  tokenizers = {
83
  "jinaai/reader-lm-0.5b": AutoTokenizer.from_pretrained(
 
91
@spaces.GPU
def run_example(html_content, model_id="jinaai/reader-lm-1.5b"):
    """
    Generate Markdown from HTML using ReaderLM (0.5b or 1.5b) models.

    Args:
        html_content: Raw HTML string to convert.
        model_id: Key into the module-level ``models``/``tokenizers`` dicts.

    Returns:
        Tuple of (model-generated text, rule-based markdownify output).
    """
    print(f"Running example with model: {model_id}...")  # Debug log
    model = models[model_id]  # Select the model based on the input ID
    tokenizer = tokenizers[model_id]  # Retrieve the corresponding tokenizer

    # Construct the chat-based input for the model
    messages = [{"role": "user", "content": html_content}]
    input_text = tokenizer.apply_chat_template(messages, tokenize=False)  # Format input text for the model
    print(f"Generated input text for model: {input_text[:100]}...")  # Debug log, preview input text

    # Tokenize the input text
    inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    # Generate output using the model
    outputs = model.generate(
        inputs, max_new_tokens=1024, temperature=0, do_sample=False, repetition_penalty=1.08
    )

    # Extract the assistant's response from the generated output
    pattern = r"<\|im_start\|>assistant(.*?)<\|im_end\|>"
    decoded = tokenizer.decode(outputs[0])
    assistant_response = re.findall(pattern, decoded, re.DOTALL)
    print("Model generation completed.")  # Debug log

    # Fall back to the full decoded text when the assistant markers are
    # missing (e.g. generation truncated before <|im_end|>), instead of
    # raising IndexError on an empty findall() result.
    model_text = assistant_response[0] if assistant_response else decoded

    # Use markdownify as a rule-based fallback for comparison
    markdownify_output = markdownify(html_content)
    print("Rule-based markdownify output generated.")  # Debug log

    # Return both model-based and rule-based outputs
    return model_text, markdownify_output
125
 
126
+ # Example HTML for ReaderLM-1
127
  example_html = """<div id="myDIV" class="header">
128
  <h2>My To Do List</h2>
129
  <input type="text" id="myInput" placeholder="Title...">
 
143
  # Combine everything into a single Gradio Blocks app #
144
  ########################################################
145
 
146
# Optional extra CSS for styling the ReaderLM-1 tab.
# NOTE: CSS has no `#` line comments — `/* ... */` is the only comment
# syntax. The previous `# ...` annotations were parsed as part of the
# declarations, silently disabling the `overflow` and `border` rules.
css = """
#output {
    height: 500px;           /* fixed height for the output box */
    overflow: auto;          /* scroll when content overflows */
    border: 1px solid #ccc;  /* light border around the box */
}
"""
154
 
155
# Initialize the Gradio app with the Nymbo/Nymbo_Theme for styling
print("Initializing Gradio app...")  # Debug log
with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:

    ########################################################
    # TAB 1: Markdown Suite (live preview)
    ########################################################
    with gr.Tab("Live Preview"):
        gr.Markdown("# Markdown Suite")  # Add a title for the tab

        with gr.Row():
            with gr.Column():
                # Left column: raw Markdown source entered by the user.
                md_input = gr.Textbox(
                    lines=20,
                    placeholder="Write your markdown here...",
                    label="Markdown Input",  # Input for Markdown text
                )
            with gr.Column():
                # Right column: live-rendered HTML preview.
                md_output = gr.HTML(
                    label="Rendered Output"  # Display the rendered HTML output
                )

        # Re-render on every edit: update the preview whenever the input changes.
        md_input.change(fn=render_markdown, inputs=md_input, outputs=md_output)
179
 
180
    ########################################################
    # TAB 2: ReaderLM-2 Converter (HTML → Markdown/JSON)
    ########################################################
    with gr.Tab("ReaderLM-2 Converter"):
        gr.Markdown("## HTML to Markdown/JSON Converter (ReaderLM-v2)")  # Tab description

        with gr.Row():
            html_input_2 = gr.Textbox(
                lines=10,
                placeholder="Paste your raw HTML here...",
                label="Raw HTML Input"  # Input for raw HTML
            )
            output_format_2 = gr.Radio(
                ["Markdown", "JSON"],  # Choose the output format
                label="Output Format",
                value="Markdown"  # Default to Markdown output
            )
            # Optional system-prompt override; an empty textbox makes
            # convert_html fall back to its default instruction prompt.
            custom_prompt_2 = gr.Textbox(
                lines=2,
                placeholder="Optional: Enter a custom prompt...",
                label="Custom System Prompt"
            )

        convert_btn_2 = gr.Button("Convert")  # Button to trigger conversion
        converted_output_2 = gr.Textbox(
            lines=20,
            label="Converted Output"  # Display the converted output
        )

        # Provide usage details for the converter
        gr.Markdown(
            "Convert raw HTML into formatted Markdown or JSON using **JinaAI ReaderLM-v2**."
        )

        # Connect the button click event to the conversion function
        convert_btn_2.click(
            fn=convert_html,
            inputs=[html_input_2, output_format_2, custom_prompt_2],
            outputs=converted_output_2
        )
220
 
221
+ # Add example inputs for demonstration
222
  gr.Examples(
223
  examples=[
224
+ ["<h1>Hello World</h1><p>This is a <strong>test</strong>.</p>", "Markdown", "Optional custom prompt"],
225
+ ["<ul><li>Item 1</li><li>Item 2</li></ul>", "JSON", "Optional custom prompt"]
226
  ],
227
+ inputs=[html_input_2, output_format_2, custom_prompt_2],
228
  outputs=converted_output_2,
229
  fn=convert_html,
230
+ cache_examples=False # Disable caching for dynamic examples
231
  )
232
 
233
  ########################################################
 
243
        with gr.Row():
            with gr.Column():
                # Choose which ReaderLM-1 checkpoint to run.
                model_selector = gr.Dropdown(
                    choices=list(models.keys()),  # Allow selection between the two models
                    label="Model",
                    value="jinaai/reader-lm-1.5b"  # Default to the larger model
                )
                html_content = gr.Textbox(
                    label="HTML"  # Input for raw HTML
                )
                submit_btn = gr.Button(value="Submit")  # Button to trigger the model

            with gr.Column():
                model_output_text = gr.Textbox(label="Reader LM Output")  # Model-generated Markdown
                markdownify_output = gr.Textbox(label="Markdownify Output")  # Rule-based Markdown

        # Add example HTML input for demonstration
        gr.Examples(
            examples=[
                [example_html],
            ],
            inputs=[html_content],
            outputs=[model_output_text, markdownify_output],
            fn=run_example,
            # NOTE(review): caching runs run_example once at startup, which
            # requires the GPU-backed models to be loaded — confirm intended.
            cache_examples=True,  # Cache example outputs
            label="Try example HTML"
        )

        # Connect the submit button to the run_example function
        submit_btn.click(
            fn=run_example,
            inputs=[html_content, model_selector],
            outputs=[model_output_text, markdownify_output]
        )
277
 
278
# Finally, launch the combined demo app (starts the Gradio server).
print("Launching the demo...")  # Debug log
demo.launch()