restarting and adding readerlm-1, and markdownify inputs
app.py
CHANGED
@@ -5,102 +5,249 @@ from markdown.extensions.fenced_code import FencedCodeExtension
Before (old app.py, lines 5-106; removed lines are marked "-", and collapsed removed lines show no content):

  5      from markdown.extensions.toc import TocExtension
  6      from markdown.extensions.attr_list import AttrListExtension
  7      from markdown.extensions.codehilite import CodeHiliteExtension
  8      from transformers import pipeline
  9
 10  -   #
 11      def render_markdown(md_text):
 12  -
 13  -
 14  -
 15          md_text,
 16          extensions=[
 17  -           TableExtension(),
 18  -           FencedCodeExtension(),
 19  -           TocExtension(baselevel=2),
 20  -           AttrListExtension(),
 21  -           CodeHiliteExtension(linenums=False, css_class="highlight"),
 22          ],
 23      )
 24  -       print("[DEBUG] Rendered HTML output:", rendered_html)  # Debug log for output
 25  -       return rendered_html
 26
 27      # Load the JinaAI ReaderLM-v2 model
 28      model_name = "jinaai/ReaderLM-v2"
 29  -
 30  -   html_converter = pipeline("text-generation", model=model_name)  # Initialize the text-generation pipeline with the specified model
 31
 32  -   # Function to convert HTML to Markdown or JSON
 33      def convert_html(html_input, output_format):
 34  -
 35  -
 36          prompt = f"Convert the following HTML into {output_format}:\n\n{html_input}"
 37  -
 38  -
 39  -       # Use the model to generate the conversion output
 40  -       response = html_converter(prompt, max_length=99999, num_return_sequences=1)
 41  -       print("[DEBUG] Model response:", response)  # Debug log for model response
 42  -       converted_output = response[0]['generated_text']  # Extract the generated text from the model response
 43
 44  -       # Remove the prompt
 45          converted_output = converted_output.replace(prompt, "").strip()
 46  -       print("[DEBUG] Converted output:", converted_output)  # Debug log for the final output
 47          return converted_output
 48
 49  -
 50  -
 51
 52  -   #
 53      with gr.Tab("Live Preview"):
 54  -       gr.Markdown("# Markdown Suite")
 55
 56          with gr.Row():
 57              with gr.Column():
 58  -               # Input textbox for entering Markdown text
 59                  md_input = gr.Textbox(
 60  -                   lines=20,
 61  -                   placeholder="Write your markdown here...",
 62                      label="Markdown Input",
 63  -                   elem_classes=["gr-textbox"]
 64                  )
 65              with gr.Column():
 66  -
 67  -
 68
 69  -
 70  -       md_input.change(render_markdown, inputs=md_input, outputs=md_output)
 71
 72  -
 73  -
 74  -
 75
 76          with gr.Row():
 77  -
 78  -
 79  -
 80  -               placeholder="Paste your raw HTML here...",
 81                  label="Raw HTML Input"
 82              )
 83
 84  -
 85  -
 86  -
 87  -               "JSON"
 88  -           ], label="Output Format", value="Markdown")
 89  -
 90  -           # Output textbox to display the converted Markdown or JSON
 91  -           converted_output = gr.Textbox(
 92  -               lines=10,
 93                  label="Converted Output"
 94              )
 95
 96  -       #
 97  -
 98  -
 99  -
100  -
101  -
102      )
103
104  -   #
105  -   print("[DEBUG] Launching the app")  # Debug log for app launch
106      demo.launch()
After (new app.py, lines 5-253; added lines are marked "+"):

  5      from markdown.extensions.toc import TocExtension
  6      from markdown.extensions.attr_list import AttrListExtension
  7      from markdown.extensions.codehilite import CodeHiliteExtension
  8  +
  9  +   # For ReaderLM-2
 10      from transformers import pipeline
 11
 12  +   # For ReaderLM-1
 13  +   from transformers import AutoTokenizer, AutoModelForCausalLM
 14  +   import spaces
 15  +   import re
 16  +   from markdownify import markdownify
 17  +
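Beyond the packages the Markdown suite already needed (gradio, markdown, and Pygments for CodeHilite), the new imports imply that the Space now also depends on transformers, torch, markdownify, and the Hugging Face spaces package. The commit does not show requirements.txt, so treat that list as an inference from the imports rather than the repository's pinned dependencies.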
 18  +   ######################################
 19  +   # 1) MARKDOWN-STUDIO FUNCTIONALITY #
 20  +   ######################################
 21      def render_markdown(md_text):
 22  +       """
 23  +       Render a string of Markdown text into HTML with a number of useful extensions.
 24  +       """
 25  +       return markdown.markdown(
 26              md_text,
 27              extensions=[
 28  +               TableExtension(),
 29  +               FencedCodeExtension(),
 30  +               TocExtension(baselevel=2),
 31  +               AttrListExtension(),
 32  +               CodeHiliteExtension(linenums=False, css_class="highlight"),
 33              ],
 34          )
 35
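As a quick check of what this extension stack produces, here is a small usage sketch (not part of the committed file) that renders a heading and a pipe table through render_markdown:

    # Usage sketch only; assumes app.py's imports and render_markdown() are in scope,
    # and that Pygments is installed for CodeHiliteExtension.
    sample = "# Demo\n\n| Name | Value |\n|------|-------|\n| pi   | 3.14  |\n"
    html = render_markdown(sample)
    # TocExtension(baselevel=2) demotes the heading to <h2>, and TableExtension
    # turns the pipe table into a <table> element.
    print(html)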
 36  +   ######################################
 37  +   # 2) READERLM-2 FUNCTIONALITY #
 38  +   ######################################
 39      # Load the JinaAI ReaderLM-v2 model
 40      model_name = "jinaai/ReaderLM-v2"
 41  +   html_converter = pipeline("text-generation", model=model_name)
 42
 43      def convert_html(html_input, output_format):
 44  +       """
 45  +       Use the JinaAI ReaderLM-v2 pipeline to convert HTML into Markdown or JSON.
 46  +       """
 47          prompt = f"Convert the following HTML into {output_format}:\n\n{html_input}"
 48  +       response = html_converter(prompt, max_length=500, num_return_sequences=1)
 49  +       converted_output = response[0]['generated_text']
 50
 51  +       # Remove the prompt from the start of the generated text, if present
 52          converted_output = converted_output.replace(prompt, "").strip()
 53          return converted_output
 54
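One behavioural note on the change from max_length=99999 to max_length=500: in a transformers text-generation pipeline, max_length counts the prompt tokens as well as the generated ones, so a long HTML input can leave little or no room for output (max_new_tokens bounds only the generated portion). A short usage sketch, assuming the pipeline above has loaded:

    # Usage sketch only; assumes html_converter and convert_html() from app.py.
    snippet = "<h1>Hello World</h1><p>This is a <strong>test</strong>.</p>"
    print(convert_html(snippet, "Markdown"))
    # For large pages, a max_new_tokens cap would be safer than max_length=500.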
 55  +   ######################################
 56  +   # 3) READERLM-1 FUNCTIONALITY #
 57  +   ######################################
 58  +   # Prepare models and tokenizers
 59  +   models = {
 60  +       "jinaai/reader-lm-0.5b": AutoModelForCausalLM.from_pretrained(
 61  +           "jinaai/reader-lm-0.5b", trust_remote_code=True
 62  +       ).eval().to("cuda"),
 63  +       "jinaai/reader-lm-1.5b": AutoModelForCausalLM.from_pretrained(
 64  +           "jinaai/reader-lm-1.5b", trust_remote_code=True
 65  +       ).eval().to("cuda"),
 66  +   }
 67  +   tokenizers = {
 68  +       "jinaai/reader-lm-0.5b": AutoTokenizer.from_pretrained(
 69  +           "jinaai/reader-lm-0.5b", trust_remote_code=True
 70  +       ),
 71  +       "jinaai/reader-lm-1.5b": AutoTokenizer.from_pretrained(
 72  +           "jinaai/reader-lm-1.5b", trust_remote_code=True
 73  +       ),
 74  +   }
 75  +
 76  +   @spaces.GPU
 77  +   def run_example(html_content, model_id="jinaai/reader-lm-1.5b"):
 78  +       """
 79  +       Use ReaderLM (0.5b or 1.5b) to generate model-based HTML-to-Markdown text,
 80  +       then also provide a rule-based 'markdownify' output.
 81  +       """
 82  +       model = models[model_id]
 83  +       tokenizer = tokenizers[model_id]
 84  +
 85  +       # Construct the chat-based input
 86  +       messages = [{"role": "user", "content": html_content}]
 87  +       input_text = tokenizer.apply_chat_template(messages, tokenize=False)
 88  +
 89  +       # Tokenize
 90  +       inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")
 91  +
 92  +       # Generate
 93  +       outputs = model.generate(
 94  +           inputs, max_new_tokens=1024, temperature=0, do_sample=False, repetition_penalty=1.08
 95  +       )
 96  +
 97  +       # Extract the model's text from the response
 98  +       pattern = r"<\|im_start\|>assistant(.*?)<\|im_end\|>"
 99  +       assistant_response = re.findall(pattern, tokenizer.decode(outputs[0]), re.DOTALL)
100
101  +       # Also do a rule-based markdownify for comparison
102  +       markdownify_output = markdownify(html_content)
103  +
104  +       # Return the two results (model-based, rule-based)
105  +       return assistant_response[0], markdownify_output
106  +
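The assistant text is recovered from the decoded ChatML-style transcript with the regex on line 98. A self-contained sketch of just that extraction step, run against a made-up decoded string rather than real model output:

    import re

    # Hypothetical decoded transcript in the ChatML-style format ReaderLM emits.
    decoded = (
        "<|im_start|>user\n<h2>My To Do List</h2><|im_end|>\n"
        "<|im_start|>assistant\n## My To Do List\n<|im_end|>"
    )
    pattern = r"<\|im_start\|>assistant(.*?)<\|im_end\|>"
    matches = re.findall(pattern, decoded, re.DOTALL)
    print(matches[0].strip())  # -> ## My To Do List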
107  +   # Example HTML from ReaderLM-1
108  +   example_html = """<div id="myDIV" class="header">
109  +   <h2>My To Do List</h2>
110  +   <input type="text" id="myInput" placeholder="Title...">
111  +   <span onclick="newElement()" class="addBtn">Add</span>
112  +   </div>
113  +
114  +   <ul id="myUL">
115  +   <li>Hit the gym</li>
116  +   <li class="checked">Pay bills</li>
117  +   <li>Meet George</li>
118  +   <li>Buy eggs</li>
119  +   <li>Read a book</li>
120  +   <li>Organize office</li>
121  +   </ul>"""
122  +
123  +   ########################################################
124  +   # Combine everything into a single Gradio Blocks app #
125  +   ########################################################
126  +
127  +   # Optional extra CSS for the ReaderLM-1 tab
128  +   css = """
129  +   #output {
130  +   height: 500px;
131  +   overflow: auto;
132  +   border: 1px solid #ccc;
133  +   }
134  +   """
135  +
136  +   # We use the Nymbo/Nymbo_Theme from the original Markdown-Studio example
+
with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
|
138 |
+
|
139 |
+
########################################################
|
140 |
+
# TAB 1: Markdown Suite (live preview)
|
141 |
+
########################################################
|
142 |
with gr.Tab("Live Preview"):
|
143 |
+
gr.Markdown("# Markdown Suite")
|
144 |
|
145 |
with gr.Row():
|
146 |
with gr.Column():
|
|
|
147 |
md_input = gr.Textbox(
|
148 |
+
lines=20,
|
149 |
+
placeholder="Write your markdown here...",
|
150 |
label="Markdown Input",
|
|
|
151 |
)
|
152 |
with gr.Column():
|
153 |
+
md_output = gr.HTML(
|
154 |
+
label="Rendered Output"
|
155 |
+
)
|
156 |
|
157 |
+
md_input.change(fn=render_markdown, inputs=md_input, outputs=md_output)
|
|
|
158 |
|
159 |
+
########################################################
|
160 |
+
# TAB 2: ReaderLM-2 Converter (HTML → Markdown/JSON)
|
161 |
+
########################################################
|
162 |
+
with gr.Tab("ReaderLM-2 Converter"):
|
163 |
+
gr.Markdown("## HTML to Markdown/JSON Converter (ReaderLM-v2)")
|
164 |
|
165 |
with gr.Row():
|
166 |
+
html_input_2 = gr.Textbox(
|
167 |
+
lines=10,
|
168 |
+
placeholder="Paste your raw HTML here...",
|
|
|
169 |
label="Raw HTML Input"
|
170 |
)
|
171 |
+
output_format_2 = gr.Radio(
|
172 |
+
["Markdown", "JSON"],
|
173 |
+
label="Output Format",
|
174 |
+
value="Markdown"
|
175 |
+
)
|
176 |
|
177 |
+
convert_btn_2 = gr.Button("Convert")
|
178 |
+
converted_output_2 = gr.Textbox(
|
179 |
+
lines=10,
|
|
|
|
|
|
|
|
|
|
|
|
|
180 |
label="Converted Output"
|
181 |
)
|
182 |
|
183 |
+
# Provide usage details
|
184 |
+
gr.Markdown(
|
185 |
+
"Convert raw HTML into formatted Markdown or JSON using **JinaAI ReaderLM-v2**."
|
186 |
+
)
|
187 |
+
|
188 |
+
# Button event: calls convert_html
|
189 |
+
convert_btn_2.click(
|
190 |
+
fn=convert_html,
|
191 |
+
inputs=[html_input_2, output_format_2],
|
192 |
+
outputs=converted_output_2
|
193 |
+
)
|
194 |
+
|
195 |
+
# Examples
|
196 |
+
gr.Examples(
|
197 |
+
examples=[
|
198 |
+
["<h1>Hello World</h1><p>This is a <strong>test</strong>.</p>", "Markdown"],
|
199 |
+
["<ul><li>Item 1</li><li>Item 2</li></ul>", "JSON"]
|
200 |
+
],
|
201 |
+
inputs=[html_input_2, output_format_2],
|
202 |
+
outputs=converted_output_2,
|
203 |
+
fn=convert_html,
|
204 |
+
cache_examples=False
|
205 |
+
)
|
206 |
+
|
207 |
+
########################################################
|
208 |
+
# TAB 3: ReaderLM-1 HTML-to-Markdown
|
209 |
+
########################################################
|
210 |
+
with gr.Tab("ReaderLM-1 Converter"):
|
211 |
+
gr.Markdown("""
|
212 |
+
# HTML-to-Markdown with ReaderLM-1
|
213 |
+
Use either **jinaai/reader-lm-0.5b** or **jinaai/reader-lm-1.5b**
|
214 |
+
to convert HTML to Markdown. Compare against rule-based `markdownify`.
|
215 |
+
""")
|
216 |
+
|
217 |
+
with gr.Row():
|
218 |
+
with gr.Column():
|
219 |
+
model_selector = gr.Dropdown(
|
220 |
+
choices=list(models.keys()),
|
221 |
+
label="Model",
|
222 |
+
value="jinaai/reader-lm-1.5b"
|
223 |
+
)
|
224 |
+
html_content = gr.Textbox(
|
225 |
+
label="HTML"
|
226 |
+
)
|
227 |
+
submit_btn = gr.Button(value="Submit")
|
228 |
+
|
229 |
+
with gr.Column():
|
230 |
+
model_output_text = gr.Textbox(label="Reader LM Output")
|
231 |
+
markdownify_output = gr.Textbox(label="Markdownify Output")
|
232 |
+
|
233 |
+
# Example usage
|
234 |
+
gr.Examples(
|
235 |
+
examples=[
|
236 |
+
[example_html],
|
237 |
+
],
|
238 |
+
inputs=[html_content],
|
239 |
+
outputs=[model_output_text, markdownify_output],
|
240 |
+
fn=run_example,
|
241 |
+
cache_examples=True,
|
242 |
+
label="Try example HTML"
|
243 |
+
)
|
244 |
+
|
245 |
+
# Button event for custom input
|
246 |
+
submit_btn.click(
|
247 |
+
fn=run_example,
|
248 |
+
inputs=[html_content, model_selector],
|
249 |
+
outputs=[model_output_text, markdownify_output]
|
250 |
)
|
251 |
|
252 |
+
# Finally, launch the combined demo
|
|
|
253 |
demo.launch()
|
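The ReaderLM-1 models are moved to "cuda" unconditionally, which matches the @spaces.GPU setup on the Space but will fail on a machine without a GPU. A device-selection variant for local runs might look like the sketch below; this is an assumption-labeled alternative, not part of the committed file:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Sketch of a CPU fallback for local runs; app.py as committed always uses "cuda".
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = AutoModelForCausalLM.from_pretrained(
        "jinaai/reader-lm-1.5b", trust_remote_code=True
    ).eval().to(device)
    tokenizer = AutoTokenizer.from_pretrained("jinaai/reader-lm-1.5b", trust_remote_code=True)
    # Inside run_example(), inputs would then need .to(device) instead of .to("cuda").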