File size: 3,510 Bytes
6d13e5a
 
 
 
 
 
 
 
 
99ab207
6d13e5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Install Dependencies\n",
        "# Each requirement is quoted: an unquoted '>' is shell output redirection,\n",
        "# which would drop the version constraints and create stray '=3.0'-style files.\n",
        "!pip install \"gradio>=3.0\" \"transformers>=4.25.0\" \"torch>=1.12.0\""
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Import Libraries\n",
        "import gradio as gr\n",
        "from transformers import pipeline"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Load the JinaAI ReaderLM-v2 Model\n",
        "model_name = \"jinaai/ReaderLM-v2\"\n",
        "html_converter = pipeline(\"text-generation\", model=model_name)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Function to Convert HTML to Markdown or JSON\n",
        "def convert_html(html_input, output_format):\n",
        "    \"\"\"Convert raw HTML into the requested format using the loaded model.\n",
        "\n",
        "    Args:\n",
        "        html_input: Raw HTML text to convert.\n",
        "        output_format: Target format name interpolated into the prompt\n",
        "            (the UI offers \"Markdown\" and \"JSON\").\n",
        "\n",
        "    Returns:\n",
        "        The generated text with surrounding whitespace stripped, or an\n",
        "        empty string when no HTML was supplied.\n",
        "    \"\"\"\n",
        "    # Nothing to convert: skip the model call entirely.\n",
        "    if not html_input or not html_input.strip():\n",
        "        return \"\"\n",
        "\n",
        "    # Prepare the prompt for the model\n",
        "    prompt = f\"Convert the following HTML into {output_format}:\\n\\n{html_input}\"\n",
        "\n",
        "    # max_new_tokens bounds only the generated text; max_length would also\n",
        "    # count the prompt, leaving no room for output on longer HTML inputs.\n",
        "    # return_full_text=False makes the pipeline return just the completion,\n",
        "    # so the prompt does not have to be removed by string replacement.\n",
        "    response = html_converter(\n",
        "        prompt,\n",
        "        max_new_tokens=500,\n",
        "        num_return_sequences=1,\n",
        "        return_full_text=False,\n",
        "    )\n",
        "    return response[0]['generated_text'].strip()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Create the Gradio Interface\n",
        "iface = gr.Interface(\n",
        "    fn=convert_html,  # Function to call\n",
        "    inputs=[\n",
        "        gr.Textbox(lines=10, placeholder=\"Paste your raw HTML here...\", label=\"Raw HTML Input\"),\n",
        "        gr.Radio([\"Markdown\", \"JSON\"], label=\"Output Format\", value=\"Markdown\")\n",
        "    ],\n",
        "    outputs=gr.Textbox(lines=10, label=\"Converted Output\"),\n",
        "    title=\"HTML to Markdown/JSON Converter\",\n",
        "    description=\"Convert raw HTML into beautifully formatted Markdown or JSON using JinaAI ReaderLM-v2.\",\n",
        "    theme=\"NoCrypt/miku\",  # Apply the NoCrypt/miku theme\n",
        "    examples=[\n",
        "        [\"<h1>Hello World</h1><p>This is a <strong>test</strong>.</p>\", \"Markdown\"],\n",
        "        [\"<ul><li>Item 1</li><li>Item 2</li></ul>\", \"JSON\"]\n",
        "    ]\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Launch the Interface\n",
        "iface.launch(inline=True)  # Embed the interface in the notebook"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.9.0"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}