import os

import gradio as gr
import requests

BASE_URL = "https://api.jigsawstack.com/v1"
# The env var name below is an assumption; set it to match your deployment.
headers = {"x-api-key": os.getenv("JIGSAWSTACK_API_KEY")}
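# Response fields consumed below from POST {BASE_URL}/ai/scrape, inferred
# from how this handler reads the result rather than from a formal schema
# (see the JigsawStack docs for the authoritative shape):
#   success: bool, context: dict, selectors: dict, data: list, link: list,
#   page_position: int, page_position_length: int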

# ----------------- JigsawStack API Wrappers ------------------

def enhanced_ai_scrape(input_method, url, html, prompts_str, selector, page_pos):
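    # The click handler wires six outputs (status plus five result
    # components), so every return path yields a 6-tuple; on error, all
    # result components are hidden.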
    def error_response(message):
        return (
            message,
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False),
        )
    
    try:
        # Validate element prompts
        prompts = [p.strip() for p in prompts_str.split(",") if p.strip()]
        if not prompts:
            return error_response("Error: No element prompts provided.")
        
        if len(prompts) > 5:
            return error_response("Error: Maximum 5 element prompts allowed.")

        # gr.Number passes its value as a float (e.g. 1.0), so coerce through
        # float() before int() and clamp to a minimum of 1.
        try:
            page_position_value = max(1, int(float(page_pos)))
        except (TypeError, ValueError):
            page_position_value = 1

        payload = {
            "element_prompts": prompts,
            "root_element_selector": selector or "main",
            "page_position": page_position_value,
        }

        # Add URL or HTML based on input method
        if input_method == "URL":
            if not url or not url.strip():
                return error_response("Error: URL is required when using URL input method.")
            payload["url"] = url.strip()
        elif input_method == "HTML Content":
            if not html or not html.strip():
                return error_response("Error: HTML content is required when using HTML input method.")
            payload["html"] = html.strip()

        response = requests.post(f"{BASE_URL}/ai/scrape", headers=headers, json=payload, timeout=60)
        response.raise_for_status()

        result = response.json()

        if not result.get("success"):
            return error_response(f"Error: Scraping failed - {result.get('message', 'Unknown error')}")

        # Extract all the data
        context = result.get("context", {})
        selectors = result.get("selectors", {})
        data = result.get("data", [])
        links = result.get("link", [])
        current_page = result.get("page_position", 1)
        total_pages = result.get("page_position_length", 1)
        
        # Format pagination info
        pagination_text = f"Page {current_page} of {total_pages}"
        
        status_text = f"✅ Successfully scraped {len(data)} data items"
        if context:
            status_text += f" with {len(context)} context elements"
        
        return (
            status_text,
            gr.update(value=context, visible=bool(context)),
            gr.update(value=selectors, visible=bool(selectors)),
            gr.update(value=data, visible=bool(data)),
            gr.update(value=links, visible=bool(links)),
            gr.update(value=pagination_text, visible=True),
        )

    except requests.exceptions.RequestException as req_err:
        return error_response(f"Request failed: {str(req_err)}")
    except Exception as e:
        return error_response(f"Unexpected error: {str(e)}")

# ----------------- Gradio UI ------------------

with gr.Blocks() as demo:
    gr.Markdown("""
    <div style='text-align: center; margin-bottom: 24px;'>
        <h1 style='font-size:2.2em; margin-bottom: 0.2em;'>🧩 JigsawStack AI Scraper</h1>
        <p style='font-size:1.2em; margin-top: 0;'>Extract structured data from web pages with advanced AI models.</p>
        <p style='font-size:1em; margin-top: 0.5em;'>For more details and API usage, see the <a href='https://jigsawstack.com/docs/api-reference/ai/scrape' target='_blank'>documentation</a>.</p>
    </div>
    """)

    with gr.Row():
        with gr.Column():
            gr.Markdown("#### Input Method")
            input_method_scraper = gr.Radio(
                choices=["URL", "HTML Content"], 
                label="Choose Input Method", 
                value="URL"
            )
            
            # Conditional inputs based on selection
            url_scraper = gr.Textbox(
                label="Page URL", 
                placeholder="https://example.com/pricing",
                info="URL of the page to scrape"
            )
            html_content = gr.Textbox(
                label="HTML Content", 
                lines=8, 
                placeholder="<html>...</html>",
                visible=False,
                info="Raw HTML content to scrape"
            )
            
            gr.Markdown("#### Scraping Configuration")
            element_prompts = gr.Textbox(
                label="Element Prompts (comma-separated)", 
                lines=3,
                placeholder="Plan title, Plan price, Features, Button text",
                info="Items to scrape (max 5). E.g., 'Plan price', 'Plan title'"
            )
            
            root_selector = gr.Textbox(
                label="Root Element Selector", 
                value="main",
                placeholder="main, .container, #content",
                info="CSS selector to limit scraping scope (default: main)"
            )
            
            page_position = gr.Number(
                label="Page Position", 
                value=1,
                minimum=1,
                info="For pagination, current page number (min: 1)"
            )

        with gr.Column():
            gr.Markdown("#### Results")
            scrape_status = gr.Textbox(
                label="Status", 
                interactive=False,
                placeholder="Ready to scrape..."
            )
            
            gr.Markdown("#### Extracted Data")
            context_output = gr.JSON(
                label="Context Data",
                visible=False
            )
            selectors_output = gr.JSON(
                label="CSS Selectors Used",
                visible=False
            )
            detailed_data = gr.JSON(
                label="Detailed Scrape Data",
                visible=False
            )
            links_data = gr.JSON(
                label="Detected Links",
                visible=False
            )
            
            gr.Markdown("#### Pagination Info")
            pagination_info = gr.Textbox(
                label="Page Information",
                interactive=False,
                visible=False
            )

    scrape_btn = gr.Button("Scrape with AI", variant="primary")
    
    # Show the URL box or the HTML box depending on the selected input method.
    def update_scraper_input_visibility(method):
        show_url = method != "HTML Content"
        # gr.update keeps this consistent with the result updates above.
        return gr.update(visible=show_url), gr.update(visible=not show_url)
    
    input_method_scraper.change(
        update_scraper_input_visibility,
        inputs=input_method_scraper,
        outputs=[url_scraper, html_content]
    )
    
    scrape_btn.click(
        enhanced_ai_scrape,
        inputs=[input_method_scraper, url_scraper, html_content, element_prompts, root_selector, page_position],
        outputs=[scrape_status, context_output, selectors_output, detailed_data, links_data, pagination_info]
    )
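    # The outputs list above must stay in the same order as the 6-tuple
    # returned by enhanced_ai_scrape.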

demo.launch()