vineet124jig committed on
Commit
97a0620
·
verified ·
1 Parent(s): 77fdc1b

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +198 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import requests
from PIL import Image
import io
import os

# Base endpoint for all JigsawStack v1 API calls.
BASE_URL = "https://api.jigsawstack.com/v1"

# NOTE(review): the original used os.getenv("") — an empty variable name —
# which always returns None, so every request was sent with a blank x-api-key.
# "JIGSAWSTACK_API_KEY" is the conventional secret name for this service;
# confirm it matches the Space's configured secret.
headers = {"x-api-key": os.getenv("JIGSAWSTACK_API_KEY")}

# ----------------- JigsawStack API Wrappers ------------------
11
def enhanced_ai_scrape(input_method, url, html, prompts_str, selector, page_pos):
    """Call the JigsawStack AI scrape endpoint and shape the result for the UI.

    Parameters
    ----------
    input_method : str
        "URL" or "HTML Content" — selects which source field goes in the payload.
    url : str | None
        Page URL to scrape (required when input_method == "URL").
    html : str | None
        Raw HTML to scrape (required when input_method == "HTML Content").
    prompts_str : str | None
        Comma-separated element prompts; at most 5 non-empty items are allowed.
    selector : str | None
        Root CSS selector limiting the scrape scope; defaults to "main".
    page_pos : int | float | str
        Pagination page number; coerced to an int >= 1 (gr.Number yields a float).

    Returns
    -------
    tuple
        (status_text, context_update, selectors_update, data_update,
         links_update, pagination_update) matching the six Gradio outputs.
    """

    def error_response(message):
        # Show the message in the status box and hide every result panel.
        return (
            message,
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False),
        )

    try:
        # Validate element prompts; guard against None so .split never crashes.
        prompts = [p.strip() for p in (prompts_str or "").split(",") if p.strip()]
        if not prompts:
            return error_response("Error: No element prompts provided.")
        if len(prompts) > 5:
            return error_response("Error: Maximum 5 element prompts allowed.")

        # BUG FIX: gr.Number delivers a float (e.g. 2.0) and "2.0".isdigit()
        # is False, so the original always fell back to page 1. Coerce via
        # float -> int and clamp to the API's minimum of 1.
        try:
            page_position = max(1, int(float(page_pos)))
        except (TypeError, ValueError):
            page_position = 1

        payload = {
            "element_prompts": prompts,
            "root_element_selector": selector or "main",
            "page_position": page_position,
        }

        # Add URL or HTML based on the chosen input method.
        if input_method == "URL":
            if not url or not url.strip():
                return error_response("Error: URL is required when using URL input method.")
            payload["url"] = url.strip()
        elif input_method == "HTML Content":
            if not html or not html.strip():
                return error_response("Error: HTML content is required when using HTML input method.")
            payload["html"] = html.strip()

        # A timeout keeps the UI from hanging forever on a stalled request;
        # the failure is reported through the RequestException handler below.
        response = requests.post(
            f"{BASE_URL}/ai/scrape", headers=headers, json=payload, timeout=120
        )
        response.raise_for_status()

        result = response.json()

        if not result.get("success"):
            return error_response(f"Error: Scraping failed - {result.get('message', 'Unknown error')}")

        # Extract each section the API may return, with safe defaults.
        context = result.get("context", {})
        selectors = result.get("selectors", {})
        data = result.get("data", [])
        links = result.get("link", [])
        current_page = result.get("page_position", 1)
        total_pages = result.get("page_position_length", 1)

        # Format pagination info.
        pagination_text = f"Page {current_page} of {total_pages}"
        if total_pages > 1:
            pagination_text += f" (Total pages available: {total_pages})"

        status_text = f"✅ Successfully scraped {len(data)} data items"
        if context:
            status_text += f" with {len(context)} context elements"

        # Panels are shown only when the corresponding section is non-empty.
        return (
            status_text,
            gr.update(value=context, visible=bool(context)),
            gr.update(value=selectors, visible=bool(selectors)),
            gr.update(value=data, visible=bool(data)),
            gr.update(value=links, visible=bool(links)),
            gr.update(value=pagination_text, visible=True),
        )

    except requests.exceptions.RequestException as req_err:
        return error_response(f"Request failed: {str(req_err)}")
    except Exception as e:
        return error_response(f"Unexpected error: {str(e)}")
85
+
86
# ----------------- Gradio UI ------------------

with gr.Blocks() as demo:
    # Page header.
    gr.Markdown("""
    <div style='text-align: center; margin-bottom: 24px;'>
        <h1 style='font-size:2.2em; margin-bottom: 0.2em;'>🧩 JigsawStack AI Scraper</h1>
        <p style='font-size:1.2em; margin-top: 0;'>Extract structured data from web pages with advanced AI models.</p>
        <p style='font-size:1em; margin-top: 0.5em;'>For more details and API usage, see the <a href='https://jigsawstack.com/docs/api-reference/ai/scrape' target='_blank'>documentation</a>.</p>
    </div>
    """)

    with gr.Row():
        # Left column: how to feed the scraper and what to extract.
        with gr.Column():
            gr.Markdown("#### Input Method")
            input_method_scraper = gr.Radio(
                choices=["URL", "HTML Content"],
                label="Choose Input Method",
                value="URL",
            )

            # Only one of these two source fields is visible at a time.
            url_scraper = gr.Textbox(
                label="Page URL",
                placeholder="https://example.com/pricing",
                info="URL of the page to scrape",
            )
            html_content = gr.Textbox(
                label="HTML Content",
                lines=8,
                placeholder="<html>...</html>",
                visible=False,
                info="Raw HTML content to scrape",
            )

            gr.Markdown("#### Scraping Configuration")
            element_prompts = gr.Textbox(
                label="Element Prompts (comma-separated)",
                lines=3,
                placeholder="Plan title, Plan price, Features, Button text",
                info="Items to scrape (max 5). E.g., 'Plan price', 'Plan title'",
            )
            root_selector = gr.Textbox(
                label="Root Element Selector",
                value="main",
                placeholder="main, .container, #content",
                info="CSS selector to limit scraping scope (default: main)",
            )
            page_position = gr.Number(
                label="Page Position",
                value=1,
                minimum=1,
                info="For pagination, current page number (min: 1)",
            )

        # Right column: status plus the result panels (hidden until populated).
        with gr.Column():
            gr.Markdown("#### Results")
            scrape_status = gr.Textbox(
                label="Status",
                interactive=False,
                placeholder="Ready to scrape...",
            )

            gr.Markdown("#### Extracted Data")
            context_output = gr.JSON(label="Context Data", visible=False)
            selectors_output = gr.JSON(label="CSS Selectors Used", visible=False)
            detailed_data = gr.JSON(label="Detailed Scrape Data", visible=False)
            links_data = gr.JSON(label="Detected Links", visible=False)

            gr.Markdown("#### Pagination Info")
            pagination_info = gr.Textbox(
                label="Page Information",
                interactive=False,
                visible=False,
            )

    scrape_btn = gr.Button("Scrape with AI", variant="primary")

    def _toggle_source_inputs(method):
        """Show the source field matching the selected input method, hide the other."""
        show_html = method == "HTML Content"
        return (
            gr.Textbox(visible=not show_html),
            gr.Textbox(visible=show_html),
        )

    input_method_scraper.change(
        _toggle_source_inputs,
        inputs=input_method_scraper,
        outputs=[url_scraper, html_content],
    )

    scrape_btn.click(
        enhanced_ai_scrape,
        inputs=[input_method_scraper, url_scraper, html_content, element_prompts, root_selector, page_position],
        outputs=[scrape_status, context_output, selectors_output, detailed_data, links_data, pagination_info],
    )

demo.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio
2
+ requests
3
+ Pillow