Update app.py
app.py
CHANGED
@@ -18,12 +18,10 @@ class WebScrapingTool:
     def __init__(self):
         self.client = None
         self.system_prompt = """You are a specialized web data extraction assistant. Your core purpose is to browse and analyze the content of web pages based on user instructions, and return structured or unstructured information from the provided URL. Your capabilities include:
-
 1. Navigating and reading web page content from a given URL.
 2. Extracting textual content including headings, paragraphs, lists, and metadata.
 3. Identifying and extracting HTML tables and presenting them in a clean, structured format.
 4. Creating new, custom tables based on user queries by processing, reorganizing, or filtering the content found on the source page.
-
 You must always follow these guidelines:
 - Accurately extract and summarize both structured (tables, lists) and unstructured (paragraphs, articles) content.
 - Clearly separate different types of data (e.g., summaries, tables, bullet points).
@@ -37,11 +35,8 @@ You must always follow these guidelines:
 - Include only the relevant columns as per the user request.
 - Sort, filter, and reorganize data accordingly.
 - Use clear and consistent headers.
-
 You must not hallucinate or infer data not present on the page. If content is missing, unclear, or restricted, say so explicitly.
-
 Always respond based on the actual content from the provided link. If the page fails to load or cannot be accessed, inform the user immediately.
-
 Your role is to act as an intelligent browser and data interpreter — able to read and reshape any web content to meet user needs."""
 
     def setup_client(self, api_key):
@@ -59,11 +54,11 @@ Your role is to act as an intelligent browser and data interpreter — able to r
         """Create a robust session with retry strategy and proper headers"""
         session = requests.Session()
 
-        # Define retry strategy
+        # Define retry strategy with fixed parameter name
         retry_strategy = Retry(
             total=3,
             status_forcelist=[429, 500, 502, 503, 504],
-            method_whitelist=["HEAD", "GET", "OPTIONS"],
+            allowed_methods=["HEAD", "GET", "OPTIONS"],  # Fixed: changed from method_whitelist
             backoff_factor=1
         )
 
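The rename above tracks urllib3, which deprecated `method_whitelist` in favor of `allowed_methods` in 1.26 and removed the old name in 2.0. The hunk shows only the `Retry` construction; as a minimal sketch of where such a strategy usually takes effect (the `HTTPAdapter` mount step is assumed here, since this commit does not show it):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def create_session() -> requests.Session:
    """Sketch: a session whose adapters retry transient failures with backoff."""
    session = requests.Session()
    retry_strategy = Retry(
        total=3,                                     # at most 3 retries per request
        status_forcelist=[429, 500, 502, 503, 504],  # retry on these status codes
        allowed_methods=["HEAD", "GET", "OPTIONS"],  # urllib3 >= 1.26 spelling
        backoff_factor=1,                            # exponential backoff between tries
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)   # adapter handles every http(s) request
    session.mount("https://", adapter)
    return session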
@@ -97,6 +92,7 @@ Your role is to act as an intelligent browser and data interpreter — able to r
 
         # Multiple timeout attempts with increasing duration
         timeout_attempts = [15, 30, 45]
+        response = None
 
         for timeout in timeout_attempts:
             try:
@@ -133,9 +129,16 @@ Your role is to act as an intelligent browser and data interpreter — able to r
                         break
                 except:
                     continue
+            except requests.exceptions.RequestException as e:
+                if timeout == timeout_attempts[-1]:  # Last attempt
+                    return {
+                        'success': False,
+                        'error': f"Request failed: {str(e)}"
+                    }
+                continue
 
         # Check if we got a response
-        if
+        if response is None:
             return {
                 'success': False,
                 'error': "Failed to establish connection after multiple attempts"
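Together with the `response = None` sentinel from the previous hunk, this gives the timeout loop a defined failure path: the final attempt's error is surfaced instead of being swallowed by the inner bare `except:`. A minimal standalone sketch of the pattern these two hunks build up (the fetch call itself is assumed, since the diff omits it):

import requests

def fetch_with_escalating_timeouts(session: requests.Session, url: str) -> dict:
    """Sketch: retry a GET with progressively longer timeouts."""
    timeout_attempts = [15, 30, 45]  # seconds, as in the diff
    response = None                  # sentinel: set only by a successful attempt

    for timeout in timeout_attempts:
        try:
            response = session.get(url, timeout=timeout)
            response.raise_for_status()
            break                                    # usable response; stop escalating
        except requests.exceptions.RequestException as e:
            if timeout == timeout_attempts[-1]:      # last attempt: surface the error
                return {'success': False, 'error': f"Request failed: {str(e)}"}
            response = None                          # discard the failed attempt
            continue

    if response is None:  # defensive: loop ended without a good response
        return {'success': False,
                'error': "Failed to establish connection after multiple attempts"}
    return {'success': True, 'response': response}

Callers then branch on `result['success']`, matching the error-dict convention the diff already uses.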
@@ -271,15 +274,12 @@ Your role is to act as an intelligent browser and data interpreter — able to r
         content_text = f"""
 WEBPAGE ANALYSIS REQUEST
 ========================
-
 URL: {scraped_data['url']}
 Title: {scraped_data['title']}
 Content Length: {scraped_data['content_length']} characters
 Tables Found: {len(scraped_data['tables'])}
-
 META DESCRIPTION:
 {scraped_data['meta_description']}
-
 MAIN CONTENT:
 {scraped_data['text']}
 """
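This hunk only compacts the analysis prompt by dropping blank separator lines. As an aside, a hypothetical helper (not part of this commit) could build the same block with `dict.get` defaults, so a partially scraped page cannot raise `KeyError`:

def build_analysis_prompt(scraped_data: dict) -> str:
    """Sketch: assemble the analysis prompt with defensive key access."""
    tables = scraped_data.get('tables', [])
    return f"""
WEBPAGE ANALYSIS REQUEST
========================
URL: {scraped_data.get('url', 'unknown')}
Title: {scraped_data.get('title', 'untitled')}
Content Length: {scraped_data.get('content_length', 0)} characters
Tables Found: {len(tables)}
META DESCRIPTION:
{scraped_data.get('meta_description', '')}
MAIN CONTENT:
{scraped_data.get('text', '')}
"""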
@@ -426,7 +426,7 @@ def create_interface():
     - E-commerce product pages
     - Financial data sites (Yahoo Finance, MarketWatch)
     - Research papers and academic sites
-
+
     ## 🧪 **Test Scenarios**
 
     ### **1. News & Media Sites**
@@ -577,7 +577,6 @@ def create_interface():
     - HttpBin (perfect for testing basic functionality)
 
     Start with the simpler tests and gradually move to more complex scenarios to fully evaluate your tool's capabilities!
-
     """)
 
     # Event handlers