shukdevdatta123 committed
Commit f7478ee · verified · 1 Parent(s): 982639c

Update app.py

Files changed (1):
  1. app.py +12 -13
app.py CHANGED
@@ -18,12 +18,10 @@ class WebScrapingTool:
     def __init__(self):
         self.client = None
         self.system_prompt = """You are a specialized web data extraction assistant. Your core purpose is to browse and analyze the content of web pages based on user instructions, and return structured or unstructured information from the provided URL. Your capabilities include:
-
 1. Navigating and reading web page content from a given URL.
 2. Extracting textual content including headings, paragraphs, lists, and metadata.
 3. Identifying and extracting HTML tables and presenting them in a clean, structured format.
 4. Creating new, custom tables based on user queries by processing, reorganizing, or filtering the content found on the source page.
-
 You must always follow these guidelines:
 - Accurately extract and summarize both structured (tables, lists) and unstructured (paragraphs, articles) content.
 - Clearly separate different types of data (e.g., summaries, tables, bullet points).
@@ -37,11 +35,8 @@ You must always follow these guidelines:
 - Include only the relevant columns as per the user request.
 - Sort, filter, and reorganize data accordingly.
 - Use clear and consistent headers.
-
 You must not hallucinate or infer data not present on the page. If content is missing, unclear, or restricted, say so explicitly.
-
 Always respond based on the actual content from the provided link. If the page fails to load or cannot be accessed, inform the user immediately.
-
 Your role is to act as an intelligent browser and data interpreter — able to read and reshape any web content to meet user needs."""

     def setup_client(self, api_key):
@@ -59,11 +54,11 @@ Your role is to act as an intelligent browser and data interpreter — able to r
         """Create a robust session with retry strategy and proper headers"""
         session = requests.Session()

-        # Define retry strategy
+        # Define retry strategy with fixed parameter name
         retry_strategy = Retry(
             total=3,
             status_forcelist=[429, 500, 502, 503, 504],
-            method_whitelist=["HEAD", "GET", "OPTIONS"],
+            allowed_methods=["HEAD", "GET", "OPTIONS"],  # Fixed: changed from method_whitelist
             backoff_factor=1
         )

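Note on the retry fix above: urllib3 deprecated the method_whitelist argument of Retry in 1.26 and removed it in 2.0, so newer environments raise a TypeError until the call is updated to allowed_methods. A minimal, self-contained sketch of the same session setup (the helper name build_session is illustrative, not taken from app.py):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def build_session():
    """Build a requests.Session that retries transient failures (sketch)."""
    session = requests.Session()
    retry_strategy = Retry(
        total=3,                                     # up to 3 retries per request
        status_forcelist=[429, 500, 502, 503, 504],  # retry on these HTTP statuses
        allowed_methods=["HEAD", "GET", "OPTIONS"],  # renamed from method_whitelist
        backoff_factor=1,                            # exponential backoff between attempts
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session

With the adapter mounted, a plain session.get(url, timeout=...) transparently retries idempotent requests on the listed status codes.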
@@ -97,6 +92,7 @@ Your role is to act as an intelligent browser and data interpreter — able to r

         # Multiple timeout attempts with increasing duration
         timeout_attempts = [15, 30, 45]
+        response = None

         for timeout in timeout_attempts:
             try:
@@ -133,9 +129,16 @@ Your role is to act as an intelligent browser and data interpreter — able to r
                     break
                 except:
                     continue
+            except requests.exceptions.RequestException as e:
+                if timeout == timeout_attempts[-1]:  # Last attempt
+                    return {
+                        'success': False,
+                        'error': f"Request failed: {str(e)}"
+                    }
+                continue

         # Check if we got a response
-        if 'response' not in locals():
+        if response is None:
             return {
                 'success': False,
                 'error': "Failed to establish connection after multiple attempts"
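The hunk above also replaces the fragile "'response' not in locals()" check: response is now initialized to None before the escalating-timeout loop, and the last network error is surfaced instead of being silently swallowed. A simplified sketch of that pattern (assumed structure, not copied from app.py):

import requests

def fetch_with_escalating_timeouts(session, url):
    """Try the request with progressively longer timeouts (sketch)."""
    timeout_attempts = [15, 30, 45]   # seconds per attempt
    response = None                   # defined up front so the final check is safe
    for i, timeout in enumerate(timeout_attempts):
        try:
            response = session.get(url, timeout=timeout)
            response.raise_for_status()
            break                                      # success: stop retrying
        except requests.exceptions.RequestException as e:
            if i == len(timeout_attempts) - 1:         # last attempt failed
                return {'success': False, 'error': f"Request failed: {str(e)}"}
            continue                                   # retry with a longer timeout

    if response is None:
        return {'success': False,
                'error': "Failed to establish connection after multiple attempts"}
    return {'success': True, 'response': response}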
@@ -271,15 +274,12 @@ Your role is to act as an intelligent browser and data interpreter — able to r
         content_text = f"""
 WEBPAGE ANALYSIS REQUEST
 ========================
-
 URL: {scraped_data['url']}
 Title: {scraped_data['title']}
 Content Length: {scraped_data['content_length']} characters
 Tables Found: {len(scraped_data['tables'])}
-
 META DESCRIPTION:
 {scraped_data['meta_description']}
-
 MAIN CONTENT:
 {scraped_data['text']}
 """
@@ -426,7 +426,7 @@ def create_interface():
     - E-commerce product pages
     - Financial data sites (Yahoo Finance, MarketWatch)
     - Research papers and academic sites
-
+
     ## 🧪 **Test Scenarios**

     ### **1. News & Media Sites**
@@ -577,7 +577,6 @@ def create_interface():
     - HttpBin (perfect for testing basic functionality)

     Start with the simpler tests and gradually move to more complex scenarios to fully evaluate your tool's capabilities!
-
     """)

     # Event handlers
 