Pamudu13 committed
Commit efa1440 · verified · 1 Parent(s): fb6ea6a

Update web_scraper.py

Files changed (1):
    web_scraper.py +14 -10
web_scraper.py CHANGED
@@ -11,8 +11,12 @@ from io import BytesIO
 from googlesearch import search
 import json
 import logging
+import queue
 
-app = Flask(__name__)
+# Create a logging filter to suppress socket warnings
+class SocketWarningFilter(logging.Filter):
+    def filter(self, record):
+        return not (record.levelname == 'WARNING' and 'socket.send()' in record.getMessage())
 
 # Create a queue for log messages
 log_queue = queue.Queue()
@@ -23,7 +27,7 @@ class QueueHandler(logging.Handler):
         log_entry = self.format(record)
         log_queue.put(log_entry)
 
-
+# Set up logging with the custom handler
 logger = logging.getLogger()
 queue_handler = QueueHandler()
 queue_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
@@ -31,6 +35,11 @@ queue_handler.addFilter(SocketWarningFilter())  # Add the filter to the handler
 logger.addHandler(queue_handler)
 logger.setLevel(logging.INFO)
 
+# Also add the filter to the root logger to catch all socket warnings
+logging.getLogger().addFilter(SocketWarningFilter())
+
+app = Flask(__name__)
+
 def search_images(query, num_images=5):
     # Headers to mimic a browser request
     headers = {
@@ -169,7 +178,7 @@ def scrape_site_content(query, num_sites=5):
         for attempt in range(retries):
             try:
                 # Get the HTML content
-                print(f"Trying {url} (attempt {attempt + 1}/{retries})")
+                logger.info(f"Trying {url} (attempt {attempt + 1}/{retries})")
                 logger.info(f"Scraping URL: {url}")
                 response = requests.get(
                     url,
@@ -182,7 +191,7 @@ def scrape_site_content(query, num_sites=5):
                 # Verify it's HTML content
                 content_type = response.headers.get('Content-Type', '').lower()
                 if 'text/html' not in content_type:
-                    print(f"Skipping {url} - not HTML content")
+                    logger.info(f"Skipping {url} - not HTML content")
                     break
 
                 # Parse the HTML content
@@ -197,7 +206,7 @@ def scrape_site_content(query, num_sites=5):
 
                 # Skip if not enough content
                 if len(text_content.split()) < 100:  # Skip if less than 100 words
-                    print(f"Skipping {url} - not enough content")
+                    logger.info(f"Skipping {url} - not enough content")
                     break
 
                 # Extract all links (limit to first 10)
@@ -301,21 +310,16 @@ def analyze_with_gpt(scraped_content, research_query, openrouter_key):
 
     # Prepare the prompt
     prompt = f"""You are a research assistant analyzing web content to provide comprehensive research.
-
 Research Query: {research_query}
-
 Below is content scraped from various web sources. Analyze this content and provide a detailed, well-structured research response.
 Make sure to cite sources when making specific claims.
-
 Scraped Content:
 {json.dumps(scraped_content, indent=2)}
-
 Please provide:
 1. A comprehensive analysis of the topic
 2. Key findings and insights
 3. Supporting evidence from the sources
 4. Any additional considerations or caveats
-
 Format your response in markdown with proper headings and citations."""
 
     response = requests.post(
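
For context, the logging pieces this commit touches fit together as follows. This is a minimal standalone sketch assembled from the filter and handler shown in the diff above; the sample log calls and the drain loop at the end are illustrative assumptions, not code from web_scraper.py.

import logging
import queue

# Queue that collects formatted log lines (in the app it backs log streaming)
log_queue = queue.Queue()

# Drop noisy socket.send() warnings before they reach the queue
class SocketWarningFilter(logging.Filter):
    def filter(self, record):
        # Returning False suppresses the record
        return not (record.levelname == 'WARNING' and 'socket.send()' in record.getMessage())

# Push every formatted record onto the queue instead of writing to a stream
class QueueHandler(logging.Handler):
    def emit(self, record):
        log_entry = self.format(record)
        log_queue.put(log_entry)

logger = logging.getLogger()
queue_handler = QueueHandler()
queue_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
queue_handler.addFilter(SocketWarningFilter())
logger.addHandler(queue_handler)
logger.setLevel(logging.INFO)

# Illustrative usage: the warning is filtered out, the info line is queued
logger.info("Scraping URL: https://example.com")
logger.warning("socket.send() raised exception")

while not log_queue.empty():
    print(log_queue.get())

Note the ordering the commit enforces: app = Flask(__name__) is moved to after the logging setup, presumably so the Flask app is only created once the filtered queue handler is already installed on the root logger.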