Update web_scraper.py

web_scraper.py (+14 -10)
```diff
@@ -11,8 +11,12 @@ from io import BytesIO
 from googlesearch import search
 import json
 import logging
+import queue
 
-
+# Create a logging filter to suppress socket warnings
+class SocketWarningFilter(logging.Filter):
+    def filter(self, record):
+        return not (record.levelname == 'WARNING' and 'socket.send()' in record.getMessage())
 
 # Create a queue for log messages
 log_queue = queue.Queue()
```
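The new `SocketWarningFilter` drops any `WARNING` record whose message contains `socket.send()`. A minimal standalone sketch of the mechanism (the demo logger and messages are illustrative, not from the commit):

```python
import logging

class SocketWarningFilter(logging.Filter):
    def filter(self, record):
        # Returning False drops the record; True lets it through
        return not (record.levelname == 'WARNING' and 'socket.send()' in record.getMessage())

logger = logging.getLogger("demo")
handler = logging.StreamHandler()
handler.addFilter(SocketWarningFilter())
logger.addHandler(handler)

logger.warning("socket.send() raised exception")  # suppressed
logger.warning("disk space low")                  # emitted
```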
```diff
@@ -23,7 +27,7 @@ class QueueHandler(logging.Handler):
         log_entry = self.format(record)
         log_queue.put(log_entry)
 
-
+# Set up logging with the custom handler
 logger = logging.getLogger()
 queue_handler = QueueHandler()
 queue_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
```
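Only the body of `QueueHandler.emit` is visible in the diff. Reassembled from the visible lines, the handler presumably looks like the sketch below; `emit(self, record)` is the standard `logging.Handler` hook, so only that signature line is inferred:

```python
import logging
import queue

log_queue = queue.Queue()

class QueueHandler(logging.Handler):
    def emit(self, record):
        # Format the record and push it onto the shared queue
        log_entry = self.format(record)
        log_queue.put(log_entry)

logger = logging.getLogger()
queue_handler = QueueHandler()
queue_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(queue_handler)
logger.setLevel(logging.INFO)

logger.info("hello")
print(log_queue.get_nowait())  # e.g. '2024-01-01 12:00:00,000 - INFO - hello'
```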
```diff
@@ -31,6 +35,11 @@ queue_handler.addFilter(SocketWarningFilter()) # Add the filter to the handler
 logger.addHandler(queue_handler)
 logger.setLevel(logging.INFO)
 
+# Also add the filter to the root logger to catch all socket warnings
+logging.getLogger().addFilter(SocketWarningFilter())
+
+app = Flask(__name__)
+
 def search_images(query, num_images=5):
     # Headers to mimic a browser request
     headers = {
```
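This hunk attaches the filter to the root logger as well and creates the Flask app only after logging is configured. The commit shows no routes, but a queue of formatted log lines is typically drained by an endpoint so the browser can poll scraping progress; a purely hypothetical sketch of that pattern:

```python
from flask import Flask, jsonify
import queue

app = Flask(__name__)
log_queue = queue.Queue()

# Hypothetical polling endpoint; not part of this commit
@app.route('/logs')
def logs():
    # Drain whatever has accumulated since the last poll
    entries = []
    while True:
        try:
            entries.append(log_queue.get_nowait())
        except queue.Empty:
            break
    return jsonify(entries)
```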
```diff
@@ -169,7 +178,7 @@ def scrape_site_content(query, num_sites=5):
     for attempt in range(retries):
         try:
             # Get the HTML content
-
+            logger.info(f"Trying {url} (attempt {attempt + 1}/{retries})")
             logger.info(f"Scraping URL: {url}")
             response = requests.get(
                 url,
```
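The added line logs the attempt counter before each fetch. Only fragments of the retry loop are visible in the diff; the surrounding pattern is presumably something like this sketch (the function name, timeout, and exception handling are assumptions):

```python
import logging
import requests

logger = logging.getLogger(__name__)

def fetch_with_retries(url, retries=3):
    # Assumed shape of the loop around the two logger.info calls shown above
    for attempt in range(retries):
        try:
            logger.info(f"Trying {url} (attempt {attempt + 1}/{retries})")
            logger.info(f"Scraping URL: {url}")
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return response
        except requests.RequestException as exc:
            logger.warning(f"Attempt {attempt + 1} failed for {url}: {exc}")
    return None
```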
```diff
@@ -182,7 +191,7 @@ def scrape_site_content(query, num_sites=5):
             # Verify it's HTML content
             content_type = response.headers.get('Content-Type', '').lower()
             if 'text/html' not in content_type:
-
+                logger.info(f"Skipping {url} - not HTML content")
                 break
 
             # Parse the HTML content
```
```diff
@@ -197,7 +206,7 @@ def scrape_site_content(query, num_sites=5):
 
             # Skip if not enough content
             if len(text_content.split()) < 100: # Skip if less than 100 words
-
+                logger.info(f"Skipping {url} - not enough content")
                 break
 
             # Extract all links (limit to first 10)
```
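These two hunks make the skip paths visible in the log instead of breaking out silently. Pulled together, the guards presumably sit in the fetch loop roughly as below; the `BeautifulSoup` extraction is an assumption, since the diff shows only the `# Parse the HTML content` comment:

```python
import logging
from bs4 import BeautifulSoup  # assumed parser; not shown in the diff

logger = logging.getLogger(__name__)

def extract_text(url, response):
    # Guard 1: only parse HTML responses
    content_type = response.headers.get('Content-Type', '').lower()
    if 'text/html' not in content_type:
        logger.info(f"Skipping {url} - not HTML content")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    text_content = soup.get_text(separator=' ', strip=True)

    # Guard 2: skip pages with fewer than 100 words of text
    if len(text_content.split()) < 100:
        logger.info(f"Skipping {url} - not enough content")
        return None

    return text_content
```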
```diff
@@ -301,21 +310,16 @@ def analyze_with_gpt(scraped_content, research_query, openrouter_key):
 
     # Prepare the prompt
     prompt = f"""You are a research assistant analyzing web content to provide comprehensive research.
-
 Research Query: {research_query}
-
 Below is content scraped from various web sources. Analyze this content and provide a detailed, well-structured research response.
 Make sure to cite sources when making specific claims.
-
 Scraped Content:
 {json.dumps(scraped_content, indent=2)}
-
 Please provide:
 1. A comprehensive analysis of the topic
 2. Key findings and insights
 3. Supporting evidence from the sources
 4. Any additional considerations or caveats
-
 Format your response in markdown with proper headings and citations."""
 
     response = requests.post(
```
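This hunk only tightens the prompt by removing blank separator lines; the `requests.post(` call itself is cut off at the hunk boundary. Given the `openrouter_key` parameter, it presumably targets OpenRouter's OpenAI-compatible chat completions endpoint, roughly as in this sketch (the helper name and model are placeholders, not taken from the file):

```python
import requests

def call_openrouter(prompt, openrouter_key):
    # Sketch of the truncated call; endpoint shape follows OpenRouter's
    # OpenAI-compatible API. The model name is a placeholder.
    response = requests.post(
        "https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {openrouter_key}",
            "Content-Type": "application/json",
        },
        json={
            "model": "openai/gpt-4o",
            "messages": [{"role": "user", "content": prompt}],
        },
        timeout=60,
    )
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]
```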