Update web_scraper.py
web_scraper.py  +35 -26
@@ -10,10 +10,15 @@ import base64
 from io import BytesIO
 from googlesearch import search
 import json
+import logging

 app = Flask(__name__)

+# Get the logger instance
+logger = logging.getLogger(__name__)
+
 def search_images(query, num_images=5):
+    logger.info(f"Searching for images with query: {query}")
     # Headers to mimic a browser request
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
@@ -48,9 +53,9 @@ def search_images(query, num_images=5):
                     break

                 # Skip small thumbnails, icons, and low-quality images
-                if ('gstatic.com' in img_url or
-                    'google.com' in img_url or
-                    'icon' in img_url.lower() or
+                if ('gstatic.com' in img_url or
+                    'google.com' in img_url or
+                    'icon' in img_url.lower() or
                     'thumb' in img_url.lower() or
                     'small' in img_url.lower()):
                     continue
@@ -67,7 +72,7 @@ def search_images(query, num_images=5):
                 })

             except Exception as e:
-
+                logger.error(f"Error checking image URL: {str(e)}")
                 continue

             # Add a small delay between checks
@@ -76,23 +81,24 @@
         return results

     except Exception as e:
-
+        logger.error(f"An error occurred: {str(e)}")
        return []

 def get_cover_image(query):
+    logger.info(f"Getting cover image for query: {query}")
     """Get a high-quality cover image URL for a given query"""
     try:
         # Search for images
         images = search_images(query, num_images=3)  # Get top 3 images to choose from
-
+
         if not images:
             return None
-
+
         # Return the first valid image URL
         return images[0]['url']
-
+
     except Exception as e:
-
+        logger.error(f"Error getting cover image: {str(e)}")
         return None

 @app.route('/search_images', methods=['GET'])
@@ -124,6 +130,7 @@ def api_search_images():
         }), 500

 def scrape_site_content(query, num_sites=5):
+    logger.info(f"Scraping content for query: {query} from {num_sites} sites")
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
@@ -141,7 +148,7 @@ def scrape_site_content(query, num_sites=5):
     try:
         # Get more URLs than needed to account for failures
         search_results = list(search(query, num_results=num_sites * 2))
-
+
         # Process each found URL
         for url in search_results:
             if scraped >= num_sites:
@@ -151,10 +158,10 @@
             for attempt in range(retries):
                 try:
                     # Get the HTML content
-
+                    logger.info(f"Trying {url} (attempt {attempt + 1}/{retries})")
                     response = requests.get(
-                        url,
-                        headers=headers,
+                        url,
+                        headers=headers,
                         timeout=timeout,
                         verify=False  # Skip SSL verification
                     )
@@ -163,7 +170,7 @@
                     # Verify it's HTML content
                     content_type = response.headers.get('Content-Type', '').lower()
                     if 'text/html' not in content_type:
-
+                        logger.warning(f"Skipping {url} - not HTML content")
                         break

                     # Parse the HTML content
@@ -175,10 +182,10 @@

                     # Extract text content (limit to first 10000 characters)
                     text_content = soup.get_text(separator='\n', strip=True)[:10000]
-
+
                     # Skip if not enough content
                     if len(text_content.split()) < 100:  # Skip if less than 100 words
-
+                        logger.warning(f"Skipping {url} - not enough content")
                         break

                     # Extract all links (limit to first 10)
@@ -220,14 +227,14 @@
                     break  # Break retry loop on success

                 except requests.Timeout:
-
+                    logger.warning(f"Timeout on {url} (attempt {attempt + 1}/{retries})")
                     if attempt == retries - 1:  # Last attempt
-
+                        logger.error(f"Skipping {url} after {retries} timeout attempts")
                 except requests.RequestException as e:
-
+                    logger.error(f"Error scraping {url} (attempt {attempt + 1}/{retries}): {str(e)}")
                     if attempt == retries - 1:  # Last attempt
-
-
+                        logger.error(f"Skipping {url} after {retries} failed attempts")
+
                 # Add a longer delay between retries
                 if not success and attempt < retries - 1:
                     time.sleep(random.uniform(1, 2))
@@ -239,7 +246,7 @@
         return results

     except Exception as e:
-
+        logger.error(f"Error in search/scraping process: {str(e)}")
         # Return whatever results we've managed to gather
         return results

@@ -272,6 +279,7 @@ def api_scrape_sites():
         }), 500

 def analyze_with_gpt(scraped_content, research_query, openrouter_key):
+    logger.info(f"Analyzing content with GPT for query: {research_query}")
     """Analyze scraped content using OpenRouter's Gemini model"""
     try:
         headers = {
@@ -317,15 +325,16 @@ Format your response in markdown with proper headings and citations."""

         return response.json()['choices'][0]['message']['content']
     except Exception as e:
-
+        logger.error(f"Error in analyze_with_gpt: {str(e)}")
         return f"Error analyzing content: {str(e)}"

 def research_topic(query, num_sites=5, openrouter_key=None):
+    logger.info(f"Starting research for topic: {query}")
     """Research a topic using web scraping and GPT analysis"""
     try:
         # First get web content using existing scrape_site_content function
         scraped_results = scrape_site_content(query, num_sites)
-
+
         # Format scraped content for analysis
         formatted_content = []
         for result in scraped_results:
@@ -338,10 +347,10 @@ def research_topic(query, num_sites=5, openrouter_key=None):
                     'keywords': result['meta_keywords']
                 }
             })
-
+
         # Get AI analysis of the scraped content
         analysis = analyze_with_gpt(formatted_content, query, openrouter_key) if openrouter_key else "No OpenRouter API key provided for analysis"
-
+
         return {
             'success': True,
             'query': query,
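The new `logger.info`/`logger.warning`/`logger.error` calls only produce output once the `logging` module has a handler and level configured; this change only obtains a logger via `logging.getLogger(__name__)`. If the app does not already configure logging elsewhere (nothing in these hunks shows it), a minimal setup along the following lines would make the messages visible. The level and format string are assumptions, not part of this commit:

```python
import logging

# Assumed example configuration (not part of this commit): send INFO-and-above
# records from all module loggers to stderr with a timestamped format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(name)s %(levelname)s: %(message)s",
)
```

When the app runs under a WSGI server, the server's own logging setup may already cover this. Note also that inside `except` blocks, `logger.exception(...)` would record the full traceback as well, which interpolating `str(e)` alone does not.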
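One side note on log noise: `scrape_site_content` calls `requests.get(..., verify=False)`, so urllib3 emits an `InsecureRequestWarning` for every request, and those warnings will interleave with the new log output. If verification is intentionally disabled, the warning can be silenced explicitly (a sketch, not part of this commit; skipping certificate verification remains a security trade-off):

```python
import urllib3

# Silence the per-request InsecureRequestWarning triggered by verify=False
# so it does not drown out the application's own log messages.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
```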
|