Update web_scraper.py
web_scraper.py +261 -20
@@ -1,5 +1,6 @@
 from flask import Flask, jsonify, request
 import requests
 from bs4 import BeautifulSoup
 import os
 import re
@@ -10,15 +11,259 @@ import base64
 from io import BytesIO
 from googlesearch import search
 import json
-import

 app = Flask(__name__)

-
-
 def search_images(query, num_images=5):
-    logger.info(f"Searching for images with query: {query}")
     # Headers to mimic a browser request
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
@@ -72,7 +317,7 @@ def search_images(query, num_images=5):
                 })

             except Exception as e:
-
                 continue

             # Add a small delay between checks
@@ -81,11 +326,10 @@ def search_images(query, num_images=5):
         return results

     except Exception as e:
-
         return []

 def get_cover_image(query):
-    logger.info(f"Getting cover image for query: {query}")
     """Get a high-quality cover image URL for a given query"""
     try:
         # Search for images
@@ -98,7 +342,7 @@ def get_cover_image(query):
         return images[0]['url']

     except Exception as e:
-
         return None

 @app.route('/search_images', methods=['GET'])
@@ -130,7 +374,6 @@ def api_search_images():
     }), 500

 def scrape_site_content(query, num_sites=5):
-    logger.info(f"Scraping content for query: {query} from {num_sites} sites")
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
@@ -158,7 +401,7 @@ def scrape_site_content(query, num_sites=5):
             for attempt in range(retries):
                 try:
                     # Get the HTML content
-
                     response = requests.get(
                         url,
                         headers=headers,
@@ -170,7 +413,7 @@ def scrape_site_content(query, num_sites=5):
                     # Verify it's HTML content
                     content_type = response.headers.get('Content-Type', '').lower()
                     if 'text/html' not in content_type:
-
                         break

                     # Parse the HTML content
@@ -185,7 +428,7 @@ def scrape_site_content(query, num_sites=5):

                     # Skip if not enough content
                     if len(text_content.split()) < 100:  # Skip if less than 100 words
-
                         break

                     # Extract all links (limit to first 10)
@@ -227,13 +470,13 @@ def scrape_site_content(query, num_sites=5):
                     break  # Break retry loop on success

                 except requests.Timeout:
-
                     if attempt == retries - 1:  # Last attempt
-
                 except requests.RequestException as e:
-
                     if attempt == retries - 1:  # Last attempt
-

                 # Add a longer delay between retries
                 if not success and attempt < retries - 1:
@@ -246,7 +489,7 @@ def scrape_site_content(query, num_sites=5):
         return results

     except Exception as e:
-
         # Return whatever results we've managed to gather
         return results

@@ -279,7 +522,6 @@ def api_scrape_sites():
     }), 500

 def analyze_with_gpt(scraped_content, research_query, openrouter_key):
-    logger.info(f"Analyzing content with GPT for query: {research_query}")
     """Analyze scraped content using OpenRouter's Gemini model"""
     try:
         headers = {
@@ -325,11 +567,10 @@ Format your response in markdown with proper headings and citations."""

         return response.json()['choices'][0]['message']['content']
     except Exception as e:
-
         return f"Error analyzing content: {str(e)}"

 def research_topic(query, num_sites=5, openrouter_key=None):
-    logger.info(f"Starting research for topic: {query}")
     """Research a topic using web scraping and GPT analysis"""
     try:
         # First get web content using existing scrape_site_content function
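A reading note on the blanked-out removals above: every deletion whose text survives is a logger call (logger.info(...)), and each maps to a print(...) added in the corresponding hunk of the additions pass below, so this commit swaps module logging for bare prints. The removed import at old line 13 and the two removed module-level lines at old 17-18 were most likely the matching logging setup. A reconstruction, offered as an assumption rather than something recovered from the page:

# Hypothetical reconstruction of the removed lines (old lines 13 and 17-18);
# inferred from the deleted logger.info(...) calls, not from the diff itself.
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

The additions pass follows, repeating the same hunk headers from the new file's side.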
@@ -1,5 +1,6 @@
 from flask import Flask, jsonify, request
 import requests
+import aiohttp
 from bs4 import BeautifulSoup
 import os
 import re
@@ -10,15 +11,259 @@ import base64
 from io import BytesIO
 from googlesearch import search
 import json
+import asyncio
+from typing import Dict, List

 app = Flask(__name__)

+async def search_images_async(query: str, num_images: int = 5) -> List[Dict]:
+    """Search for images asynchronously"""
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+        'Accept-Language': 'en-US,en;q=0.5',
+        'Accept-Encoding': 'gzip, deflate',
+        'DNT': '1',
+        'Connection': 'keep-alive',
+    }
+
+    formatted_query = urllib.parse.quote(query + " high quality")
+    url = f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active"
+
+    try:
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url, headers=headers, timeout=30) as response:
+                if response.status != 200:
+                    raise Exception(f"Failed to fetch images: {response.status}")
+
+                content = await response.text()
+                image_urls = re.findall(r'https?://[^"\']*?(?:jpg|jpeg|png|gif)', content)
+                image_urls = list(dict.fromkeys(image_urls))
+
+                results = []
+                for img_url in image_urls:
+                    if len(results) >= num_images:
+                        break
+
+                    if ('gstatic.com' in img_url or
+                            'google.com' in img_url or
+                            'icon' in img_url.lower() or
+                            'thumb' in img_url.lower() or
+                            'small' in img_url.lower()):
+                        continue
+
+                    try:
+                        async with session.head(img_url, headers=headers, timeout=5) as img_response:
+                            if img_response.status == 200:
+                                content_type = img_response.headers.get('Content-Type', '')
+                                if content_type.startswith('image/'):
+                                    results.append({
+                                        'url': img_url,
+                                        'content_type': content_type
+                                    })
+
+                    except Exception as e:
+                        print(f"Error checking image URL: {str(e)}")
+                        continue
+
+                    await asyncio.sleep(random.uniform(0.2, 0.5))
+
+                return results
+
+    except Exception as e:
+        print(f"An error occurred in search_images_async: {str(e)}")
+        return []
+
+async def get_cover_image_async(query: str) -> str:
+    """Get a high-quality cover image URL for a given query asynchronously"""
+    try:
+        images = await search_images_async(query, num_images=3)
+        if not images:
+            return None
+        return images[0]['url']
+    except Exception as e:
+        print(f"Error in get_cover_image_async: {str(e)}")
+        return None
+
+async def scrape_site_content_async(query: str, num_sites: int = 5, session: aiohttp.ClientSession = None) -> List[Dict]:
+    """Scrape website content asynchronously"""
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+        'Accept-Language': 'en-US,en;q=0.5',
+        'Accept-Encoding': 'gzip, deflate',
+        'DNT': '1',
+        'Connection': 'keep-alive',
+    }
+
+    results = []
+    scraped = 0
+    retries = 2
+    timeout = aiohttp.ClientTimeout(total=5)
+
+    try:
+        # Get search results synchronously (googlesearch-python doesn't support async)
+        search_results = list(search(query, num=num_sites * 2))
+
+        should_close_session = False
+        if session is None:
+            session = aiohttp.ClientSession()
+            should_close_session = True
+
+        try:
+            for url in search_results:
+                if scraped >= num_sites:
+                    break
+
+                success = False
+                for attempt in range(retries):
+                    try:
+                        async with session.get(url, headers=headers, timeout=timeout, ssl=False) as response:
+                            if response.status != 200:
+                                continue
+
+                            content_type = response.headers.get('Content-Type', '').lower()
+                            if 'text/html' not in content_type:
+                                break
+
+                            text = await response.text()
+                            soup = BeautifulSoup(text, 'html.parser')
+
+                            for script in soup(["script", "style"]):
+                                script.decompose()
+
+                            text_content = soup.get_text(separator='\n', strip=True)[:10000]
+
+                            if len(text_content.split()) < 100:
+                                break
+
+                            links = []
+                            for link in soup.find_all('a', href=True)[:10]:
+                                href = link['href']
+                                if href.startswith('http'):
+                                    links.append({
+                                        'text': link.get_text(strip=True),
+                                        'url': href
+                                    })
+
+                            title = soup.title.string if soup.title else ''
+                            meta_description = ''
+                            meta_keywords = ''
+
+                            meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
+                            if meta_desc_tag:
+                                meta_description = meta_desc_tag.get('content', '')
+
+                            meta_keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
+                            if meta_keywords_tag:
+                                meta_keywords = meta_keywords_tag.get('content', '')
+
+                            results.append({
+                                'url': url,
+                                'title': title,
+                                'meta_description': meta_description,
+                                'meta_keywords': meta_keywords,
+                                'text_content': text_content,
+                                'links': links
+                            })
+
+                            scraped += 1
+                            success = True
+                            await asyncio.sleep(random.uniform(0.5, 1))
+                            break
+
+                    except Exception as e:
+                        print(f"Error scraping {url} (attempt {attempt + 1}/{retries}): {str(e)}")
+                        if attempt == retries - 1:
+                            print(f"Skipping {url} after {retries} failed attempts")
+
+                    if not success and attempt < retries - 1:
+                        await asyncio.sleep(random.uniform(1, 2))
+
+        finally:
+            if should_close_session:
+                await session.close()
+
+        return results
+
+    except Exception as e:
+        print(f"Error in scrape_site_content_async: {str(e)}")
+        return results
+
+async def research_topic_async(query: str, num_sites: int = 5, openrouter_key: str = None, session: aiohttp.ClientSession = None) -> Dict:
+    """Research a topic using web scraping and GPT analysis asynchronously"""
+    try:
+        # First get web content using async scrape_site_content function
+        scraped_results = await scrape_site_content_async(query, num_sites, session)
+
+        # Format scraped content for analysis
+        formatted_content = []
+        for result in scraped_results:
+            formatted_content.append({
+                'source': result['url'],
+                'title': result['title'],
+                'content': result['text_content'][:2000],
+                'meta_info': {
+                    'description': result['meta_description'],
+                    'keywords': result['meta_keywords']
+                }
+            })
+
+        # Get AI analysis of the scraped content
+        if openrouter_key:
+            async with aiohttp.ClientSession() as analysis_session:
+                async with analysis_session.post(
+                    'https://openrouter.ai/api/v1/chat/completions',
+                    headers={
+                        'Authorization': f'Bearer {openrouter_key}',
+                        'HTTP-Referer': 'http://localhost:5001',
+                        'X-Title': 'Research Assistant'
+                    },
+                    json={
+                        'model': 'google/gemini-2.0-flash-thinking-exp:free',
+                        'messages': [{
+                            'role': 'user',
+                            'content': f"""You are a research assistant analyzing web content to provide comprehensive research.
+
+Research Query: {query}
+
+Below is content scraped from various web sources. Analyze this content and provide a detailed, well-structured research response.
+Make sure to cite sources when making specific claims.
+
+Scraped Content:
+{json.dumps(formatted_content, indent=2)}
+
+Please provide:
+1. A comprehensive analysis of the topic
+2. Key findings and insights
+3. Supporting evidence from the sources
+4. Any additional considerations or caveats
+
+Format your response in markdown with proper headings and citations."""
+                        }]
+                    }
+                ) as response:
+                    if response.status != 200:
+                        raise Exception(f"OpenRouter API error: {await response.text()}")
+
+                    response_data = await response.json()
+                    analysis = response_data['choices'][0]['message']['content']
+        else:
+            analysis = "No OpenRouter API key provided for analysis"
+
+        return {
+            'success': True,
+            'query': query,
+            'analysis': analysis,
+            'sources': formatted_content
+        }
+    except Exception as e:
+        return {
+            'success': False,
+            'error': str(e)
+        }

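The diff adds these coroutines but no caller. Here is a minimal usage sketch, not part of the commit, assuming web_scraper.py is importable as a module and that a real OpenRouter key replaces the placeholder. Passing one caller-owned aiohttp.ClientSession keeps connections pooled; scrape_site_content_async only closes sessions it created itself, so the session can be reused across queries:

import asyncio
import aiohttp

from web_scraper import research_topic_async  # module name assumed from the filename

async def main():
    # One shared session for all scraping done by research_topic_async
    async with aiohttp.ClientSession() as session:
        result = await research_topic_async(
            "solid-state battery research",  # example query
            num_sites=3,
            openrouter_key="sk-or-...",  # placeholder, supply a real key
            session=session,
        )
    if result['success']:
        print(result['analysis'])
    else:
        print(f"Research failed: {result['error']}")

asyncio.run(main())

The remaining hunks below apply the same print-for-logger substitution to the original synchronous functions.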
 def search_images(query, num_images=5):
     # Headers to mimic a browser request
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
@@ -72,7 +317,7 @@ def search_images(query, num_images=5):
                 })

             except Exception as e:
+                print(f"Error checking image URL: {str(e)}")
                 continue

             # Add a small delay between checks
@@ -81,11 +326,10 @@ def search_images(query, num_images=5):
         return results

     except Exception as e:
+        print(f"An error occurred: {str(e)}")
         return []

 def get_cover_image(query):
     """Get a high-quality cover image URL for a given query"""
     try:
         # Search for images
@@ -98,7 +342,7 @@ def get_cover_image(query):
         return images[0]['url']

     except Exception as e:
+        print(f"Error getting cover image: {str(e)}")
         return None

 @app.route('/search_images', methods=['GET'])
@@ -130,7 +374,6 @@ def api_search_images():
     }), 500

 def scrape_site_content(query, num_sites=5):
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
@@ -158,7 +401,7 @@ def scrape_site_content(query, num_sites=5):
             for attempt in range(retries):
                 try:
                     # Get the HTML content
+                    print(f"Trying {url} (attempt {attempt + 1}/{retries})")
                     response = requests.get(
                         url,
                         headers=headers,
@@ -170,7 +413,7 @@ def scrape_site_content(query, num_sites=5):
                     # Verify it's HTML content
                     content_type = response.headers.get('Content-Type', '').lower()
                     if 'text/html' not in content_type:
+                        print(f"Skipping {url} - not HTML content")
                         break

                     # Parse the HTML content
@@ -185,7 +428,7 @@ def scrape_site_content(query, num_sites=5):

                     # Skip if not enough content
                     if len(text_content.split()) < 100:  # Skip if less than 100 words
+                        print(f"Skipping {url} - not enough content")
                         break

                     # Extract all links (limit to first 10)
@@ -227,13 +470,13 @@ def scrape_site_content(query, num_sites=5):
                     break  # Break retry loop on success

                 except requests.Timeout:
+                    print(f"Timeout on {url} (attempt {attempt + 1}/{retries})")
                     if attempt == retries - 1:  # Last attempt
+                        print(f"Skipping {url} after {retries} timeout attempts")
                 except requests.RequestException as e:
+                    print(f"Error scraping {url} (attempt {attempt + 1}/{retries}): {str(e)}")
                     if attempt == retries - 1:  # Last attempt
+                        print(f"Skipping {url} after {retries} failed attempts")

                 # Add a longer delay between retries
                 if not success and attempt < retries - 1:
@@ -246,7 +489,7 @@ def scrape_site_content(query, num_sites=5):
         return results

     except Exception as e:
+        print(f"Error in search/scraping process: {str(e)}")
         # Return whatever results we've managed to gather
         return results

@@ -279,7 +522,6 @@ def api_scrape_sites():
     }), 500

 def analyze_with_gpt(scraped_content, research_query, openrouter_key):
     """Analyze scraped content using OpenRouter's Gemini model"""
     try:
         headers = {
@@ -325,11 +567,10 @@ Format your response in markdown with proper headings and citations."""

         return response.json()['choices'][0]['message']['content']
     except Exception as e:
+        print(f"Error in analyze_with_gpt: {str(e)}")
         return f"Error analyzing content: {str(e)}"

 def research_topic(query, num_sites=5, openrouter_key=None):
     """Research a topic using web scraping and GPT analysis"""
     try:
         # First get web content using existing scrape_site_content function
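One caveat worth noting about the new code path: even in scrape_site_content_async, the Google lookup itself stays synchronous. The inline comment concedes that googlesearch-python has no async support, so the event loop blocks while list(search(...)) runs. A possible workaround, sketched under the assumption of Python 3.9+ for asyncio.to_thread:

# Inside scrape_site_content_async, the blocking lookup could be pushed to a
# worker thread so other coroutines keep running while Google is queried:
search_results = await asyncio.to_thread(
    lambda: list(search(query, num=num_sites * 2))
)

With that change, several research_topic_async calls gathered via asyncio.gather would overlap their search phases as well as their scraping.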
|