Update web_scraper.py

web_scraper.py  +0 −250  (changed)
@@ -1,6 +1,5 @@
 from flask import Flask, jsonify, request
 import requests
-import aiohttp
 from bs4 import BeautifulSoup
 import os
 import re
@@ -11,258 +10,9 @@ import base64
 from io import BytesIO
 from googlesearch import search
 import json
-import asyncio
-from typing import Dict, List
 
 app = Flask(__name__)
 
-async def search_images_async(query: str, num_images: int = 5) -> List[Dict]:
-    """Search for images asynchronously"""
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.5',
-        'Accept-Encoding': 'gzip, deflate',
-        'DNT': '1',
-        'Connection': 'keep-alive',
-    }
-
-    formatted_query = urllib.parse.quote(query + " high quality")
-    url = f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active"
-
-    try:
-        async with aiohttp.ClientSession() as session:
-            async with session.get(url, headers=headers, timeout=30) as response:
-                if response.status != 200:
-                    raise Exception(f"Failed to fetch images: {response.status}")
-
-                content = await response.text()
-                image_urls = re.findall(r'https?://[^"\']*?(?:jpg|jpeg|png|gif)', content)
-                image_urls = list(dict.fromkeys(image_urls))
-
-                results = []
-                for img_url in image_urls:
-                    if len(results) >= num_images:
-                        break
-
-                    if ('gstatic.com' in img_url or
-                            'google.com' in img_url or
-                            'icon' in img_url.lower() or
-                            'thumb' in img_url.lower() or
-                            'small' in img_url.lower()):
-                        continue
-
-                    try:
-                        async with session.head(img_url, headers=headers, timeout=5) as img_response:
-                            if img_response.status == 200:
-                                content_type = img_response.headers.get('Content-Type', '')
-                                if content_type.startswith('image/'):
-                                    results.append({
-                                        'url': img_url,
-                                        'content_type': content_type
-                                    })
-
-                    except Exception as e:
-                        print(f"Error checking image URL: {str(e)}")
-                        continue
-
-                    await asyncio.sleep(random.uniform(0.2, 0.5))
-
-                return results
-
-    except Exception as e:
-        print(f"An error occurred in search_images_async: {str(e)}")
-        return []
-
-async def get_cover_image_async(query: str) -> str:
-    """Get a high-quality cover image URL for a given query asynchronously"""
-    try:
-        images = await search_images_async(query, num_images=3)
-        if not images:
-            return None
-        return images[0]['url']
-    except Exception as e:
-        print(f"Error in get_cover_image_async: {str(e)}")
-        return None
-
-async def scrape_site_content_async(query: str, num_sites: int = 5, session: aiohttp.ClientSession = None) -> List[Dict]:
-    """Scrape website content asynchronously"""
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.5',
-        'Accept-Encoding': 'gzip, deflate',
-        'DNT': '1',
-        'Connection': 'keep-alive',
-    }
-
-    results = []
-    scraped = 0
-    retries = 2
-    timeout = aiohttp.ClientTimeout(total=5)
-
-    try:
-        # Get search results synchronously (googlesearch-python doesn't support async)
-        search_results = list(search(query, num=num_sites * 2))
-
-        should_close_session = False
-        if session is None:
-            session = aiohttp.ClientSession()
-            should_close_session = True
-
-        try:
-            for url in search_results:
-                if scraped >= num_sites:
-                    break
-
-                success = False
-                for attempt in range(retries):
-                    try:
-                        async with session.get(url, headers=headers, timeout=timeout, ssl=False) as response:
-                            if response.status != 200:
-                                continue
-
-                            content_type = response.headers.get('Content-Type', '').lower()
-                            if 'text/html' not in content_type:
-                                break
-
-                            text = await response.text()
-                            soup = BeautifulSoup(text, 'html.parser')
-
-                            for script in soup(["script", "style"]):
-                                script.decompose()
-
-                            text_content = soup.get_text(separator='\n', strip=True)[:10000]
-
-                            if len(text_content.split()) < 100:
-                                break
-
-                            links = []
-                            for link in soup.find_all('a', href=True)[:10]:
-                                href = link['href']
-                                if href.startswith('http'):
-                                    links.append({
-                                        'text': link.get_text(strip=True),
-                                        'url': href
-                                    })
-
-                            title = soup.title.string if soup.title else ''
-                            meta_description = ''
-                            meta_keywords = ''
-
-                            meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
-                            if meta_desc_tag:
-                                meta_description = meta_desc_tag.get('content', '')
-
-                            meta_keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
-                            if meta_keywords_tag:
-                                meta_keywords = meta_keywords_tag.get('content', '')
-
-                            results.append({
-                                'url': url,
-                                'title': title,
-                                'meta_description': meta_description,
-                                'meta_keywords': meta_keywords,
-                                'text_content': text_content,
-                                'links': links
-                            })
-
-                            scraped += 1
-                            success = True
-                            await asyncio.sleep(random.uniform(0.5, 1))
-                            break
-
-                    except Exception as e:
-                        print(f"Error scraping {url} (attempt {attempt + 1}/{retries}): {str(e)}")
-                        if attempt == retries - 1:
-                            print(f"Skipping {url} after {retries} failed attempts")
-
-                if not success and attempt < retries - 1:
-                    await asyncio.sleep(random.uniform(1, 2))
-
-        finally:
-            if should_close_session:
-                await session.close()
-
-        return results
-
-    except Exception as e:
-        print(f"Error in scrape_site_content_async: {str(e)}")
-        return results
-
-async def research_topic_async(query: str, num_sites: int = 5, openrouter_key: str = None, session: aiohttp.ClientSession = None) -> Dict:
-    """Research a topic using web scraping and GPT analysis asynchronously"""
-    try:
-        # First get web content using async scrape_site_content function
-        scraped_results = await scrape_site_content_async(query, num_sites, session)
-
-        # Format scraped content for analysis
-        formatted_content = []
-        for result in scraped_results:
-            formatted_content.append({
-                'source': result['url'],
-                'title': result['title'],
-                'content': result['text_content'][:2000],
-                'meta_info': {
-                    'description': result['meta_description'],
-                    'keywords': result['meta_keywords']
-                }
-            })
-
-        # Get AI analysis of the scraped content
-        if openrouter_key:
-            async with aiohttp.ClientSession() as analysis_session:
-                async with analysis_session.post(
-                    'https://openrouter.ai/api/v1/chat/completions',
-                    headers={
-                        'Authorization': f'Bearer {openrouter_key}',
-                        'HTTP-Referer': 'http://localhost:5001',
-                        'X-Title': 'Research Assistant'
-                    },
-                    json={
-                        'model': 'google/gemini-2.0-flash-thinking-exp:free',
-                        'messages': [{
-                            'role': 'user',
-                            'content': f"""You are a research assistant analyzing web content to provide comprehensive research.
-
-Research Query: {query}
-
-Below is content scraped from various web sources. Analyze this content and provide a detailed, well-structured research response.
-Make sure to cite sources when making specific claims.
-
-Scraped Content:
-{json.dumps(formatted_content, indent=2)}
-
-Please provide:
-1. A comprehensive analysis of the topic
-2. Key findings and insights
-3. Supporting evidence from the sources
-4. Any additional considerations or caveats
-
-Format your response in markdown with proper headings and citations."""
-                        }]
-                    }
-                ) as response:
-                    if response.status != 200:
-                        raise Exception(f"OpenRouter API error: {await response.text()}")
-
-                    response_data = await response.json()
-                    analysis = response_data['choices'][0]['message']['content']
-        else:
-            analysis = "No OpenRouter API key provided for analysis"
-
-        return {
-            'success': True,
-            'query': query,
-            'analysis': analysis,
-            'sources': formatted_content
-        }
-    except Exception as e:
-        return {
-            'success': False,
-            'error': str(e)
-        }
-
 def search_images(query, num_images=5):
     # Headers to mimic a browser request
     headers = {
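Note: the synchronous path that survives this commit is only partially visible above (the view truncates inside search_images). For orientation, here is a minimal sketch of what a requests-based equivalent of the removed scrape_site_content_async could look like. The name scrape_site_content and its signature are assumptions inferred from the deleted comment "First get web content using async scrape_site_content function"; this is not the file's actual implementation.

# Hypothetical synchronous counterpart of the removed scrape_site_content_async.
# Mirrors its logic with requests; names and signature are assumptions.
import requests
from bs4 import BeautifulSoup
from googlesearch import search

def scrape_site_content(query, num_sites=5):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    results = []
    # Over-fetch candidates so dead or thin pages can be skipped,
    # as the async version did with num_sites * 2.
    for url in list(search(query, num=num_sites * 2)):
        if len(results) >= num_sites:
            break
        try:
            response = requests.get(url, headers=headers, timeout=5)
            content_type = response.headers.get('Content-Type', '').lower()
            if response.status_code != 200 or 'text/html' not in content_type:
                continue
            soup = BeautifulSoup(response.text, 'html.parser')
            for tag in soup(['script', 'style']):
                tag.decompose()  # drop non-visible text before extraction
            text_content = soup.get_text(separator='\n', strip=True)[:10000]
            if len(text_content.split()) < 100:
                continue  # skip thin pages, as the async version did
            results.append({
                'url': url,
                'title': soup.title.string if soup.title else '',
                'text_content': text_content,
            })
        except requests.RequestException as e:
            print(f"Error scraping {url}: {e}")
    return results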
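Similarly, the OpenRouter analysis step in the removed research_topic_async translates directly to a blocking requests.post. This sketch reuses the endpoint, headers, model, and error handling from the deleted code; the helper name analyze_content and the timeout value are assumptions, since the surviving synchronous version is not shown in this diff.

# Hypothetical synchronous version of the analysis step removed above.
# Endpoint, headers, and payload are taken from the deleted async code.
import requests

def analyze_content(openrouter_key, prompt):
    response = requests.post(
        'https://openrouter.ai/api/v1/chat/completions',
        headers={
            'Authorization': f'Bearer {openrouter_key}',
            'HTTP-Referer': 'http://localhost:5001',
            'X-Title': 'Research Assistant',
        },
        json={
            'model': 'google/gemini-2.0-flash-thinking-exp:free',
            'messages': [{'role': 'user', 'content': prompt}],
        },
        timeout=60,  # assumed; the async code used per-request timeouts
    )
    if response.status_code != 200:
        raise Exception(f"OpenRouter API error: {response.text}")
    return response.json()['choices'][0]['message']['content']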