Pamudu13 committed
Commit 055c17c · verified · 1 Parent(s): 9384799

Update web_scraper.py

Files changed (1)
  1. web_scraper.py +261 -20
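The commit swaps the module's logging calls for plain print statements and adds aiohttp/asyncio-based variants of the scraping helpers (search_images_async, get_cover_image_async, scrape_site_content_async, research_topic_async) alongside the existing synchronous functions. As a rough sketch of how the new coroutines could be driven from the existing synchronous Flask app — the /research_async route below is illustrative and not part of this commit:

# Illustrative route (not in the commit): drive research_topic_async from sync Flask.
# Assumes the module context of web_scraper.py, where asyncio, Flask's app/request/jsonify,
# and research_topic_async are already defined or imported.
@app.route('/research_async', methods=['GET'])
def api_research_async():
    query = request.args.get('query', '')
    openrouter_key = request.args.get('openrouter_key')
    if not query:
        return jsonify({'success': False, 'error': 'No query provided'}), 400
    # asyncio.run spins up an event loop per request; acceptable for the dev server,
    # though an ASGI deployment would avoid the per-call loop setup.
    result = asyncio.run(research_topic_async(query, num_sites=5, openrouter_key=openrouter_key))
    return jsonify(result), (200 if result.get('success') else 500)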
web_scraper.py CHANGED
@@ -1,5 +1,6 @@
1
  from flask import Flask, jsonify, request
2
  import requests
 
3
  from bs4 import BeautifulSoup
4
  import os
5
  import re
@@ -10,15 +11,259 @@ import base64
10
  from io import BytesIO
11
  from googlesearch import search
12
  import json
13
- import logging
 
14
 
15
  app = Flask(__name__)
16
 
17
- # Get the logger instance
18
- logger = logging.getLogger(__name__)

19
 
20
  def search_images(query, num_images=5):
21
- logger.info(f"Searching for images with query: {query}")
22
  # Headers to mimic a browser request
23
  headers = {
24
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
@@ -72,7 +317,7 @@ def search_images(query, num_images=5):
72
  })
73
 
74
  except Exception as e:
75
- logger.error(f"Error checking image URL: {str(e)}")
76
  continue
77
 
78
  # Add a small delay between checks
@@ -81,11 +326,10 @@ def search_images(query, num_images=5):
81
  return results
82
 
83
  except Exception as e:
84
- logger.error(f"An error occurred: {str(e)}")
85
  return []
86
 
87
  def get_cover_image(query):
88
- logger.info(f"Getting cover image for query: {query}")
89
  """Get a high-quality cover image URL for a given query"""
90
  try:
91
  # Search for images
@@ -98,7 +342,7 @@ def get_cover_image(query):
98
  return images[0]['url']
99
 
100
  except Exception as e:
101
- logger.error(f"Error getting cover image: {str(e)}")
102
  return None
103
 
104
  @app.route('/search_images', methods=['GET'])
@@ -130,7 +374,6 @@ def api_search_images():
130
  }), 500
131
 
132
  def scrape_site_content(query, num_sites=5):
133
- logger.info(f"Scraping content for query: {query} from {num_sites} sites")
134
  headers = {
135
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
136
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
@@ -158,7 +401,7 @@ def scrape_site_content(query, num_sites=5):
158
  for attempt in range(retries):
159
  try:
160
  # Get the HTML content
161
- logger.info(f"Trying {url} (attempt {attempt + 1}/{retries})")
162
  response = requests.get(
163
  url,
164
  headers=headers,
@@ -170,7 +413,7 @@ def scrape_site_content(query, num_sites=5):
170
  # Verify it's HTML content
171
  content_type = response.headers.get('Content-Type', '').lower()
172
  if 'text/html' not in content_type:
173
- logger.warning(f"Skipping {url} - not HTML content")
174
  break
175
 
176
  # Parse the HTML content
@@ -185,7 +428,7 @@ def scrape_site_content(query, num_sites=5):
185
 
186
  # Skip if not enough content
187
  if len(text_content.split()) < 100: # Skip if less than 100 words
188
- logger.warning(f"Skipping {url} - not enough content")
189
  break
190
 
191
  # Extract all links (limit to first 10)
@@ -227,13 +470,13 @@ def scrape_site_content(query, num_sites=5):
227
  break # Break retry loop on success
228
 
229
  except requests.Timeout:
230
- logger.warning(f"Timeout on {url} (attempt {attempt + 1}/{retries})")
231
  if attempt == retries - 1: # Last attempt
232
- logger.error(f"Skipping {url} after {retries} timeout attempts")
233
  except requests.RequestException as e:
234
- logger.error(f"Error scraping {url} (attempt {attempt + 1}/{retries}): {str(e)}")
235
  if attempt == retries - 1: # Last attempt
236
- logger.error(f"Skipping {url} after {retries} failed attempts")
237
 
238
  # Add a longer delay between retries
239
  if not success and attempt < retries - 1:
@@ -246,7 +489,7 @@ def scrape_site_content(query, num_sites=5):
246
  return results
247
 
248
  except Exception as e:
249
- logger.error(f"Error in search/scraping process: {str(e)}")
250
  # Return whatever results we've managed to gather
251
  return results
252
 
@@ -279,7 +522,6 @@ def api_scrape_sites():
279
  }), 500
280
 
281
  def analyze_with_gpt(scraped_content, research_query, openrouter_key):
282
- logger.info(f"Analyzing content with GPT for query: {research_query}")
283
  """Analyze scraped content using OpenRouter's Gemini model"""
284
  try:
285
  headers = {
@@ -325,11 +567,10 @@ Format your response in markdown with proper headings and citations."""
325
 
326
  return response.json()['choices'][0]['message']['content']
327
  except Exception as e:
328
- logger.error(f"Error in analyze_with_gpt: {str(e)}")
329
  return f"Error analyzing content: {str(e)}"
330
 
331
  def research_topic(query, num_sites=5, openrouter_key=None):
332
- logger.info(f"Starting research for topic: {query}")
333
  """Research a topic using web scraping and GPT analysis"""
334
  try:
335
  # First get web content using existing scrape_site_content function
 
1
  from flask import Flask, jsonify, request
2
  import requests
3
+ import aiohttp
4
  from bs4 import BeautifulSoup
5
  import os
6
  import re
 
11
  from io import BytesIO
12
  from googlesearch import search
13
  import json
14
+ import asyncio
15
+ from typing import Dict, List
16
 
17
  app = Flask(__name__)
18
 
19
+ async def search_images_async(query: str, num_images: int = 5) -> List[Dict]:
20
+ """Search for images asynchronously"""
21
+ headers = {
22
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
23
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
24
+ 'Accept-Language': 'en-US,en;q=0.5',
25
+ 'Accept-Encoding': 'gzip, deflate',
26
+ 'DNT': '1',
27
+ 'Connection': 'keep-alive',
28
+ }
29
+
30
+ formatted_query = urllib.parse.quote(query + " high quality")
31
+ url = f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active"
32
+
33
+ try:
34
+ async with aiohttp.ClientSession() as session:
35
+ async with session.get(url, headers=headers, timeout=30) as response:
36
+ if response.status != 200:
37
+ raise Exception(f"Failed to fetch images: {response.status}")
38
+
39
+ content = await response.text()
40
+ image_urls = re.findall(r'https?://[^"\']*?\.(?:jpg|jpeg|png|gif)', content)
41
+ image_urls = list(dict.fromkeys(image_urls))
42
+
43
+ results = []
44
+ for img_url in image_urls:
45
+ if len(results) >= num_images:
46
+ break
47
+
48
+ if ('gstatic.com' in img_url or
49
+ 'google.com' in img_url or
50
+ 'icon' in img_url.lower() or
51
+ 'thumb' in img_url.lower() or
52
+ 'small' in img_url.lower()):
53
+ continue
54
+
55
+ try:
56
+ async with session.head(img_url, headers=headers, timeout=5) as img_response:
57
+ if img_response.status == 200:
58
+ content_type = img_response.headers.get('Content-Type', '')
59
+ if content_type.startswith('image/'):
60
+ results.append({
61
+ 'url': img_url,
62
+ 'content_type': content_type
63
+ })
64
+
65
+ except Exception as e:
66
+ print(f"Error checking image URL: {str(e)}")
67
+ continue
68
+
69
+ await asyncio.sleep(random.uniform(0.2, 0.5))
70
+
71
+ return results
72
+
73
+ except Exception as e:
74
+ print(f"An error occurred in search_images_async: {str(e)}")
75
+ return []
76
+
77
+ async def get_cover_image_async(query: str) -> str:
78
+ """Get a high-quality cover image URL for a given query asynchronously"""
79
+ try:
80
+ images = await search_images_async(query, num_images=3)
81
+ if not images:
82
+ return None
83
+ return images[0]['url']
84
+ except Exception as e:
85
+ print(f"Error in get_cover_image_async: {str(e)}")
86
+ return None
87
+
88
+ async def scrape_site_content_async(query: str, num_sites: int = 5, session: aiohttp.ClientSession = None) -> List[Dict]:
89
+ """Scrape website content asynchronously"""
90
+ headers = {
91
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
92
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
93
+ 'Accept-Language': 'en-US,en;q=0.5',
94
+ 'Accept-Encoding': 'gzip, deflate',
95
+ 'DNT': '1',
96
+ 'Connection': 'keep-alive',
97
+ }
98
+
99
+ results = []
100
+ scraped = 0
101
+ retries = 2
102
+ timeout = aiohttp.ClientTimeout(total=5)
103
+
104
+ try:
105
+ # Get search results synchronously (googlesearch-python doesn't support async)
106
+ search_results = list(search(query, num=num_sites * 2))
107
+
108
+ should_close_session = False
109
+ if session is None:
110
+ session = aiohttp.ClientSession()
111
+ should_close_session = True
112
+
113
+ try:
114
+ for url in search_results:
115
+ if scraped >= num_sites:
116
+ break
117
+
118
+ success = False
119
+ for attempt in range(retries):
120
+ try:
121
+ async with session.get(url, headers=headers, timeout=timeout, ssl=False) as response:
122
+ if response.status != 200:
123
+ continue
124
+
125
+ content_type = response.headers.get('Content-Type', '').lower()
126
+ if 'text/html' not in content_type:
127
+ break
128
+
129
+ text = await response.text()
130
+ soup = BeautifulSoup(text, 'html.parser')
131
+
132
+ for script in soup(["script", "style"]):
133
+ script.decompose()
134
+
135
+ text_content = soup.get_text(separator='\n', strip=True)[:10000]
136
+
137
+ if len(text_content.split()) < 100:
138
+ break
139
+
140
+ links = []
141
+ for link in soup.find_all('a', href=True)[:10]:
142
+ href = link['href']
143
+ if href.startswith('http'):
144
+ links.append({
145
+ 'text': link.get_text(strip=True),
146
+ 'url': href
147
+ })
148
+
149
+ title = soup.title.string if soup.title else ''
150
+ meta_description = ''
151
+ meta_keywords = ''
152
+
153
+ meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
154
+ if meta_desc_tag:
155
+ meta_description = meta_desc_tag.get('content', '')
156
+
157
+ meta_keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
158
+ if meta_keywords_tag:
159
+ meta_keywords = meta_keywords_tag.get('content', '')
160
+
161
+ results.append({
162
+ 'url': url,
163
+ 'title': title,
164
+ 'meta_description': meta_description,
165
+ 'meta_keywords': meta_keywords,
166
+ 'text_content': text_content,
167
+ 'links': links
168
+ })
169
+
170
+ scraped += 1
171
+ success = True
172
+ await asyncio.sleep(random.uniform(0.5, 1))
173
+ break
174
+
175
+ except Exception as e:
176
+ print(f"Error scraping {url} (attempt {attempt + 1}/{retries}): {str(e)}")
177
+ if attempt == retries - 1:
178
+ print(f"Skipping {url} after {retries} failed attempts")
179
+
180
+ if not success and attempt < retries - 1:
181
+ await asyncio.sleep(random.uniform(1, 2))
182
+
183
+ finally:
184
+ if should_close_session:
185
+ await session.close()
186
+
187
+ return results
188
+
189
+ except Exception as e:
190
+ print(f"Error in scrape_site_content_async: {str(e)}")
191
+ return results
192
+
193
+ async def research_topic_async(query: str, num_sites: int = 5, openrouter_key: str = None, session: aiohttp.ClientSession = None) -> Dict:
194
+ """Research a topic using web scraping and GPT analysis asynchronously"""
195
+ try:
196
+ # First get web content using async scrape_site_content function
197
+ scraped_results = await scrape_site_content_async(query, num_sites, session)
198
+
199
+ # Format scraped content for analysis
200
+ formatted_content = []
201
+ for result in scraped_results:
202
+ formatted_content.append({
203
+ 'source': result['url'],
204
+ 'title': result['title'],
205
+ 'content': result['text_content'][:2000],
206
+ 'meta_info': {
207
+ 'description': result['meta_description'],
208
+ 'keywords': result['meta_keywords']
209
+ }
210
+ })
211
+
212
+ # Get AI analysis of the scraped content
213
+ if openrouter_key:
214
+ async with aiohttp.ClientSession() as analysis_session:
215
+ async with analysis_session.post(
216
+ 'https://openrouter.ai/api/v1/chat/completions',
217
+ headers={
218
+ 'Authorization': f'Bearer {openrouter_key}',
219
+ 'HTTP-Referer': 'http://localhost:5001',
220
+ 'X-Title': 'Research Assistant'
221
+ },
222
+ json={
223
+ 'model': 'google/gemini-2.0-flash-thinking-exp:free',
224
+ 'messages': [{
225
+ 'role': 'user',
226
+ 'content': f"""You are a research assistant analyzing web content to provide comprehensive research.
227
+
228
+ Research Query: {query}
229
+
230
+ Below is content scraped from various web sources. Analyze this content and provide a detailed, well-structured research response.
231
+ Make sure to cite sources when making specific claims.
232
+
233
+ Scraped Content:
234
+ {json.dumps(formatted_content, indent=2)}
235
+
236
+ Please provide:
237
+ 1. A comprehensive analysis of the topic
238
+ 2. Key findings and insights
239
+ 3. Supporting evidence from the sources
240
+ 4. Any additional considerations or caveats
241
+
242
+ Format your response in markdown with proper headings and citations."""
243
+ }]
244
+ }
245
+ ) as response:
246
+ if response.status != 200:
247
+ raise Exception(f"OpenRouter API error: {await response.text()}")
248
+
249
+ response_data = await response.json()
250
+ analysis = response_data['choices'][0]['message']['content']
251
+ else:
252
+ analysis = "No OpenRouter API key provided for analysis"
253
+
254
+ return {
255
+ 'success': True,
256
+ 'query': query,
257
+ 'analysis': analysis,
258
+ 'sources': formatted_content
259
+ }
260
+ except Exception as e:
261
+ return {
262
+ 'success': False,
263
+ 'error': str(e)
264
+ }
265
 
266
  def search_images(query, num_images=5):
 
267
  # Headers to mimic a browser request
268
  headers = {
269
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
 
317
  })
318
 
319
  except Exception as e:
320
+ print(f"Error checking image URL: {str(e)}")
321
  continue
322
 
323
  # Add a small delay between checks
 
326
  return results
327
 
328
  except Exception as e:
329
+ print(f"An error occurred: {str(e)}")
330
  return []
331
 
332
  def get_cover_image(query):
 
333
  """Get a high-quality cover image URL for a given query"""
334
  try:
335
  # Search for images
 
342
  return images[0]['url']
343
 
344
  except Exception as e:
345
+ print(f"Error getting cover image: {str(e)}")
346
  return None
347
 
348
  @app.route('/search_images', methods=['GET'])
 
374
  }), 500
375
 
376
  def scrape_site_content(query, num_sites=5):
 
377
  headers = {
378
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
379
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 
401
  for attempt in range(retries):
402
  try:
403
  # Get the HTML content
404
+ print(f"Trying {url} (attempt {attempt + 1}/{retries})")
405
  response = requests.get(
406
  url,
407
  headers=headers,
 
413
  # Verify it's HTML content
414
  content_type = response.headers.get('Content-Type', '').lower()
415
  if 'text/html' not in content_type:
416
+ print(f"Skipping {url} - not HTML content")
417
  break
418
 
419
  # Parse the HTML content
 
428
 
429
  # Skip if not enough content
430
  if len(text_content.split()) < 100: # Skip if less than 100 words
431
+ print(f"Skipping {url} - not enough content")
432
  break
433
 
434
  # Extract all links (limit to first 10)
 
470
  break # Break retry loop on success
471
 
472
  except requests.Timeout:
473
+ print(f"Timeout on {url} (attempt {attempt + 1}/{retries})")
474
  if attempt == retries - 1: # Last attempt
475
+ print(f"Skipping {url} after {retries} timeout attempts")
476
  except requests.RequestException as e:
477
+ print(f"Error scraping {url} (attempt {attempt + 1}/{retries}): {str(e)}")
478
  if attempt == retries - 1: # Last attempt
479
+ print(f"Skipping {url} after {retries} failed attempts")
480
 
481
  # Add a longer delay between retries
482
  if not success and attempt < retries - 1:
 
489
  return results
490
 
491
  except Exception as e:
492
+ print(f"Error in search/scraping process: {str(e)}")
493
  # Return whatever results we've managed to gather
494
  return results
495
 
 
522
  }), 500
523
 
524
  def analyze_with_gpt(scraped_content, research_query, openrouter_key):
 
525
  """Analyze scraped content using OpenRouter's Gemini model"""
526
  try:
527
  headers = {
 
567
 
568
  return response.json()['choices'][0]['message']['content']
569
  except Exception as e:
570
+ print(f"Error in analyze_with_gpt: {str(e)}")
571
  return f"Error analyzing content: {str(e)}"
572
 
573
  def research_topic(query, num_sites=5, openrouter_key=None):
 
574
  """Research a topic using web scraping and GPT analysis"""
575
  try:
576
  # First get web content using existing scrape_site_content function
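The diff view is truncated above. For clarity, here is a minimal standalone usage sketch (an assumption, not part of the commit) showing the new coroutines being awaited concurrently with a shared aiohttp session, which is the main payoff of the async additions:

# Standalone usage sketch (assumption): exercises the coroutines added by this commit
# outside Flask. Requires aiohttp, beautifulsoup4 and the googlesearch package installed.
import asyncio
import aiohttp

from web_scraper import scrape_site_content_async, get_cover_image_async

async def main():
    # scrape_site_content_async accepts an optional shared ClientSession;
    # get_cover_image_async manages its own session internally.
    async with aiohttp.ClientSession() as session:
        pages, cover_url = await asyncio.gather(
            scrape_site_content_async("renewable energy storage", num_sites=3, session=session),
            get_cover_image_async("renewable energy storage"),
        )
    print(f"Scraped {len(pages)} pages; cover image: {cover_url}")

if __name__ == "__main__":
    asyncio.run(main())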