Pamudu13 committed
Commit b556016 · verified · 1 Parent(s): aa867d9

Update web_scraper.py

Files changed (1)
  1. web_scraper.py +0 -250
web_scraper.py CHANGED
@@ -1,6 +1,5 @@
 from flask import Flask, jsonify, request
 import requests
-import aiohttp
 from bs4 import BeautifulSoup
 import os
 import re
@@ -11,258 +10,9 @@ import base64
 from io import BytesIO
 from googlesearch import search
 import json
-import asyncio
-from typing import Dict, List
 
 app = Flask(__name__)
 
-async def search_images_async(query: str, num_images: int = 5) -> List[Dict]:
-    """Search for images asynchronously"""
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.5',
-        'Accept-Encoding': 'gzip, deflate',
-        'DNT': '1',
-        'Connection': 'keep-alive',
-    }
-
-    formatted_query = urllib.parse.quote(query + " high quality")
-    url = f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active"
-
-    try:
-        async with aiohttp.ClientSession() as session:
-            async with session.get(url, headers=headers, timeout=30) as response:
-                if response.status != 200:
-                    raise Exception(f"Failed to fetch images: {response.status}")
-
-                content = await response.text()
-                image_urls = re.findall(r'https?://[^"\']*?(?:jpg|jpeg|png|gif)', content)
-                image_urls = list(dict.fromkeys(image_urls))
-
-                results = []
-                for img_url in image_urls:
-                    if len(results) >= num_images:
-                        break
-
-                    if ('gstatic.com' in img_url or
-                        'google.com' in img_url or
-                        'icon' in img_url.lower() or
-                        'thumb' in img_url.lower() or
-                        'small' in img_url.lower()):
-                        continue
-
-                    try:
-                        async with session.head(img_url, headers=headers, timeout=5) as img_response:
-                            if img_response.status == 200:
-                                content_type = img_response.headers.get('Content-Type', '')
-                                if content_type.startswith('image/'):
-                                    results.append({
-                                        'url': img_url,
-                                        'content_type': content_type
-                                    })
-
-                    except Exception as e:
-                        print(f"Error checking image URL: {str(e)}")
-                        continue
-
-                    await asyncio.sleep(random.uniform(0.2, 0.5))
-
-                return results
-
-    except Exception as e:
-        print(f"An error occurred in search_images_async: {str(e)}")
-        return []
-
-async def get_cover_image_async(query: str) -> str:
-    """Get a high-quality cover image URL for a given query asynchronously"""
-    try:
-        images = await search_images_async(query, num_images=3)
-        if not images:
-            return None
-        return images[0]['url']
-    except Exception as e:
-        print(f"Error in get_cover_image_async: {str(e)}")
-        return None
-
-async def scrape_site_content_async(query: str, num_sites: int = 5, session: aiohttp.ClientSession = None) -> List[Dict]:
-    """Scrape website content asynchronously"""
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.5',
-        'Accept-Encoding': 'gzip, deflate',
-        'DNT': '1',
-        'Connection': 'keep-alive',
-    }
-
-    results = []
-    scraped = 0
-    retries = 2
-    timeout = aiohttp.ClientTimeout(total=5)
-
-    try:
-        # Get search results synchronously (googlesearch-python doesn't support async)
-        search_results = list(search(query, num=num_sites * 2))
-
-        should_close_session = False
-        if session is None:
-            session = aiohttp.ClientSession()
-            should_close_session = True
-
-        try:
-            for url in search_results:
-                if scraped >= num_sites:
-                    break
-
-                success = False
-                for attempt in range(retries):
-                    try:
-                        async with session.get(url, headers=headers, timeout=timeout, ssl=False) as response:
-                            if response.status != 200:
-                                continue
-
-                            content_type = response.headers.get('Content-Type', '').lower()
-                            if 'text/html' not in content_type:
-                                break
-
-                            text = await response.text()
-                            soup = BeautifulSoup(text, 'html.parser')
-
-                            for script in soup(["script", "style"]):
-                                script.decompose()
-
-                            text_content = soup.get_text(separator='\n', strip=True)[:10000]
-
-                            if len(text_content.split()) < 100:
-                                break
-
-                            links = []
-                            for link in soup.find_all('a', href=True)[:10]:
-                                href = link['href']
-                                if href.startswith('http'):
-                                    links.append({
-                                        'text': link.get_text(strip=True),
-                                        'url': href
-                                    })
-
-                            title = soup.title.string if soup.title else ''
-                            meta_description = ''
-                            meta_keywords = ''
-
-                            meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
-                            if meta_desc_tag:
-                                meta_description = meta_desc_tag.get('content', '')
-
-                            meta_keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
-                            if meta_keywords_tag:
-                                meta_keywords = meta_keywords_tag.get('content', '')
-
-                            results.append({
-                                'url': url,
-                                'title': title,
-                                'meta_description': meta_description,
-                                'meta_keywords': meta_keywords,
-                                'text_content': text_content,
-                                'links': links
-                            })
-
-                            scraped += 1
-                            success = True
-                            await asyncio.sleep(random.uniform(0.5, 1))
-                            break
-
-                    except Exception as e:
-                        print(f"Error scraping {url} (attempt {attempt + 1}/{retries}): {str(e)}")
-                        if attempt == retries - 1:
-                            print(f"Skipping {url} after {retries} failed attempts")
-
-                    if not success and attempt < retries - 1:
-                        await asyncio.sleep(random.uniform(1, 2))
-
-        finally:
-            if should_close_session:
-                await session.close()
-
-        return results
-
-    except Exception as e:
-        print(f"Error in scrape_site_content_async: {str(e)}")
-        return results
-
-async def research_topic_async(query: str, num_sites: int = 5, openrouter_key: str = None, session: aiohttp.ClientSession = None) -> Dict:
-    """Research a topic using web scraping and GPT analysis asynchronously"""
-    try:
-        # First get web content using async scrape_site_content function
-        scraped_results = await scrape_site_content_async(query, num_sites, session)
-
-        # Format scraped content for analysis
-        formatted_content = []
-        for result in scraped_results:
-            formatted_content.append({
-                'source': result['url'],
-                'title': result['title'],
-                'content': result['text_content'][:2000],
-                'meta_info': {
-                    'description': result['meta_description'],
-                    'keywords': result['meta_keywords']
-                }
-            })
-
-        # Get AI analysis of the scraped content
-        if openrouter_key:
-            async with aiohttp.ClientSession() as analysis_session:
-                async with analysis_session.post(
-                    'https://openrouter.ai/api/v1/chat/completions',
-                    headers={
-                        'Authorization': f'Bearer {openrouter_key}',
-                        'HTTP-Referer': 'http://localhost:5001',
-                        'X-Title': 'Research Assistant'
-                    },
-                    json={
-                        'model': 'google/gemini-2.0-flash-thinking-exp:free',
-                        'messages': [{
-                            'role': 'user',
-                            'content': f"""You are a research assistant analyzing web content to provide comprehensive research.
-
-Research Query: {query}
-
-Below is content scraped from various web sources. Analyze this content and provide a detailed, well-structured research response.
-Make sure to cite sources when making specific claims.
-
-Scraped Content:
-{json.dumps(formatted_content, indent=2)}
-
-Please provide:
-1. A comprehensive analysis of the topic
-2. Key findings and insights
-3. Supporting evidence from the sources
-4. Any additional considerations or caveats
-
-Format your response in markdown with proper headings and citations."""
-                        }]
-                    }
-                ) as response:
-                    if response.status != 200:
-                        raise Exception(f"OpenRouter API error: {await response.text()}")
-
-                    response_data = await response.json()
-                    analysis = response_data['choices'][0]['message']['content']
-        else:
-            analysis = "No OpenRouter API key provided for analysis"
-
-        return {
-            'success': True,
-            'query': query,
-            'analysis': analysis,
-            'sources': formatted_content
-        }
-    except Exception as e:
-        return {
-            'success': False,
-            'error': str(e)
-        }
-
 def search_images(query, num_images=5):
     # Headers to mimic a browser request
     headers = {
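For anyone still pinning the parent revision (aa867d9), the removed coroutines carried no event-loop wiring of their own, so a caller would presumably have driven them roughly as sketched below. This is a minimal, hypothetical usage example rather than code from the repository: the query string is illustrative, and the import only resolves against the pre-b556016 version of web_scraper.py that still defines the async helpers.

import asyncio
import aiohttp

# Hypothetical driver for the async API removed in this commit; it assumes the
# parent revision of web_scraper.py, where these coroutines are still defined.
from web_scraper import get_cover_image_async, research_topic_async

async def main() -> None:
    async with aiohttp.ClientSession() as session:
        # Passing openrouter_key=None skips the OpenRouter analysis step and
        # returns only the scraped sources (see the removed code above).
        report = await research_topic_async(
            "solar panel efficiency",   # illustrative query
            num_sites=3,
            openrouter_key=None,
            session=session,
        )
        cover_url = await get_cover_image_async("solar panel efficiency")
        print(report["success"], len(report.get("sources", [])), cover_url)

if __name__ == "__main__":
    asyncio.run(main())

After this commit only the synchronous path remains in this file (for example the search_images function kept as context above), so callers no longer need an event loop to reach it.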
 