Pamudu13 committed
Commit 10afb6c · verified · 1 Parent(s): b054134

Update web_scraper.py

Files changed (1)
  1. web_scraper.py +35 -26
web_scraper.py CHANGED
@@ -10,10 +10,15 @@ import base64
 from io import BytesIO
 from googlesearch import search
 import json
+import logging
 
 app = Flask(__name__)
 
+# Get the logger instance
+logger = logging.getLogger(__name__)
+
 def search_images(query, num_images=5):
+    logger.info(f"Searching for images with query: {query}")
     # Headers to mimic a browser request
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
@@ -48,9 +53,9 @@ def search_images(query, num_images=5):
                 break
 
             # Skip small thumbnails, icons, and low-quality images
-            if ('gstatic.com' in img_url or
-                'google.com' in img_url or
-                'icon' in img_url.lower() or
+            if ('gstatic.com' in img_url or
+                'google.com' in img_url or
+                'icon' in img_url.lower() or
                 'thumb' in img_url.lower() or
                 'small' in img_url.lower()):
                 continue
@@ -67,7 +72,7 @@ def search_images(query, num_images=5):
                 })
 
             except Exception as e:
-                print(f"Error checking image URL: {str(e)}")
+                logger.error(f"Error checking image URL: {str(e)}")
                 continue
 
             # Add a small delay between checks
@@ -76,23 +81,24 @@ def search_images(query, num_images=5):
         return results
 
     except Exception as e:
-        print(f"An error occurred: {str(e)}")
+        logger.error(f"An error occurred: {str(e)}")
         return []
 
 def get_cover_image(query):
+    logger.info(f"Getting cover image for query: {query}")
     """Get a high-quality cover image URL for a given query"""
     try:
         # Search for images
         images = search_images(query, num_images=3) # Get top 3 images to choose from
-
+
         if not images:
             return None
-
+
         # Return the first valid image URL
        return images[0]['url']
-
+
     except Exception as e:
-        print(f"Error getting cover image: {str(e)}")
+        logger.error(f"Error getting cover image: {str(e)}")
         return None
 
 @app.route('/search_images', methods=['GET'])
@@ -124,6 +130,7 @@ def api_search_images():
     }), 500
 
 def scrape_site_content(query, num_sites=5):
+    logger.info(f"Scraping content for query: {query} from {num_sites} sites")
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
@@ -141,7 +148,7 @@ def scrape_site_content(query, num_sites=5):
     try:
         # Get more URLs than needed to account for failures
         search_results = list(search(query, num_results=num_sites * 2))
-
+
         # Process each found URL
         for url in search_results:
             if scraped >= num_sites:
@@ -151,10 +158,10 @@ def scrape_site_content(query, num_sites=5):
             for attempt in range(retries):
                 try:
                     # Get the HTML content
-                    print(f"Trying {url} (attempt {attempt + 1}/{retries})")
+                    logger.info(f"Trying {url} (attempt {attempt + 1}/{retries})")
                     response = requests.get(
-                        url,
-                        headers=headers,
+                        url,
+                        headers=headers,
                         timeout=timeout,
                         verify=False # Skip SSL verification
                     )
@@ -163,7 +170,7 @@ def scrape_site_content(query, num_sites=5):
                     # Verify it's HTML content
                     content_type = response.headers.get('Content-Type', '').lower()
                     if 'text/html' not in content_type:
-                        print(f"Skipping {url} - not HTML content")
+                        logger.warning(f"Skipping {url} - not HTML content")
                         break
 
                     # Parse the HTML content
@@ -175,10 +182,10 @@ def scrape_site_content(query, num_sites=5):
 
                     # Extract text content (limit to first 10000 characters)
                     text_content = soup.get_text(separator='\n', strip=True)[:10000]
-
+
                     # Skip if not enough content
                     if len(text_content.split()) < 100: # Skip if less than 100 words
-                        print(f"Skipping {url} - not enough content")
+                        logger.warning(f"Skipping {url} - not enough content")
                         break
 
                     # Extract all links (limit to first 10)
@@ -220,14 +227,14 @@ def scrape_site_content(query, num_sites=5):
                     break # Break retry loop on success
 
                 except requests.Timeout:
-                    print(f"Timeout on {url} (attempt {attempt + 1}/{retries})")
+                    logger.warning(f"Timeout on {url} (attempt {attempt + 1}/{retries})")
                     if attempt == retries - 1: # Last attempt
-                        print(f"Skipping {url} after {retries} timeout attempts")
+                        logger.error(f"Skipping {url} after {retries} timeout attempts")
                 except requests.RequestException as e:
-                    print(f"Error scraping {url} (attempt {attempt + 1}/{retries}): {str(e)}")
+                    logger.error(f"Error scraping {url} (attempt {attempt + 1}/{retries}): {str(e)}")
                     if attempt == retries - 1: # Last attempt
-                        print(f"Skipping {url} after {retries} failed attempts")
-
+                        logger.error(f"Skipping {url} after {retries} failed attempts")
+
                 # Add a longer delay between retries
                 if not success and attempt < retries - 1:
                     time.sleep(random.uniform(1, 2))
@@ -239,7 +246,7 @@ def scrape_site_content(query, num_sites=5):
         return results
 
     except Exception as e:
-        print(f"Error in search/scraping process: {str(e)}")
+        logger.error(f"Error in search/scraping process: {str(e)}")
         # Return whatever results we've managed to gather
         return results
 
@@ -272,6 +279,7 @@ def api_scrape_sites():
     }), 500
 
 def analyze_with_gpt(scraped_content, research_query, openrouter_key):
+    logger.info(f"Analyzing content with GPT for query: {research_query}")
     """Analyze scraped content using OpenRouter's Gemini model"""
     try:
         headers = {
@@ -317,15 +325,16 @@ Format your response in markdown with proper headings and citations."""
 
         return response.json()['choices'][0]['message']['content']
     except Exception as e:
-        print(f"Error in analyze_with_gpt: {str(e)}")
+        logger.error(f"Error in analyze_with_gpt: {str(e)}")
         return f"Error analyzing content: {str(e)}"
 
 def research_topic(query, num_sites=5, openrouter_key=None):
+    logger.info(f"Starting research for topic: {query}")
     """Research a topic using web scraping and GPT analysis"""
    try:
         # First get web content using existing scrape_site_content function
         scraped_results = scrape_site_content(query, num_sites)
-
+
         # Format scraped content for analysis
         formatted_content = []
         for result in scraped_results:
@@ -338,10 +347,10 @@ def research_topic(query, num_sites=5, openrouter_key=None):
                     'keywords': result['meta_keywords']
                 }
             })
-
+
         # Get AI analysis of the scraped content
         analysis = analyze_with_gpt(formatted_content, query, openrouter_key) if openrouter_key else "No OpenRouter API key provided for analysis"
-
+
         return {
             'success': True,
             'query': query,
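
Note: the commit routes output through logging.getLogger(__name__) but does not show where logging itself is configured. Without any configuration, Python's last-resort handler only emits WARNING and above, so the newly added logger.info calls would produce no output. A minimal sketch of one way to surface them at startup, assuming configuration is done once near app = Flask(__name__); the level and format string below are illustrative and not part of this commit:

import logging

# Illustrative configuration (assumed, not in the commit): send records from
# module-level loggers to stderr with timestamps and level names.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)

basicConfig is a no-op when handlers are already attached to the root logger, so a setup like this will not override logging configured by a hosting platform.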