siddhartharya committed on
Commit
e44b0c3
1 Parent(s): fb6f5e6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +138 -84
app.py CHANGED
@@ -12,6 +12,7 @@ import base64
12
  import logging
13
  import os
14
  import sys
 
15
 
16
  # Import OpenAI library
17
  import openai
@@ -74,6 +75,16 @@ if not GROQ_API_KEY:
74
  openai.api_key = GROQ_API_KEY
75
  openai.api_base = "https://api.groq.com/openai/v1"
76
 
 
 
 
 
 
 
 
 
 
 
77
  def extract_main_content(soup):
78
  """
79
  Extract the main content from a webpage while filtering out boilerplate content.
@@ -173,10 +184,13 @@ def generate_summary(bookmark):
173
  content_text = '\n'.join(content_parts)
174
 
175
  # Detect insufficient or erroneous content
176
- error_keywords = ['Access Denied', 'Error', 'Security Check', 'Cloudflare', 'captcha', 'unusual traffic', 'Page Not Found', '404 Not Found', 'Forbidden']
177
- if not content_text or len(content_text.split()) < 50 or any(keyword.lower() in content_text.lower() for keyword in error_keywords):
 
 
 
178
  use_prior_knowledge = True
179
- logger.info(f"Content for {bookmark.get('url')} is insufficient or contains error messages. Instructing LLM to use prior knowledge.")
180
  else:
181
  use_prior_knowledge = False
182
 
@@ -205,8 +219,6 @@ Analyze the following webpage content:
205
 
206
  {content_text}
207
 
208
- If the content is insufficient or seems to be an error page, please use your own knowledge to provide an accurate summary.
209
-
210
  Provide a concise summary (2-3 sentences) focusing on:
211
  - The main purpose or topic of the page.
212
  - Key information or features.
@@ -216,14 +228,25 @@ Be factual and objective.
216
  """
217
 
218
  # Call the LLM via Groq Cloud API
219
- response = openai.ChatCompletion.create(
220
- model='llama-3.1-70b-versatile',
221
- messages=[
222
- {"role": "user", "content": prompt}
223
- ],
224
- max_tokens=200,
225
- temperature=0.5,
226
- )
 
 
 
 
 
 
 
 
 
 
 
227
 
228
  summary = response['choices'][0]['message']['content'].strip()
229
  if not summary:
@@ -265,49 +288,64 @@ async def fetch_url_info(session, bookmark):
265
  bookmark.update(fetch_cache[url])
266
  return bookmark
267
 
268
- try:
269
- logger.info(f"Fetching URL info for: {url}")
270
- headers = {
271
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
272
- 'AppleWebKit/537.36 (KHTML, like Gecko) '
273
- 'Chrome/91.0.4472.124 Safari/537.36',
274
- 'Accept-Language': 'en-US,en;q=0.9',
275
- }
276
- async with session.get(url, timeout=20, headers=headers, ssl=False, allow_redirects=True) as response:
277
- bookmark['etag'] = response.headers.get('ETag', 'N/A')
278
- bookmark['status_code'] = response.status
279
-
280
- content = await response.text()
281
- logger.info(f"Fetched content length for {url}: {len(content)} characters")
282
-
283
- # Handle status codes
284
- if response.status >= 500:
285
- # Server error, consider as dead link
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
  bookmark['dead_link'] = True
 
 
287
  bookmark['description'] = ''
288
  bookmark['html_content'] = ''
289
- logger.warning(f"Dead link detected: {url} with status {response.status}")
290
- else:
291
- bookmark['dead_link'] = False
292
- bookmark['html_content'] = content
293
- bookmark['description'] = ''
294
- logger.info(f"Fetched information for {url}")
295
-
296
- except Exception as e:
297
- bookmark['dead_link'] = True
298
- bookmark['etag'] = 'N/A'
299
- bookmark['status_code'] = 'N/A'
300
- bookmark['description'] = ''
301
- bookmark['html_content'] = ''
302
- logger.error(f"Error fetching URL info for {url}: {e}", exc_info=True)
303
- finally:
304
- fetch_cache[url] = {
305
- 'etag': bookmark.get('etag'),
306
- 'status_code': bookmark.get('status_code'),
307
- 'dead_link': bookmark.get('dead_link'),
308
- 'description': bookmark.get('description'),
309
- 'html_content': bookmark.get('html_content', ''),
310
- }
311
  return bookmark
312
 
313
  async def process_bookmarks_async(bookmarks_list):
@@ -317,7 +355,7 @@ async def process_bookmarks_async(bookmarks_list):
317
  logger.info("Processing bookmarks asynchronously")
318
  try:
319
  connector = aiohttp.TCPConnector(limit=5) # Limit concurrent connections
320
- timeout = aiohttp.ClientTimeout(total=30) # Set timeout
321
  async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
322
  tasks = []
323
  for bookmark in bookmarks_list:
@@ -359,32 +397,37 @@ Categories:
359
  Respond with only the category name.
360
  """
361
 
362
- try:
363
- response = openai.ChatCompletion.create(
364
- model='llama-3.1-70b-versatile',
365
- messages=[
366
- {"role": "user", "content": prompt}
367
- ],
368
- max_tokens=10,
369
- temperature=0,
370
- )
371
-
372
- category = response['choices'][0]['message']['content'].strip().strip('"')
373
-
374
- # Validate the category
375
- if category in CATEGORIES:
376
- bookmark['category'] = category
377
- logger.info(f"Assigned category '{category}' to bookmark: {bookmark.get('url')}")
378
- else:
379
  bookmark['category'] = 'Uncategorized'
380
- logger.warning(f"Invalid category '{category}' returned by LLM for bookmark: {bookmark.get('url')}")
381
 
382
- return bookmark
383
 
384
- except Exception as e:
385
- logger.error(f"Error assigning category: {e}", exc_info=True)
 
 
 
386
  bookmark['category'] = 'Uncategorized'
387
- return bookmark
 
 
388
 
389
  def vectorize_and_index(bookmarks_list):
390
  """
@@ -637,14 +680,25 @@ Bookmarks:
637
  Provide a concise and helpful response.
638
  """
639
 
640
- response = openai.ChatCompletion.create(
641
- model='llama-3.1-70b-versatile',
642
- messages=[
643
- {"role": "user", "content": prompt}
644
- ],
645
- max_tokens=500,
646
- temperature=0.7,
647
- )
 
 
 
 
 
 
 
 
 
 
 
648
 
649
  answer = response['choices'][0]['message']['content'].strip()
650
  logger.info("Chatbot response generated using Groq Cloud API")
 
12
  import logging
13
  import os
14
  import sys
15
+ import time
16
 
17
  # Import OpenAI library
18
  import openai
 
75
  openai.api_key = GROQ_API_KEY
76
  openai.api_base = "https://api.groq.com/openai/v1"
77
 
78
def extract_retry_after(error_message):
    """
    Parse a rate-limit error message and return the seconds to wait.

    Searches for the API's 'Please try again in <seconds>s' phrase; when
    present, returns that value plus a one-second safety buffer, otherwise
    falls back to a 5-second default wait.
    """
    found = re.search(r'Please try again in (\d+\.?\d*)s', error_message)
    return float(found.group(1)) + 1 if found else 5
87
+
88
  def extract_main_content(soup):
89
  """
90
  Extract the main content from a webpage while filtering out boilerplate content.
 
184
  content_text = '\n'.join(content_parts)
185
 
186
  # Detect insufficient or erroneous content
187
+ error_keywords = ['Access Denied', 'Security Check', 'Cloudflare', 'captcha', 'unusual traffic']
188
+ if not content_text or len(content_text.split()) < 50:
189
+ use_prior_knowledge = True
190
+ logger.info(f"Content for {bookmark.get('url')} is insufficient. Instructing LLM to use prior knowledge.")
191
+ elif any(keyword.lower() in content_text.lower() for keyword in error_keywords):
192
  use_prior_knowledge = True
193
+ logger.info(f"Content for {bookmark.get('url')} contains error messages. Instructing LLM to use prior knowledge.")
194
  else:
195
  use_prior_knowledge = False
196
 
 
219
 
220
  {content_text}
221
 
 
 
222
  Provide a concise summary (2-3 sentences) focusing on:
223
  - The main purpose or topic of the page.
224
  - Key information or features.
 
228
  """
229
 
230
  # Call the LLM via Groq Cloud API
231
+ while True:
232
+ try:
233
+ response = openai.ChatCompletion.create(
234
+ model='llama-3.1-70b-versatile',
235
+ messages=[
236
+ {"role": "user", "content": prompt}
237
+ ],
238
+ max_tokens=200,
239
+ temperature=0.5,
240
+ )
241
+ break # Exit loop if successful
242
+ except openai.error.RateLimitError as e:
243
+ retry_after = extract_retry_after(str(e))
244
+ logger.warning(f"Rate limit exceeded. Retrying after {retry_after} seconds.")
245
+ time.sleep(retry_after)
246
+ except Exception as e:
247
+ logger.error(f"Error generating summary: {e}", exc_info=True)
248
+ bookmark['summary'] = 'No summary available.'
249
+ return bookmark
250
 
251
  summary = response['choices'][0]['message']['content'].strip()
252
  if not summary:
 
288
  bookmark.update(fetch_cache[url])
289
  return bookmark
290
 
291
+ max_retries = 3
292
+ retries = 0
293
+ while retries < max_retries:
294
+ try:
295
+ logger.info(f"Fetching URL info for: {url} (Attempt {retries + 1})")
296
+ headers = {
297
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
298
+ 'AppleWebKit/537.36 (KHTML, like Gecko) '
299
+ 'Chrome/91.0.4472.124 Safari/537.36',
300
+ 'Accept-Language': 'en-US,en;q=0.9',
301
+ }
302
+ async with session.get(url, timeout=60, headers=headers, ssl=False, allow_redirects=True) as response:
303
+ bookmark['etag'] = response.headers.get('ETag', 'N/A')
304
+ bookmark['status_code'] = response.status
305
+
306
+ content = await response.text()
307
+ logger.info(f"Fetched content length for {url}: {len(content)} characters")
308
+
309
+ # Handle status codes
310
+ if response.status >= 500:
311
+ # Server error, consider as dead link
312
+ bookmark['dead_link'] = True
313
+ bookmark['description'] = ''
314
+ bookmark['html_content'] = ''
315
+ logger.warning(f"Dead link detected: {url} with status {response.status}")
316
+ else:
317
+ bookmark['dead_link'] = False
318
+ bookmark['html_content'] = content
319
+ bookmark['description'] = ''
320
+ logger.info(f"Fetched information for {url}")
321
+ break # Exit loop if successful
322
+
323
+ except asyncio.exceptions.TimeoutError:
324
+ retries += 1
325
+ logger.warning(f"Timeout while fetching {url}. Retrying ({retries}/{max_retries})...")
326
+ if retries == max_retries:
327
  bookmark['dead_link'] = True
328
+ bookmark['etag'] = 'N/A'
329
+ bookmark['status_code'] = 'Timeout'
330
  bookmark['description'] = ''
331
  bookmark['html_content'] = ''
332
+ logger.error(f"Max retries reached for {url}. Marking as dead link.")
333
+ except Exception as e:
334
+ bookmark['dead_link'] = True
335
+ bookmark['etag'] = 'N/A'
336
+ bookmark['status_code'] = 'Error'
337
+ bookmark['description'] = ''
338
+ bookmark['html_content'] = ''
339
+ logger.error(f"Error fetching URL info for {url}: {e}", exc_info=True)
340
+ break
341
+ finally:
342
+ fetch_cache[url] = {
343
+ 'etag': bookmark.get('etag'),
344
+ 'status_code': bookmark.get('status_code'),
345
+ 'dead_link': bookmark.get('dead_link'),
346
+ 'description': bookmark.get('description'),
347
+ 'html_content': bookmark.get('html_content', ''),
348
+ }
 
 
 
 
 
349
  return bookmark
350
 
351
  async def process_bookmarks_async(bookmarks_list):
 
355
  logger.info("Processing bookmarks asynchronously")
356
  try:
357
  connector = aiohttp.TCPConnector(limit=5) # Limit concurrent connections
358
+ timeout = aiohttp.ClientTimeout(total=60) # Set timeout
359
  async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
360
  tasks = []
361
  for bookmark in bookmarks_list:
 
397
  Respond with only the category name.
398
  """
399
 
400
+ while True:
401
+ try:
402
+ response = openai.ChatCompletion.create(
403
+ model='llama-3.1-70b-versatile',
404
+ messages=[
405
+ {"role": "user", "content": prompt}
406
+ ],
407
+ max_tokens=10,
408
+ temperature=0,
409
+ )
410
+ break # Exit loop if successful
411
+ except openai.error.RateLimitError as e:
412
+ retry_after = extract_retry_after(str(e))
413
+ logger.warning(f"Rate limit exceeded. Retrying after {retry_after} seconds.")
414
+ time.sleep(retry_after)
415
+ except Exception as e:
416
+ logger.error(f"Error assigning category: {e}", exc_info=True)
417
  bookmark['category'] = 'Uncategorized'
418
+ return bookmark
419
 
420
+ category = response['choices'][0]['message']['content'].strip().strip('"')
421
 
422
+ # Validate the category
423
+ if category in CATEGORIES:
424
+ bookmark['category'] = category
425
+ logger.info(f"Assigned category '{category}' to bookmark: {bookmark.get('url')}")
426
+ else:
427
  bookmark['category'] = 'Uncategorized'
428
+ logger.warning(f"Invalid category '{category}' returned by LLM for bookmark: {bookmark.get('url')}")
429
+
430
+ return bookmark
431
 
432
  def vectorize_and_index(bookmarks_list):
433
  """
 
680
  Provide a concise and helpful response.
681
  """
682
 
683
+ while True:
684
+ try:
685
+ response = openai.ChatCompletion.create(
686
+ model='llama-3.1-70b-versatile',
687
+ messages=[
688
+ {"role": "user", "content": prompt}
689
+ ],
690
+ max_tokens=500,
691
+ temperature=0.7,
692
+ )
693
+ break # Exit loop if successful
694
+ except openai.error.RateLimitError as e:
695
+ retry_after = extract_retry_after(str(e))
696
+ logger.warning(f"Rate limit exceeded. Retrying after {retry_after} seconds.")
697
+ time.sleep(retry_after)
698
+ except Exception as e:
699
+ error_message = f"⚠️ Error processing your query: {str(e)}"
700
+ logger.error(error_message, exc_info=True)
701
+ return error_message
702
 
703
  answer = response['choices'][0]['message']['content'].strip()
704
  logger.info("Chatbot response generated using Groq Cloud API")