siddhartharya committed on
Commit fe49b51
1 Parent(s): dcf746e

Update app.py

Files changed (1)
  1. app.py +106 -222
app.py CHANGED
@@ -5,14 +5,13 @@ from bs4 import BeautifulSoup
 from sentence_transformers import SentenceTransformer
 import faiss
 import numpy as np
-import asyncio
-import aiohttp
+import requests
+import time
 import re
 import base64
 import logging
 import os
 import sys
-import time

 # Import OpenAI library
 import openai
@@ -71,23 +70,9 @@ GROQ_API_KEY = os.getenv('GROQ_API_KEY')
 if not GROQ_API_KEY:
     logger.error("GROQ_API_KEY environment variable not set.")

-# Set OpenAI API key and base URL to use Groq Cloud API
 openai.api_key = GROQ_API_KEY
 openai.api_base = "https://api.groq.com/openai/v1"

-def extract_retry_after(error_message):
-    """
-    Extract the retry-after time from the rate limit error message.
-    """
-    match = re.search(r'Please try again in (\d+\.?\d*)s', error_message)
-    if match:
-        return float(match.group(1)) + 1  # Add a buffer of 1 second
-    else:
-        return 5  # Default retry after 5 seconds
-
-def exponential_backoff(retries):
-    return min(60, (2 ** retries))  # Cap the wait time at 60 seconds
-
 def extract_main_content(soup):
     """
     Extract the main content from a webpage while filtering out boilerplate content.
@@ -157,10 +142,6 @@ def get_page_metadata(soup):

     return metadata

-async def generate_summary_async(bookmark):
-    async with llm_semaphore:
-        await asyncio.get_event_loop().run_in_executor(None, generate_summary, bookmark)
-
 def generate_summary(bookmark):
     """
     Generate a concise summary for a bookmark using available content and LLM via the Groq Cloud API.
@@ -204,11 +185,11 @@ def generate_summary(bookmark):
         if use_prior_knowledge:
             # Construct prompt to use prior knowledge
             prompt = f"""
-You are a knowledgeable assistant.
+You are a knowledgeable assistant with up-to-date information as of 2023.

 The user provided a URL: {bookmark.get('url')}

-Please provide a concise summary in **no more than two sentences** about this website based on your knowledge.
+Please provide a concise summary in **no more than two sentences** about this website.

 Focus on:
 - The main purpose or topic of the website.
@@ -233,44 +214,24 @@ Be concise and objective.
 """

         # Call the LLM via Groq Cloud API
-        retries = 0
-        max_retries = 5
-        while retries <= max_retries:
-            try:
-                response = openai.ChatCompletion.create(
-                    model='llama-3.1-70b-versatile',
-                    messages=[
-                        {"role": "user", "content": prompt}
-                    ],
-                    max_tokens=100,  # Reduced max tokens
-                    temperature=0.5,
-                )
-                break  # Exit loop if successful
-            except openai.error.RateLimitError as e:
-                retry_after = extract_retry_after(str(e)) or exponential_backoff(retries)
-                logger.warning(f"Rate limit exceeded. Retrying after {retry_after} seconds.")
-                time.sleep(retry_after)
-                retries += 1
-            except Exception as e:
-                logger.error(f"Error generating summary: {e}", exc_info=True)
-                bookmark['summary'] = 'No summary available.'
-                return bookmark
-
+        response = openai.ChatCompletion.create(
+            model='llama-3.1-70b-versatile',
+            messages=[
+                {"role": "user", "content": prompt}
+            ],
+            max_tokens=100,
+            temperature=0.5,
+        )
         summary = response['choices'][0]['message']['content'].strip()
         if not summary:
             raise ValueError("Empty summary received from the model.")
         logger.info("Successfully generated LLM summary")
         bookmark['summary'] = summary
-        return bookmark
+        time.sleep(3)  # Wait to respect rate limits

     except Exception as e:
         logger.error(f"Error generating summary: {e}", exc_info=True)
         bookmark['summary'] = 'No summary available.'
-        return bookmark
-
-async def assign_category_async(bookmark):
-    async with llm_semaphore:
-        await asyncio.get_event_loop().run_in_executor(None, assign_category, bookmark)

 def assign_category(bookmark):
     """
@@ -279,12 +240,12 @@ def assign_category(bookmark):
     if bookmark.get('dead_link'):
         bookmark['category'] = 'Dead Link'
         logger.info(f"Assigned category 'Dead Link' to bookmark: {bookmark.get('url')}")
-        return bookmark
+        return

     summary = bookmark.get('summary', '')
     if not summary:
         bookmark['category'] = 'Uncategorized'
-        return bookmark
+        return

     # Prepare the prompt
     categories_str = ', '.join([f'"{cat}"' for cat in CATEGORIES if cat != 'Dead Link'])
@@ -302,40 +263,28 @@ Categories:
 Respond with only the category name.
 """

-    retries = 0
-    max_retries = 5
-    while retries <= max_retries:
-        try:
-            response = openai.ChatCompletion.create(
-                model='llama-3.1-70b-versatile',
-                messages=[
-                    {"role": "user", "content": prompt}
-                ],
-                max_tokens=10,
-                temperature=0,
-            )
-            break  # Exit loop if successful
-        except openai.error.RateLimitError as e:
-            retry_after = extract_retry_after(str(e)) or exponential_backoff(retries)
-            logger.warning(f"Rate limit exceeded. Retrying after {retry_after} seconds.")
-            time.sleep(retry_after)
-            retries += 1
-        except Exception as e:
-            logger.error(f"Error assigning category: {e}", exc_info=True)
+    try:
+        response = openai.ChatCompletion.create(
+            model='llama-3.1-70b-versatile',
+            messages=[
+                {"role": "user", "content": prompt}
+            ],
+            max_tokens=10,
+            temperature=0,
+        )
+        category = response['choices'][0]['message']['content'].strip().strip('"')
+        # Validate the category
+        if category in CATEGORIES:
+            bookmark['category'] = category
+            logger.info(f"Assigned category '{category}' to bookmark: {bookmark.get('url')}")
+        else:
             bookmark['category'] = 'Uncategorized'
-            return bookmark
-
-    category = response['choices'][0]['message']['content'].strip().strip('"')
+            logger.warning(f"Invalid category '{category}' returned by LLM for bookmark: {bookmark.get('url')}")
+        time.sleep(3)  # Wait to respect rate limits

-    # Validate the category
-    if category in CATEGORIES:
-        bookmark['category'] = category
-        logger.info(f"Assigned category '{category}' to bookmark: {bookmark.get('url')}")
-    else:
+    except Exception as e:
+        logger.error(f"Error assigning category: {e}", exc_info=True)
         bookmark['category'] = 'Uncategorized'
-        logger.warning(f"Invalid category '{category}' returned by LLM for bookmark: {bookmark.get('url')}")
-
-    return bookmark

 def parse_bookmarks(file_content):
     """
@@ -356,109 +305,65 @@ def parse_bookmarks(file_content):
         logger.error("Error parsing bookmarks: %s", e, exc_info=True)
         raise

-async def fetch_url_info(session, bookmark):
+def fetch_url_info(bookmark):
     """
-    Fetch information about a URL asynchronously.
+    Fetch information about a URL.
     """
     url = bookmark['url']
     if url in fetch_cache:
         bookmark.update(fetch_cache[url])
-        return bookmark
-
-    max_retries = 0  # No retries
-    retries = 0
-    timeout_duration = 5  # Reduced timeout
-
-    while retries <= max_retries:
-        try:
-            logger.info(f"Fetching URL info for: {url} (Attempt {retries + 1})")
-            headers = {
-                'User-Agent': 'Mozilla/5.0',
-                'Accept-Language': 'en-US,en;q=0.9',
-            }
-            async with session.get(url, timeout=timeout_duration, headers=headers, ssl=False, allow_redirects=True) as response:
-                bookmark['etag'] = response.headers.get('ETag', 'N/A')
-                bookmark['status_code'] = response.status
-
-                content = await response.text()
-                logger.info(f"Fetched content length for {url}: {len(content)} characters")
-
-                # Handle status codes
-                if response.status >= 500:
-                    # Server error, consider as dead link
-                    bookmark['dead_link'] = True
-                    bookmark['description'] = ''
-                    bookmark['html_content'] = ''
-                    logger.warning(f"Dead link detected: {url} with status {response.status}")
-                else:
-                    bookmark['dead_link'] = False
-                    bookmark['html_content'] = content
-                    bookmark['description'] = ''
-                    logger.info(f"Fetched information for {url}")
-                break  # Exit loop if successful
-
-        except asyncio.exceptions.TimeoutError:
-            bookmark['dead_link'] = False  # Mark as 'Unknown' instead of 'Dead'
-            bookmark['etag'] = 'N/A'
-            bookmark['status_code'] = 'Timeout'
-            bookmark['description'] = ''
-            bookmark['html_content'] = ''
-            bookmark['slow_link'] = True  # Custom flag to indicate slow response
-            logger.warning(f"Timeout while fetching {url}. Marking as 'Slow'.")
-            break  # Exit loop after timeout
-        except Exception as e:
+        return
+
+    try:
+        logger.info(f"Fetching URL info for: {url}")
+        headers = {
+            'User-Agent': 'Mozilla/5.0',
+            'Accept-Language': 'en-US,en;q=0.9',
+        }
+        response = requests.get(url, headers=headers, timeout=5, verify=False, allow_redirects=True)
+        bookmark['etag'] = response.headers.get('ETag', 'N/A')
+        bookmark['status_code'] = response.status_code
+
+        content = response.text
+        logger.info(f"Fetched content length for {url}: {len(content)} characters")
+
+        # Handle status codes
+        if response.status_code >= 500:
+            # Server error, consider as dead link
             bookmark['dead_link'] = True
-            bookmark['etag'] = 'N/A'
-            bookmark['status_code'] = 'Error'
             bookmark['description'] = ''
             bookmark['html_content'] = ''
-            logger.error(f"Error fetching URL info for {url}: {e}", exc_info=True)
-            break
-        finally:
-            fetch_cache[url] = {
-                'etag': bookmark.get('etag'),
-                'status_code': bookmark.get('status_code'),
-                'dead_link': bookmark.get('dead_link'),
-                'description': bookmark.get('description'),
-                'html_content': bookmark.get('html_content', ''),
-                'slow_link': bookmark.get('slow_link', False),
-            }
-    return bookmark
-
-async def process_bookmarks_async(bookmarks_list):
-    """
-    Fetch all bookmarks asynchronously.
-    """
-    logger.info("Processing bookmarks asynchronously")
-    try:
-        connector = aiohttp.TCPConnector(limit=10)  # Increase limit if necessary
-        timeout = aiohttp.ClientTimeout(total=60)  # Set timeout
-        async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
-            tasks = []
-            for bookmark in bookmarks_list:
-                task = asyncio.ensure_future(fetch_url_info(session, bookmark))
-                tasks.append(task)
-            await asyncio.gather(*tasks)
-        logger.info("Completed processing bookmarks asynchronously")
+            logger.warning(f"Dead link detected: {url} with status {response.status_code}")
+        else:
+            bookmark['dead_link'] = False
+            bookmark['html_content'] = content
+            bookmark['description'] = ''
+            logger.info(f"Fetched information for {url}")
+
+    except requests.exceptions.Timeout:
+        bookmark['dead_link'] = False  # Mark as 'Unknown' instead of 'Dead'
+        bookmark['etag'] = 'N/A'
+        bookmark['status_code'] = 'Timeout'
+        bookmark['description'] = ''
+        bookmark['html_content'] = ''
+        bookmark['slow_link'] = True  # Custom flag to indicate slow response
+        logger.warning(f"Timeout while fetching {url}. Marking as 'Slow'.")
     except Exception as e:
-        logger.error(f"Error in asynchronous processing of bookmarks: {e}", exc_info=True)
-        raise
-
-async def process_bookmarks_llm(bookmarks_list):
-    """
-    Process bookmarks asynchronously for LLM API calls.
-    """
-    logger.info("Processing bookmarks with LLM asynchronously")
-    tasks = []
-    for bookmark in bookmarks_list:
-        tasks.append(generate_summary_async(bookmark))
-    await asyncio.gather(*tasks)
-
-    tasks = []
-    for bookmark in bookmarks_list:
-        tasks.append(assign_category_async(bookmark))
-    await asyncio.gather(*tasks)
-    logger.info("Completed LLM processing of bookmarks")
+        bookmark['dead_link'] = True
+        bookmark['etag'] = 'N/A'
+        bookmark['status_code'] = 'Error'
+        bookmark['description'] = ''
+        bookmark['html_content'] = ''
+        logger.error(f"Error fetching URL info for {url}: {e}", exc_info=True)
+    finally:
+        fetch_cache[url] = {
+            'etag': bookmark.get('etag'),
+            'status_code': bookmark.get('status_code'),
+            'dead_link': bookmark.get('dead_link'),
+            'description': bookmark.get('description'),
+            'html_content': bookmark.get('html_content', ''),
+            'slow_link': bookmark.get('slow_link', False),
+        }

 def vectorize_and_index(bookmarks_list):
     """
@@ -489,16 +394,16 @@ def display_bookmarks():
         index = i + 1
         if bookmark.get('dead_link'):
            status = "❌ Dead Link"
-            card_style = "border: 2px solid var(--error-color);"
-            text_style = "color: var(--error-color);"
+            card_style = "border: 2px solid red;"
+            text_style = "color: red;"
         elif bookmark.get('slow_link'):
            status = "⏳ Slow Response"
            card_style = "border: 2px solid orange;"
            text_style = "color: orange;"
         else:
            status = "✅ Active"
-            card_style = "border: 2px solid var(--success-color);"
-            text_style = "color: var(--text-color);"
+            card_style = "border: 2px solid green;"
+            text_style = "color: black;"

         title = bookmark['title']
         url = bookmark['url']
@@ -559,19 +464,14 @@ def process_uploaded_file(file):
     for idx, bookmark in enumerate(bookmarks):
         bookmark['id'] = idx

-    # Asynchronously fetch bookmark info
-    try:
-        asyncio.run(process_bookmarks_async(bookmarks))
-    except Exception as e:
-        logger.error(f"Error processing bookmarks asynchronously: {e}", exc_info=True)
-        return "Error processing bookmarks.", '', gr.update(choices=[]), display_bookmarks()
+    # Fetch bookmark info sequentially
+    for bookmark in bookmarks:
+        fetch_url_info(bookmark)

-    # Asynchronously process bookmarks with LLM
-    try:
-        asyncio.run(process_bookmarks_llm(bookmarks))
-    except Exception as e:
-        logger.error(f"Error processing bookmarks with LLM: {e}", exc_info=True)
-        return "Error processing bookmarks with LLM.", '', gr.update(choices=[]), display_bookmarks()
+    # Process bookmarks sequentially with LLM
+    for bookmark in bookmarks:
+        generate_summary(bookmark)
+        assign_category(bookmark)

     try:
         faiss_index = vectorize_and_index(bookmarks)
@@ -642,7 +542,7 @@ def edit_selected_bookmarks_category(selected_indices, new_category):

     # Update choices and display
     choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
-               for i, bookmark in enumerate(bookmarks)]
+               for i, bookmark in enumerate(bookmarks)]

     return message, gr.update(choices=choices), display_bookmarks()

@@ -718,31 +618,17 @@ Bookmarks:
 Provide a concise and helpful response.
 """

-        retries = 0
-        max_retries = 5
-        while retries <= max_retries:
-            try:
-                response = openai.ChatCompletion.create(
-                    model='llama-3.1-70b-versatile',
-                    messages=[
-                        {"role": "user", "content": prompt}
-                    ],
-                    max_tokens=500,
-                    temperature=0.7,
-                )
-                break  # Exit loop if successful
-            except openai.error.RateLimitError as e:
-                retry_after = extract_retry_after(str(e)) or exponential_backoff(retries)
-                logger.warning(f"Rate limit exceeded. Retrying after {retry_after} seconds.")
-                time.sleep(retry_after)
-                retries += 1
-            except Exception as e:
-                error_message = f"⚠️ Error processing your query: {str(e)}"
-                logger.error(error_message, exc_info=True)
-                return error_message
-
+        response = openai.ChatCompletion.create(
+            model='llama-3.1-70b-versatile',
+            messages=[
+                {"role": "user", "content": prompt}
+            ],
+            max_tokens=500,
+            temperature=0.7,
+        )
         answer = response['choices'][0]['message']['content'].strip()
-        logger.info("Chatbot response generated using Groq Cloud API")
+        logger.info("Chatbot response generated")
+        time.sleep(3)  # Wait to respect rate limits
         return answer

     except Exception as e:
@@ -868,6 +754,4 @@ def build_app():
        print(f"Error building the app: {e}")

 if __name__ == "__main__":
-    # Define a semaphore to limit concurrent LLM API calls
-    llm_semaphore = asyncio.Semaphore(3)  # Adjust based on allowed concurrency
     build_app()
 
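For reference, here is a minimal sketch of how the now-synchronous helpers are meant to be driven after this change, mirroring the sequential loops added to process_uploaded_file in the diff above. The bookmarks list below is a hypothetical sample (in app.py it comes from parse_bookmarks()), and the import assumes app.py can be imported as a module.

    # Illustrative sketch only, not part of the commit; assumes fetch_url_info,
    # generate_summary and assign_category are defined as in the diff above.
    from app import fetch_url_info, generate_summary, assign_category

    # Hypothetical sample input; in app.py this list comes from parse_bookmarks().
    bookmarks = [{'id': 0, 'url': 'https://example.com', 'title': 'Example'}]

    # First pass: fetch each page synchronously with requests.
    for bookmark in bookmarks:
        fetch_url_info(bookmark)

    # Second pass: one LLM call per step; each helper sleeps 3 seconds after a
    # successful call to stay under Groq rate limits, replacing the removed
    # retry/backoff logic.
    for bookmark in bookmarks:
        generate_summary(bookmark)
        assign_category(bookmark)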