siddhartharya committed
Commit 8f32801 · verified · 1 Parent(s): 1dbb950

Update app.py

Files changed (1)
  1. app.py +112 -239
app.py CHANGED
@@ -85,6 +85,9 @@ def extract_retry_after(error_message):
    else:
        return 5  # Default retry after 5 seconds

def extract_main_content(soup):
    """
    Extract the main content from a webpage while filtering out boilerplate content.
@@ -154,6 +157,10 @@ def get_page_metadata(soup):

    return metadata

def generate_summary(bookmark):
    """
    Generate a concise summary for a bookmark using available content and LLM via the Groq Cloud API.
@@ -226,7 +233,9 @@ Be concise and objective.
    """

    # Call the LLM via Groq Cloud API
-   while True:
        try:
            response = openai.ChatCompletion.create(
                model='llama-3.1-70b-versatile',
@@ -238,9 +247,10 @@ Be concise and objective.
            )
            break  # Exit loop if successful
        except openai.error.RateLimitError as e:
-           retry_after = extract_retry_after(str(e))
            logger.warning(f"Rate limit exceeded. Retrying after {retry_after} seconds.")
            time.sleep(retry_after)
        except Exception as e:
            logger.error(f"Error generating summary: {e}", exc_info=True)
            bookmark['summary'] = 'No summary available.'
@@ -258,6 +268,75 @@ Be concise and objective.
        bookmark['summary'] = 'No summary available.'
    return bookmark

def parse_bookmarks(file_content):
    """
    Parse bookmarks from HTML file.
@@ -286,17 +365,15 @@ async def fetch_url_info(session, bookmark):
        bookmark.update(fetch_cache[url])
        return bookmark

-   max_retries = 1
    retries = 0
-   timeout_duration = 15  # Reduced timeout

    while retries <= max_retries:
        try:
            logger.info(f"Fetching URL info for: {url} (Attempt {retries + 1})")
            headers = {
-               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
-                             'AppleWebKit/537.36 (KHTML, like Gecko) '
-                             'Chrome/91.0.4472.124 Safari/537.36',
                'Accept-Language': 'en-US,en;q=0.9',
            }
            async with session.get(url, timeout=timeout_duration, headers=headers, ssl=False, allow_redirects=True) as response:
@@ -321,15 +398,14 @@ async def fetch_url_info(session, bookmark):
                break  # Exit loop if successful

        except asyncio.exceptions.TimeoutError:
-           retries += 1
-           if retries > max_retries:
-               bookmark['dead_link'] = False  # Mark as 'Unknown' instead of 'Dead'
-               bookmark['etag'] = 'N/A'
-               bookmark['status_code'] = 'Timeout'
-               bookmark['description'] = ''
-               bookmark['html_content'] = ''
-               bookmark['slow_link'] = True  # Custom flag to indicate slow response
-               logger.warning(f"Timeout while fetching {url}. Marking as 'Slow'.")
        except Exception as e:
            bookmark['dead_link'] = True
            bookmark['etag'] = 'N/A'
@@ -355,7 +431,7 @@ async def process_bookmarks_async(bookmarks_list):
    """
    logger.info("Processing bookmarks asynchronously")
    try:
-       connector = aiohttp.TCPConnector(limit=5)  # Limit concurrent connections
        timeout = aiohttp.ClientTimeout(total=60)  # Set timeout
        async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
            tasks = []
@@ -368,67 +444,17 @@ async def process_bookmarks_async(bookmarks_list):
        logger.error(f"Error in asynchronous processing of bookmarks: {e}", exc_info=True)
        raise

-def assign_category(bookmark):
-    """
-    Assign a category to a bookmark using the LLM based on its summary via the Groq Cloud API.
-    """
-    if bookmark.get('dead_link'):
-        bookmark['category'] = 'Dead Link'
-        logger.info(f"Assigned category 'Dead Link' to bookmark: {bookmark.get('url')}")
-        return bookmark
-
-    summary = bookmark.get('summary', '')
-    if not summary:
-        bookmark['category'] = 'Uncategorized'
-        return bookmark
-
-    # Prepare the prompt
-    categories_str = ', '.join([f'"{cat}"' for cat in CATEGORIES if cat != 'Dead Link'])
-    prompt = f"""
-    You are a helpful assistant that categorizes webpages.
-
-    Based on the following summary, assign the most appropriate category from the list below.
-
-    Summary:
-    {summary}
-
-    Categories:
-    {categories_str}
-
-    Respond with only the category name.
-    """
-
-    while True:
-        try:
-            response = openai.ChatCompletion.create(
-                model='llama-3.1-70b-versatile',
-                messages=[
-                    {"role": "user", "content": prompt}
-                ],
-                max_tokens=10,
-                temperature=0,
-            )
-            break  # Exit loop if successful
-        except openai.error.RateLimitError as e:
-            retry_after = extract_retry_after(str(e))
-            logger.warning(f"Rate limit exceeded. Retrying after {retry_after} seconds.")
-            time.sleep(retry_after)
-        except Exception as e:
-            logger.error(f"Error assigning category: {e}", exc_info=True)
-            bookmark['category'] = 'Uncategorized'
-            return bookmark
-
-    category = response['choices'][0]['message']['content'].strip().strip('"')
-
-    # Validate the category
-    if category in CATEGORIES:
-        bookmark['category'] = category
-        logger.info(f"Assigned category '{category}' to bookmark: {bookmark.get('url')}")
-    else:
-        bookmark['category'] = 'Uncategorized'
-        logger.warning(f"Invalid category '{category}' returned by LLM for bookmark: {bookmark.get('url')}")
-
-    return bookmark

def vectorize_and_index(bookmarks_list):
    """
@@ -536,12 +562,12 @@ def process_uploaded_file(file):
        logger.error(f"Error processing bookmarks asynchronously: {e}", exc_info=True)
        return "Error processing bookmarks.", '', gr.update(choices=[]), display_bookmarks()

-   # Process bookmarks sequentially
-   for bookmark in bookmarks:
-       generate_summary(bookmark)
-       time.sleep(0.5)
-       assign_category(bookmark)
-       time.sleep(0.5)

    try:
        faiss_index = vectorize_and_index(bookmarks)
@@ -559,164 +585,9 @@ def process_uploaded_file(file):

    return message, bookmark_html, gr.update(choices=choices), bookmark_html

-def delete_selected_bookmarks(selected_indices):
-    """
-    Delete selected bookmarks and remove their vectors from the FAISS index.
-    """
-    global bookmarks, faiss_index
-    if not selected_indices:
-        return "⚠️ No bookmarks selected.", gr.update(choices=[]), display_bookmarks()
-
-    ids_to_delete = []
-    indices_to_delete = []
-    for s in selected_indices:
-        idx = int(s.split('.')[0]) - 1
-        if 0 <= idx < len(bookmarks):
-            bookmark_id = bookmarks[idx]['id']
-            ids_to_delete.append(bookmark_id)
-            indices_to_delete.append(idx)
-            logger.info(f"Deleting bookmark at index {idx + 1}")
-
-    # Remove vectors from FAISS index
-    if faiss_index is not None and ids_to_delete:
-        faiss_index.remove_ids(np.array(ids_to_delete, dtype=np.int64))
-
-    # Remove bookmarks from the list (reverse order to avoid index shifting)
-    for idx in sorted(indices_to_delete, reverse=True):
-        bookmarks.pop(idx)
-
-    message = "🗑️ Selected bookmarks deleted successfully."
-    logger.info(message)
-    choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
-               for i, bookmark in enumerate(bookmarks)]
-
-    return message, gr.update(choices=choices), display_bookmarks()
-
-def edit_selected_bookmarks_category(selected_indices, new_category):
-    """
-    Edit category of selected bookmarks.
-    """
-    if not selected_indices:
-        return "⚠️ No bookmarks selected.", gr.update(choices=[]), display_bookmarks()
-    if not new_category:
-        return "⚠️ No new category selected.", gr.update(choices=[]), display_bookmarks()
-
-    indices = [int(s.split('.')[0])-1 for s in selected_indices]
-    for idx in indices:
-        if 0 <= idx < len(bookmarks):
-            bookmarks[idx]['category'] = new_category
-            logger.info(f"Updated category for bookmark {idx + 1} to {new_category}")
-
-    message = "✏️ Category updated for selected bookmarks."
-    logger.info(message)
-
-    # Update choices and display
-    choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
-               for i, bookmark in enumerate(bookmarks)]
-
-    return message, gr.update(choices=choices), display_bookmarks()
-
-def export_bookmarks():
-    """
-    Export bookmarks to HTML file.
-    """
-    if not bookmarks:
-        logger.warning("No bookmarks to export")
-        return "⚠️ No bookmarks to export."
-
-    try:
-        logger.info("Exporting bookmarks to HTML")
-        soup = BeautifulSoup("<!DOCTYPE NETSCAPE-Bookmark-file-1><Title>Bookmarks</Title><H1>Bookmarks</H1>", 'html.parser')
-        dl = soup.new_tag('DL')
-        for bookmark in bookmarks:
-            dt = soup.new_tag('DT')
-            a = soup.new_tag('A', href=bookmark['url'])
-            a.string = bookmark['title']
-            dt.append(a)
-            dl.append(dt)
-        soup.append(dl)
-        html_content = str(soup)
-        b64 = base64.b64encode(html_content.encode()).decode()
-        href = f'data:text/html;base64,{b64}'
-        logger.info("Bookmarks exported successfully")
-        return f'<a href="{href}" download="bookmarks.html">💾 Download Exported Bookmarks</a>'
-    except Exception as e:
-        logger.error(f"Error exporting bookmarks: {e}", exc_info=True)
-        return "⚠️ Error exporting bookmarks."
-
-def chatbot_response(user_query):
-    """
-    Generate chatbot response using the FAISS index and embeddings.
-    """
-    if not bookmarks or faiss_index is None:
-        logger.warning("No bookmarks available for chatbot")
-        return "⚠️ No bookmarks available. Please upload and process your bookmarks first."
-
-    logger.info(f"Chatbot received query: {user_query}")
-
-    try:
-        # Encode the user query
-        query_vector = embedding_model.encode([user_query]).astype('float32')
-
-        # Search the FAISS index
-        k = 5  # Number of results to return
-        distances, ids = faiss_index.search(query_vector, k)
-        ids = ids.flatten()
-
-        # Retrieve the bookmarks
-        id_to_bookmark = {bookmark['id']: bookmark for bookmark in bookmarks}
-        matching_bookmarks = [id_to_bookmark.get(id) for id in ids if id in id_to_bookmark]
-
-        if not matching_bookmarks:
-            return "No relevant bookmarks found for your query."
-
-        # Format the response
-        bookmarks_info = "\n".join([
-            f"Title: {bookmark['title']}\nURL: {bookmark['url']}\nSummary: {bookmark['summary']}"
-            for bookmark in matching_bookmarks
-        ])
-
-        # Use the LLM via Groq Cloud API to generate a response
-        prompt = f"""
-        A user asked: "{user_query}"
-
-        Based on the bookmarks below, provide a helpful answer to the user's query, referencing the relevant bookmarks.
-
-        Bookmarks:
-        {bookmarks_info}
-
-        Provide a concise and helpful response.
-        """
-
-        while True:
-            try:
-                response = openai.ChatCompletion.create(
-                    model='llama-3.1-70b-versatile',
-                    messages=[
-                        {"role": "user", "content": prompt}
-                    ],
-                    max_tokens=500,
-                    temperature=0.7,
-                )
-                break  # Exit loop if successful
-            except openai.error.RateLimitError as e:
-                retry_after = extract_retry_after(str(e))
-                logger.warning(f"Rate limit exceeded. Retrying after {retry_after} seconds.")
-                time.sleep(retry_after)
-            except Exception as e:
-                error_message = f"⚠️ Error processing your query: {str(e)}"
-                logger.error(error_message, exc_info=True)
-                return error_message

-        answer = response['choices'][0]['message']['content'].strip()
-        logger.info("Chatbot response generated using Groq Cloud API")
-        return answer
-
-    except Exception as e:
-        error_message = f"⚠️ Error processing your query: {str(e)}"
-        logger.error(error_message, exc_info=True)
-        return error_message

def build_app():
    """
    Build and launch the Gradio app.
@@ -835,4 +706,6 @@ def build_app():
        print(f"Error building the app: {e}")

if __name__ == "__main__":
    build_app()

    else:
        return 5  # Default retry after 5 seconds

+def exponential_backoff(retries):
+    return min(60, (2 ** retries))  # Cap the wait time at 60 seconds
+
def extract_main_content(soup):
    """
    Extract the main content from a webpage while filtering out boilerplate content.
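For reference, the new capped backoff grows as 1, 2, 4, 8, ... seconds and saturates at 60. A minimal standalone sketch, restating the helper outside the diff:

def exponential_backoff(retries):
    return min(60, 2 ** retries)  # cap the wait time at 60 seconds

# retries 0..7 -> waits of 1, 2, 4, 8, 16, 32, 60, 60 seconds
for attempt in range(8):
    print(attempt, exponential_backoff(attempt))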
 

    return metadata

+async def generate_summary_async(bookmark):
+    async with llm_semaphore:
+        generate_summary(bookmark)
+
def generate_summary(bookmark):
    """
    Generate a concise summary for a bookmark using available content and LLM via the Groq Cloud API.
 
    """

    # Call the LLM via Groq Cloud API
+   retries = 0
+   max_retries = 5
+   while retries <= max_retries:
        try:
            response = openai.ChatCompletion.create(
                model='llama-3.1-70b-versatile',
 
            )
            break  # Exit loop if successful
        except openai.error.RateLimitError as e:
+           retry_after = extract_retry_after(str(e)) or exponential_backoff(retries)
            logger.warning(f"Rate limit exceeded. Retrying after {retry_after} seconds.")
            time.sleep(retry_after)
+           retries += 1
        except Exception as e:
            logger.error(f"Error generating summary: {e}", exc_info=True)
            bookmark['summary'] = 'No summary available.'
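The bounded retry loop above replaces the earlier unbounded while True. A self-contained sketch of the pattern; the exception class and request callable here are placeholders, not the app's own names:

import time

class RateLimitError(Exception):
    """Placeholder for the provider's rate-limit exception."""

def exponential_backoff(retries):
    return min(60, 2 ** retries)

def call_with_retries(make_request, max_retries=5):
    retries = 0
    while retries <= max_retries:
        try:
            return make_request()  # e.g. a chat-completion call
        except RateLimitError:
            wait = exponential_backoff(retries)  # or a parsed Retry-After value
            time.sleep(wait)
            retries += 1
    return None  # all retries exhausted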
 
        bookmark['summary'] = 'No summary available.'
    return bookmark

+async def assign_category_async(bookmark):
+    async with llm_semaphore:
+        assign_category(bookmark)
+
+def assign_category(bookmark):
+    """
+    Assign a category to a bookmark using the LLM based on its summary via the Groq Cloud API.
+    """
+    if bookmark.get('dead_link'):
+        bookmark['category'] = 'Dead Link'
+        logger.info(f"Assigned category 'Dead Link' to bookmark: {bookmark.get('url')}")
+        return bookmark
+
+    summary = bookmark.get('summary', '')
+    if not summary:
+        bookmark['category'] = 'Uncategorized'
+        return bookmark
+
+    # Prepare the prompt
+    categories_str = ', '.join([f'"{cat}"' for cat in CATEGORIES if cat != 'Dead Link'])
+    prompt = f"""
+    You are a helpful assistant that categorizes webpages.
+
+    Based on the following summary, assign the most appropriate category from the list below.
+
+    Summary:
+    {summary}
+
+    Categories:
+    {categories_str}
+
+    Respond with only the category name.
+    """
+
+    retries = 0
+    max_retries = 5
+    while retries <= max_retries:
+        try:
+            response = openai.ChatCompletion.create(
+                model='llama-3.1-70b-versatile',
+                messages=[
+                    {"role": "user", "content": prompt}
+                ],
+                max_tokens=10,
+                temperature=0,
+            )
+            break  # Exit loop if successful
+        except openai.error.RateLimitError as e:
+            retry_after = extract_retry_after(str(e)) or exponential_backoff(retries)
+            logger.warning(f"Rate limit exceeded. Retrying after {retry_after} seconds.")
+            time.sleep(retry_after)
+            retries += 1
+        except Exception as e:
+            logger.error(f"Error assigning category: {e}", exc_info=True)
+            bookmark['category'] = 'Uncategorized'
+            return bookmark
+
+    category = response['choices'][0]['message']['content'].strip().strip('"')
+
+    # Validate the category
+    if category in CATEGORIES:
+        bookmark['category'] = category
+        logger.info(f"Assigned category '{category}' to bookmark: {bookmark.get('url')}")
+    else:
+        bookmark['category'] = 'Uncategorized'
+        logger.warning(f"Invalid category '{category}' returned by LLM for bookmark: {bookmark.get('url')}")
+
+    return bookmark
+
def parse_bookmarks(file_content):
    """
    Parse bookmarks from HTML file.
 
        bookmark.update(fetch_cache[url])
        return bookmark

+   max_retries = 0  # No retries
    retries = 0
+   timeout_duration = 5  # Reduced timeout

    while retries <= max_retries:
        try:
            logger.info(f"Fetching URL info for: {url} (Attempt {retries + 1})")
            headers = {
+               'User-Agent': 'Mozilla/5.0',
                'Accept-Language': 'en-US,en;q=0.9',
            }
            async with session.get(url, timeout=timeout_duration, headers=headers, ssl=False, allow_redirects=True) as response:
 
                break  # Exit loop if successful

        except asyncio.exceptions.TimeoutError:
+           bookmark['dead_link'] = False  # Mark as 'Unknown' instead of 'Dead'
+           bookmark['etag'] = 'N/A'
+           bookmark['status_code'] = 'Timeout'
+           bookmark['description'] = ''
+           bookmark['html_content'] = ''
+           bookmark['slow_link'] = True  # Custom flag to indicate slow response
+           logger.warning(f"Timeout while fetching {url}. Marking as 'Slow'.")
+           break  # Exit loop after timeout
        except Exception as e:
            bookmark['dead_link'] = True
            bookmark['etag'] = 'N/A'
 
    """
    logger.info("Processing bookmarks asynchronously")
    try:
+       connector = aiohttp.TCPConnector(limit=10)  # Increase limit if necessary
        timeout = aiohttp.ClientTimeout(total=60)  # Set timeout
        async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
            tasks = []
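A minimal sketch of this connector/timeout setup in isolation; the function name and URLs are placeholders, not part of the commit:

import asyncio
import aiohttp

async def fetch_status(urls):
    connector = aiohttp.TCPConnector(limit=10)  # at most 10 concurrent connections
    timeout = aiohttp.ClientTimeout(total=60)   # overall per-request timeout
    async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
        async def fetch(url):
            async with session.get(url, allow_redirects=True) as resp:
                return url, resp.status
        return await asyncio.gather(*(fetch(u) for u in urls))

# Example usage (placeholder URLs):
# asyncio.run(fetch_status(["https://example.com", "https://example.org"]))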
 
        logger.error(f"Error in asynchronous processing of bookmarks: {e}", exc_info=True)
        raise

+async def process_bookmarks_llm(bookmarks_list):
+    """
+    Process bookmarks asynchronously for LLM API calls.
+    """
+    logger.info("Processing bookmarks with LLM asynchronously")
+    tasks = []
+    for bookmark in bookmarks_list:
+        tasks.append(generate_summary_async(bookmark))
+        tasks.append(assign_category_async(bookmark))
+    await asyncio.gather(*tasks)
+    logger.info("Completed LLM processing of bookmarks")
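The new LLM pass fans out one task per bookmark and relies on a shared asyncio.Semaphore to cap concurrency. A standalone sketch of that pattern; the worker body below is a stand-in for the summary and category calls:

import asyncio

llm_semaphore = asyncio.Semaphore(3)  # at most 3 LLM calls in flight

async def process_one(bookmark):
    async with llm_semaphore:
        await asyncio.sleep(0.1)  # stand-in for generate_summary / assign_category
        bookmark["summary"] = "..."

async def process_all(bookmarks):
    await asyncio.gather(*(process_one(b) for b in bookmarks))

# asyncio.run(process_all([{"url": "https://example.com"}]))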
 

def vectorize_and_index(bookmarks_list):
    """
 
        logger.error(f"Error processing bookmarks asynchronously: {e}", exc_info=True)
        return "Error processing bookmarks.", '', gr.update(choices=[]), display_bookmarks()

+   # Asynchronously process bookmarks with LLM
+   try:
+       asyncio.run(process_bookmarks_llm(bookmarks))
+   except Exception as e:
+       logger.error(f"Error processing bookmarks with LLM: {e}", exc_info=True)
+       return "Error processing bookmarks with LLM.", '', gr.update(choices=[]), display_bookmarks()

    try:
        faiss_index = vectorize_and_index(bookmarks)
 

    return message, bookmark_html, gr.update(choices=choices), bookmark_html

+# The rest of the code remains unchanged (e.g., delete_selected_bookmarks, edit_selected_bookmarks_category, etc.)

+# Build and launch the Gradio app
def build_app():
    """
    Build and launch the Gradio app.
 
        print(f"Error building the app: {e}")

if __name__ == "__main__":
+   # Define a semaphore to limit concurrent LLM API calls
+   llm_semaphore = asyncio.Semaphore(3)  # Adjust based on allowed concurrency
    build_app()