Spaces:
Running
Running
siddhartharya
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -8,18 +8,17 @@ import numpy as np
|
|
8 |
import requests
|
9 |
import time
|
10 |
import re
|
11 |
-
import base64
|
12 |
import logging
|
13 |
import os
|
14 |
import sys
|
15 |
-
import concurrent.futures
|
16 |
from concurrent.futures import ThreadPoolExecutor
|
17 |
import threading
|
|
|
18 |
|
19 |
# Import OpenAI library
|
20 |
import openai
|
21 |
|
22 |
-
# Suppress
|
23 |
import urllib3
|
24 |
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
25 |
|
@@ -74,14 +73,15 @@ CATEGORIES = [
|
|
74 |
"Uncategorized",
|
75 |
]
|
76 |
|
77 |
-
# Set up
|
78 |
-
|
79 |
|
80 |
-
if not
|
81 |
-
logger.error("
|
82 |
|
83 |
-
openai.api_key =
|
84 |
-
|
|
|
85 |
|
86 |
# Initialize global variables for rate limiting
|
87 |
api_lock = threading.Lock()
|
@@ -178,7 +178,7 @@ def generate_summary_and_assign_category(bookmark):
|
|
178 |
time.sleep(sleep_duration)
|
179 |
last_api_call_time = time.time()
|
180 |
|
181 |
-
#
|
182 |
html_content = bookmark.get('html_content', '')
|
183 |
soup = BeautifulSoup(html_content, 'html.parser')
|
184 |
metadata = get_page_metadata(soup)
|
@@ -208,7 +208,7 @@ def generate_summary_and_assign_category(bookmark):
|
|
208 |
else:
|
209 |
use_prior_knowledge = False
|
210 |
|
211 |
-
#
|
212 |
if use_prior_knowledge:
|
213 |
prompt = f"""
|
214 |
You are a knowledgeable assistant with up-to-date information as of 2023.
|
@@ -237,27 +237,13 @@ Summary: [Your summary]
|
|
237 |
Category: [One category]
|
238 |
"""
|
239 |
|
240 |
-
#
|
241 |
-
def estimate_tokens(text):
|
242 |
-
return len(text) / 4 # Approximate token estimation
|
243 |
-
|
244 |
-
prompt_tokens = estimate_tokens(prompt)
|
245 |
-
max_tokens = 150 # Adjusted from 200
|
246 |
-
total_tokens = prompt_tokens + max_tokens
|
247 |
-
|
248 |
-
# Calculate required delay
|
249 |
-
tokens_per_minute = 40000
|
250 |
-
tokens_per_second = tokens_per_minute / 60
|
251 |
-
required_delay = total_tokens / tokens_per_second
|
252 |
-
sleep_time = max(required_delay, 2) # Ensure at least 2 seconds
|
253 |
-
|
254 |
-
# Call the LLM via Groq Cloud API
|
255 |
response = openai.ChatCompletion.create(
|
256 |
-
model='
|
257 |
messages=[
|
258 |
{"role": "user", "content": prompt}
|
259 |
],
|
260 |
-
max_tokens=
|
261 |
temperature=0.5,
|
262 |
)
|
263 |
|
@@ -283,7 +269,7 @@ Category: [One category]
|
|
283 |
else:
|
284 |
bookmark['category'] = 'Uncategorized'
|
285 |
|
286 |
-
# Simple keyword-based validation
|
287 |
summary_lower = bookmark['summary'].lower()
|
288 |
url_lower = bookmark['url'].lower()
|
289 |
if 'social media' in summary_lower or 'twitter' in summary_lower or 'x.com' in url_lower:
|
@@ -292,7 +278,6 @@ Category: [One category]
|
|
292 |
bookmark['category'] = 'Reference and Knowledge Bases'
|
293 |
|
294 |
logger.info("Successfully generated summary and assigned category")
|
295 |
-
time.sleep(sleep_time)
|
296 |
break # Exit the retry loop upon success
|
297 |
|
298 |
except openai.error.RateLimitError as e:
|
@@ -439,7 +424,6 @@ def display_bookmarks():
|
|
439 |
category = bookmark.get('category', 'Uncategorized')
|
440 |
|
441 |
# Escape HTML content to prevent XSS attacks
|
442 |
-
from html import escape
|
443 |
title = escape(title)
|
444 |
url = escape(url)
|
445 |
summary = escape(summary)
|
@@ -493,12 +477,12 @@ def process_uploaded_file(file, state_bookmarks):
|
|
493 |
|
494 |
# Fetch bookmark info concurrently
|
495 |
logger.info("Fetching URL info concurrently")
|
496 |
-
with ThreadPoolExecutor(max_workers=10) as executor: #
|
497 |
executor.map(fetch_url_info, bookmarks)
|
498 |
|
499 |
# Process bookmarks with LLM calls (executor uses a single worker, so calls run one at a time)
|
500 |
logger.info("Processing bookmarks with LLM concurrently")
|
501 |
-
with ThreadPoolExecutor(max_workers=1) as executor: #
|
502 |
executor.map(generate_summary_and_assign_category, bookmarks)
|
503 |
|
504 |
try:
|
@@ -531,12 +515,15 @@ def delete_selected_bookmarks(selected_indices, state_bookmarks):
|
|
531 |
ids_to_delete = []
|
532 |
indices_to_delete = []
|
533 |
for s in selected_indices:
|
534 |
-
|
535 |
-
|
536 |
-
|
537 |
-
|
538 |
-
|
539 |
-
|
|
|
|
|
|
|
540 |
|
541 |
# Remove vectors from FAISS index
|
542 |
if faiss_index is not None and ids_to_delete:
|
@@ -565,11 +552,20 @@ def edit_selected_bookmarks_category(selected_indices, new_category, state_bookm
|
|
565 |
if not new_category:
|
566 |
return "⚠️ No new category selected.", gr.CheckboxGroup.update(choices=[]), display_bookmarks(), state_bookmarks
|
567 |
|
568 |
-
indices = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
569 |
for idx in indices:
|
570 |
-
|
571 |
-
|
572 |
-
logger.info(f"Updated category for bookmark {idx + 1} to {new_category}")
|
573 |
|
574 |
message = "✏️ Category updated for selected bookmarks."
|
575 |
logger.info(message)
|
@@ -589,7 +585,7 @@ def export_bookmarks():
|
|
589 |
"""
|
590 |
if not bookmarks:
|
591 |
logger.warning("No bookmarks to export")
|
592 |
-
return None # Return None
|
593 |
|
594 |
try:
|
595 |
logger.info("Exporting bookmarks to HTML")
|
@@ -639,7 +635,7 @@ def chatbot_response(user_query, chat_history):
|
|
639 |
time.sleep(sleep_duration)
|
640 |
last_api_call_time = time.time()
|
641 |
|
642 |
-
#
|
643 |
query_vector = embedding_model.encode([user_query]).astype('float32')
|
644 |
k = 5 # Number of results to return
|
645 |
distances, ids = faiss_index.search(query_vector, k)
|
@@ -660,7 +656,7 @@ def chatbot_response(user_query, chat_history):
|
|
660 |
for bookmark in matching_bookmarks
|
661 |
])
|
662 |
|
663 |
-
#
|
664 |
prompt = f"""
|
665 |
A user asked: "{user_query}"
|
666 |
Based on the bookmarks below, provide a helpful answer to the user's query, referencing the relevant bookmarks.
|
@@ -669,33 +665,18 @@ Bookmarks:
|
|
669 |
Provide a concise and helpful response.
|
670 |
"""
|
671 |
|
672 |
-
#
|
673 |
-
def estimate_tokens(text):
|
674 |
-
return len(text) / 4 # Approximate token estimation
|
675 |
-
|
676 |
-
prompt_tokens = estimate_tokens(prompt)
|
677 |
-
max_tokens = 300 # Adjust as needed
|
678 |
-
total_tokens = prompt_tokens + max_tokens
|
679 |
-
|
680 |
-
# Calculate required delay
|
681 |
-
tokens_per_minute = 40000
|
682 |
-
tokens_per_second = tokens_per_minute / 60
|
683 |
-
required_delay = total_tokens / tokens_per_second
|
684 |
-
sleep_time = max(required_delay, 2) # Ensure at least 2 seconds
|
685 |
-
|
686 |
-
# Call the LLM via Groq Cloud API
|
687 |
response = openai.ChatCompletion.create(
|
688 |
-
model='
|
689 |
messages=[
|
690 |
{"role": "user", "content": prompt}
|
691 |
],
|
692 |
-
max_tokens=
|
693 |
temperature=0.7,
|
694 |
)
|
695 |
|
696 |
answer = response['choices'][0]['message']['content'].strip()
|
697 |
logger.info("Chatbot response generated")
|
698 |
-
time.sleep(sleep_time)
|
699 |
|
700 |
# Append the assistant's response to chat history
|
701 |
chat_history.append({"role": "assistant", "content": answer})
|
|
|
8 |
import requests
|
9 |
import time
|
10 |
import re
|
|
|
11 |
import logging
|
12 |
import os
|
13 |
import sys
|
|
|
14 |
from concurrent.futures import ThreadPoolExecutor
|
15 |
import threading
|
16 |
+
from html import escape
|
17 |
|
18 |
# Import OpenAI library
|
19 |
import openai
|
20 |
|
21 |
+
# Suppress specific warnings
|
22 |
import urllib3
|
23 |
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
24 |
|
|
|
73 |
"Uncategorized",
|
74 |
]
|
75 |
|
76 |
+
# Set up OpenAI API key and base URL
|
77 |
+
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
|
78 |
|
79 |
+
if not OPENAI_API_KEY:
|
80 |
+
logger.error("OPENAI_API_KEY environment variable not set.")
|
81 |
|
82 |
+
openai.api_key = OPENAI_API_KEY
|
83 |
+
# If you're using a custom API base, uncomment and set it
|
84 |
+
# openai.api_base = "https://api.your-provider.com/v1"
|
85 |
|
86 |
# Initialize global variables for rate limiting
|
87 |
api_lock = threading.Lock()
|
|
|
178 |
time.sleep(sleep_duration)
|
179 |
last_api_call_time = time.time()
|
180 |
|
181 |
+
# Prepare the prompt
|
182 |
html_content = bookmark.get('html_content', '')
|
183 |
soup = BeautifulSoup(html_content, 'html.parser')
|
184 |
metadata = get_page_metadata(soup)
|
|
|
208 |
else:
|
209 |
use_prior_knowledge = False
|
210 |
|
211 |
+
# Craft the prompt based on content availability
|
212 |
if use_prior_knowledge:
|
213 |
prompt = f"""
|
214 |
You are a knowledgeable assistant with up-to-date information as of 2023.
|
|
|
237 |
Category: [One category]
|
238 |
"""
|
239 |
|
240 |
+
# Call the LLM via OpenAI API
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
241 |
response = openai.ChatCompletion.create(
|
242 |
+
model='gpt-4', # Ensure you're using a valid and accessible model
|
243 |
messages=[
|
244 |
{"role": "user", "content": prompt}
|
245 |
],
|
246 |
+
max_tokens=150,
|
247 |
temperature=0.5,
|
248 |
)
|
249 |
|
|
|
269 |
else:
|
270 |
bookmark['category'] = 'Uncategorized'
|
271 |
|
272 |
+
# Optional: Simple keyword-based validation
|
273 |
summary_lower = bookmark['summary'].lower()
|
274 |
url_lower = bookmark['url'].lower()
|
275 |
if 'social media' in summary_lower or 'twitter' in summary_lower or 'x.com' in url_lower:
|
|
|
278 |
bookmark['category'] = 'Reference and Knowledge Bases'
|
279 |
|
280 |
logger.info("Successfully generated summary and assigned category")
|
|
|
281 |
break # Exit the retry loop upon success
|
282 |
|
283 |
except openai.error.RateLimitError as e:
|
|
|
424 |
category = bookmark.get('category', 'Uncategorized')
|
425 |
|
426 |
# Escape HTML content to prevent XSS attacks
|
|
|
427 |
title = escape(title)
|
428 |
url = escape(url)
|
429 |
summary = escape(summary)
|
|
|
477 |
|
478 |
# Fetch bookmark info concurrently
|
479 |
logger.info("Fetching URL info concurrently")
|
480 |
+
with ThreadPoolExecutor(max_workers=10) as executor: # Adjust max_workers as needed
|
481 |
executor.map(fetch_url_info, bookmarks)
|
482 |
|
483 |
# Process bookmarks with LLM calls (single worker: API calls are serialized to respect rate limits)
|
484 |
logger.info("Processing bookmarks with LLM concurrently")
|
485 |
+
with ThreadPoolExecutor(max_workers=1) as executor: # Serialize API calls to respect rate limits
|
486 |
executor.map(generate_summary_and_assign_category, bookmarks)
|
487 |
|
488 |
try:
|
|
|
515 |
ids_to_delete = []
|
516 |
indices_to_delete = []
|
517 |
for s in selected_indices:
|
518 |
+
try:
|
519 |
+
idx = int(s.split('.')[0]) - 1
|
520 |
+
if 0 <= idx < len(bookmarks):
|
521 |
+
bookmark_id = bookmarks[idx]['id']
|
522 |
+
ids_to_delete.append(bookmark_id)
|
523 |
+
indices_to_delete.append(idx)
|
524 |
+
logger.info(f"Deleting bookmark at index {idx + 1}")
|
525 |
+
except (ValueError, IndexError):
|
526 |
+
logger.warning(f"Invalid selection format: {s}")
|
527 |
|
528 |
# Remove vectors from FAISS index
|
529 |
if faiss_index is not None and ids_to_delete:
|
|
|
552 |
if not new_category:
|
553 |
return "⚠️ No new category selected.", gr.CheckboxGroup.update(choices=[]), display_bookmarks(), state_bookmarks
|
554 |
|
555 |
+
indices = []
|
556 |
+
for s in selected_indices:
|
557 |
+
try:
|
558 |
+
idx = int(s.split('.')[0])-1
|
559 |
+
if 0 <= idx < len(bookmarks):
|
560 |
+
indices.append(idx)
|
561 |
+
else:
|
562 |
+
logger.warning(f"Index out of range: {idx + 1}")
|
563 |
+
except ValueError:
|
564 |
+
logger.warning(f"Invalid selection format: {s}")
|
565 |
+
|
566 |
for idx in indices:
|
567 |
+
bookmarks[idx]['category'] = new_category
|
568 |
+
logger.info(f"Updated category for bookmark {idx + 1} to {new_category}")
|
|
|
569 |
|
570 |
message = "✏️ Category updated for selected bookmarks."
|
571 |
logger.info(message)
|
|
|
585 |
"""
|
586 |
if not bookmarks:
|
587 |
logger.warning("No bookmarks to export")
|
588 |
+
return None # Return None to indicate no file
|
589 |
|
590 |
try:
|
591 |
logger.info("Exporting bookmarks to HTML")
|
|
|
635 |
time.sleep(sleep_duration)
|
636 |
last_api_call_time = time.time()
|
637 |
|
638 |
+
# Encode the query and search the FAISS index
|
639 |
query_vector = embedding_model.encode([user_query]).astype('float32')
|
640 |
k = 5 # Number of results to return
|
641 |
distances, ids = faiss_index.search(query_vector, k)
|
|
|
656 |
for bookmark in matching_bookmarks
|
657 |
])
|
658 |
|
659 |
+
# Craft the prompt for the LLM
|
660 |
prompt = f"""
|
661 |
A user asked: "{user_query}"
|
662 |
Based on the bookmarks below, provide a helpful answer to the user's query, referencing the relevant bookmarks.
|
|
|
665 |
Provide a concise and helpful response.
|
666 |
"""
|
667 |
|
668 |
+
# Call the LLM via OpenAI API
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
669 |
response = openai.ChatCompletion.create(
|
670 |
+
model='gpt-4', # Ensure you're using a valid and accessible model
|
671 |
messages=[
|
672 |
{"role": "user", "content": prompt}
|
673 |
],
|
674 |
+
max_tokens=300,
|
675 |
temperature=0.7,
|
676 |
)
|
677 |
|
678 |
answer = response['choices'][0]['message']['content'].strip()
|
679 |
logger.info("Chatbot response generated")
|
|
|
680 |
|
681 |
# Append the assistant's response to chat history
|
682 |
chat_history.append({"role": "assistant", "content": answer})
|