# app.py
import gradio as gr
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import requests
import time
import re
import logging
import os
import sys
import threading
from queue import Queue, Empty
import json
from concurrent.futures import ThreadPoolExecutor
from html import escape

# Import the OpenAI library (used here to talk to the Groq API)
import openai

# Suppress only the InsecureRequestWarning from urllib3 (we fetch some URLs
# with verify=False below).
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
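# Assumed dependencies (a sketch; pin whichever versions you actually test):
#   gradio, beautifulsoup4, sentence-transformers, faiss-cpu, numpy, requests,
#   openai<1.0 (the pre-1.0 client API is used throughout)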
# Set up logging to output to the console
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Create a console handler
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(logging.INFO)

# Create a formatter and set it for the handler
formatter = logging.Formatter('%(asctime)s %(levelname)s %(name)s %(message)s')
console_handler.setFormatter(formatter)

# Add the handler to the logger
logger.addHandler(console_handler)
# Initialize variables and models
logger.info("Initializing variables and models")
# all-MiniLM-L6-v2 produces 384-dimensional sentence embeddings.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
faiss_index = None
bookmarks = []
fetch_cache = {}

# Lock for thread-safe access to the fetch cache
lock = threading.Lock()
# Define the categories
CATEGORIES = [
    "Social Media",
    "News and Media",
    "Education and Learning",
    "Entertainment",
    "Shopping and E-commerce",
    "Finance and Banking",
    "Technology",
    "Health and Fitness",
    "Travel and Tourism",
    "Food and Recipes",
    "Sports",
    "Arts and Culture",
    "Government and Politics",
    "Business and Economy",
    "Science and Research",
    "Personal Blogs and Journals",
    "Job Search and Careers",
    "Music and Audio",
    "Videos and Movies",
    "Reference and Knowledge Bases",
    "Dead Link",
    "Uncategorized",
]
# Set up the Groq Cloud API key and base URL
GROQ_API_KEY = os.getenv('GROQ_API_KEY')
if not GROQ_API_KEY:
    logger.error("GROQ_API_KEY environment variable not set.")

openai.api_key = GROQ_API_KEY
openai.api_base = "https://api.groq.com/openai/v1"
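# NOTE: openai.api_base and openai.ChatCompletion.create are pre-1.0 openai
# client idioms; openai>=1.0 replaced them with a client-object API, so this
# file assumes openai<1.0 (e.g. openai==0.28).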
# Rate Limiter Configuration
RPM_LIMIT = 60      # Requests per minute (adjust to your API plan's limit)
TPM_LIMIT = 60000   # Tokens per minute (adjust to your API plan's limit)
BATCH_SIZE = 5      # Number of bookmarks per LLM batch
# A simple token-bucket rate limiter
class TokenBucket:
    def __init__(self, rate, capacity):
        self.rate = rate          # tokens refilled per second
        self.capacity = capacity  # maximum tokens the bucket can hold
        self.tokens = capacity
        self.timestamp = time.time()
        self.lock = threading.Lock()

    def consume(self, tokens=1):
        with self.lock:
            now = time.time()
            elapsed = now - self.timestamp
            refill = elapsed * self.rate
            self.tokens = min(self.capacity, self.tokens + refill)
            self.timestamp = now
            if self.tokens >= tokens:
                self.tokens -= tokens
                return True
            return False

    def wait_for_token(self, tokens=1):
        while not self.consume(tokens):
            time.sleep(0.05)

# Initialize rate limiters
rpm_rate = RPM_LIMIT / 60  # requests per second
tpm_rate = TPM_LIMIT / 60  # tokens per second
rpm_bucket = TokenBucket(rate=rpm_rate, capacity=RPM_LIMIT)
tpm_bucket = TokenBucket(rate=tpm_rate, capacity=TPM_LIMIT)
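# Usage sketch: before each LLM call we gate on both buckets, consuming one
# request from the RPM bucket plus an estimated prompt+completion token count
# from the TPM bucket (the 450 below is purely illustrative):
#   rpm_bucket.wait_for_token()
#   tpm_bucket.wait_for_token(tokens=450)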
# Queue feeding bookmarks to the LLM worker thread
llm_queue = Queue()
def categorize_based_on_summary(summary, url):
    """
    Assign a category based on keywords in the summary or URL.
    """
    summary_lower = summary.lower()
    url_lower = url.lower()
    if 'social media' in summary_lower or 'twitter' in summary_lower or 'x.com' in url_lower:
        return 'Social Media'
    elif 'wikipedia' in url_lower:
        return 'Reference and Knowledge Bases'
    elif 'cloud computing' in summary_lower or 'aws' in summary_lower:
        return 'Technology'
    elif 'news' in summary_lower or 'media' in summary_lower:
        return 'News and Media'
    elif 'education' in summary_lower or 'learning' in summary_lower:
        return 'Education and Learning'
    # Add more conditions as needed
    else:
        return 'Uncategorized'

def validate_category(bookmark):
    """
    Further validate and adjust the category if needed.
    """
    # Example: specific overrides based on the URL
    url_lower = bookmark['url'].lower()
    if 'facebook' in url_lower or 'x.com' in url_lower:
        return 'Social Media'
    elif 'wikipedia' in url_lower:
        return 'Reference and Knowledge Bases'
    elif 'aws.amazon.com' in url_lower:
        return 'Technology'
    # Add more specific cases as needed
    else:
        return bookmark['category']
def extract_main_content(soup):
    """
    Extract the main content from a webpage while filtering out boilerplate.
    """
    if not soup:
        return ""

    # Remove unwanted elements
    for element in soup(['script', 'style', 'header', 'footer', 'nav', 'aside', 'form', 'noscript']):
        element.decompose()

    # Extract text from <p> tags
    p_tags = soup.find_all('p')
    if p_tags:
        content = ' '.join([p.get_text(strip=True, separator=' ') for p in p_tags])
    else:
        # Fall back to the whole body content
        content = soup.get_text(separator=' ', strip=True)

    # Clean up the text
    content = re.sub(r'\s+', ' ', content)

    # Truncate content to a reasonable length (1500 words)
    words = content.split()
    if len(words) > 1500:
        content = ' '.join(words[:1500])
    return content

def get_page_metadata(soup):
    """
    Extract metadata from the webpage: title, description, and keywords.
    """
    metadata = {
        'title': '',
        'description': '',
        'keywords': ''
    }
    if not soup:
        return metadata

    # Get title
    title_tag = soup.find('title')
    if title_tag and title_tag.string:
        metadata['title'] = title_tag.string.strip()

    # Get meta description
    meta_desc = (
        soup.find('meta', attrs={'name': 'description'}) or
        soup.find('meta', attrs={'property': 'og:description'}) or
        soup.find('meta', attrs={'name': 'twitter:description'})
    )
    if meta_desc:
        metadata['description'] = meta_desc.get('content', '').strip()

    # Get meta keywords
    meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
    if meta_keywords:
        metadata['keywords'] = meta_keywords.get('content', '').strip()

    # Get OG title if the main title is empty
    if not metadata['title']:
        og_title = soup.find('meta', attrs={'property': 'og:title'})
        if og_title:
            metadata['title'] = og_title.get('content', '').strip()
    return metadata
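# NOTE: extract_main_content and get_page_metadata are not currently called by
# the worker; they are available if you want to enrich the LLM prompt with
# page text and metadata instead of just the URL and title.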
def llm_worker():
    """
    Worker thread that processes LLM tasks from the queue while respecting rate limits.
    """
    logger.info("LLM worker started.")
    while True:
        batch = []
        try:
            # Collect bookmarks up to BATCH_SIZE
            while len(batch) < BATCH_SIZE:
                bookmark = llm_queue.get(timeout=1)
                if bookmark is None:
                    # Shutdown signal
                    logger.info("LLM worker shutting down.")
                    return
                if not bookmark.get('dead_link') and not bookmark.get('slow_link'):
                    batch.append(bookmark)
                else:
                    # Skip LLM processing for dead or slow links
                    bookmark['summary'] = 'No summary available.'
                    bookmark['category'] = 'Uncategorized'
                    llm_queue.task_done()
        except Empty:
            pass  # No more bookmarks at the moment

        if batch:
            try:
                # Rate limiting: one request, plus an estimated token budget.
                # The estimate covers only the completion (max_tokens=150 per
                # bookmark); prompt tokens are ignored for simplicity.
                rpm_bucket.wait_for_token()
                total_tokens = 150 * len(batch)
                tpm_bucket.wait_for_token(tokens=total_tokens)

                # Prepare the prompt
                prompt = "You are an assistant that creates concise webpage summaries and assigns categories.\n\n"
                prompt += "Provide summaries and categories for the following bookmarks:\n\n"
                for idx, bookmark in enumerate(batch, 1):
                    prompt += f"Bookmark {idx}:\nURL: {bookmark['url']}\nTitle: {bookmark['title']}\n\n"
                # Build the category list outside the f-string (backslash escapes
                # are not allowed inside f-string expressions before Python 3.12).
                categories_str = ', '.join(f'"{cat}"' for cat in CATEGORIES)
                prompt += f"Categories:\n{categories_str}\n\n"
                prompt += "Format your response as a JSON object where each key is the bookmark URL and the value is another JSON object containing 'summary' and 'category'.\n\n"
                prompt += "Example:\n"
                prompt += "{\n"
                prompt += "  \"https://example.com\": {\n"
                prompt += "    \"summary\": \"This is an example summary.\",\n"
                prompt += "    \"category\": \"Technology\"\n"
                prompt += "  }\n"
                prompt += "}\n\n"
                prompt += "Now, provide the summaries and categories for the bookmarks listed above."

                response = openai.ChatCompletion.create(
                    model='llama-3.1-70b-versatile',  # Ensure this model is available to your account
                    messages=[
                        {"role": "user", "content": prompt}
                    ],
                    max_tokens=150 * len(batch),
                    temperature=0.5,
                )
                content = response['choices'][0]['message']['content'].strip()
                if not content:
                    raise ValueError("Empty response received from the model.")

                # Parse the JSON response
                try:
                    json_response = json.loads(content)
                    for bookmark in batch:
                        url = bookmark['url']
                        if url in json_response:
                            summary = json_response[url].get('summary', '').strip()
                            category = json_response[url].get('category', '').strip()
                            if not summary:
                                summary = 'No summary available.'
                            bookmark['summary'] = summary
                            if category in CATEGORIES:
                                bookmark['category'] = category
                            else:
                                # Fall back to keyword-based categorization
                                bookmark['category'] = categorize_based_on_summary(summary, url)
                        else:
                            logger.warning(f"No data returned for {url}. Using fallback methods.")
                            bookmark['summary'] = 'No summary available.'
                            bookmark['category'] = 'Uncategorized'
                        # Additional keyword-based validation
                        bookmark['category'] = validate_category(bookmark)
                        logger.info(f"Processed bookmark: {url}")
                except json.JSONDecodeError:
                    logger.error("Failed to parse JSON response from LLM. Using fallback methods.")
                    for bookmark in batch:
                        bookmark['summary'] = 'No summary available.'
                        bookmark['category'] = categorize_based_on_summary(bookmark['summary'], bookmark['url'])
                        bookmark['category'] = validate_category(bookmark)
                except Exception as e:
                    logger.error(f"Error processing LLM response: {e}", exc_info=True)
                    for bookmark in batch:
                        bookmark['summary'] = 'No summary available.'
                        bookmark['category'] = 'Uncategorized'
            except openai.error.RateLimitError:
                logger.warning("LLM rate limit reached. Retrying after 60 seconds.")
                # Re-enqueue the entire batch for retry
                for bookmark in batch:
                    llm_queue.put(bookmark)
                time.sleep(60)  # Wait before retrying
                continue  # Skip the rest and retry
            except Exception as e:
                logger.error(f"Error during LLM processing: {e}", exc_info=True)
                for bookmark in batch:
                    bookmark['summary'] = 'No summary available.'
                    bookmark['category'] = 'Uncategorized'
            finally:
                # Mark all bookmarks in the batch as done
                for _ in batch:
                    llm_queue.task_done()
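# To stop the worker gracefully, enqueue the shutdown sentinel:
#   llm_queue.put(None)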
def fetch_url_info(bookmark):
    """
    Fetch status and content information for a bookmark's URL.
    """
    url = bookmark['url']
    # Check the cache under the lock to avoid racing with concurrent fetches
    with lock:
        if url in fetch_cache:
            bookmark.update(fetch_cache[url])
            return

    try:
        logger.info(f"Fetching URL info for: {url}")
        headers = {
            'User-Agent': 'Mozilla/5.0',
            'Accept-Language': 'en-US,en;q=0.9',
        }
        response = requests.get(url, headers=headers, timeout=5, verify=False, allow_redirects=True)
        bookmark['etag'] = response.headers.get('ETag', 'N/A')
        bookmark['status_code'] = response.status_code
        content = response.text
        logger.info(f"Fetched content length for {url}: {len(content)} characters")
        if response.status_code >= 500:
            bookmark['dead_link'] = True
            bookmark['description'] = ''
            bookmark['html_content'] = ''
            logger.warning(f"Dead link detected: {url} with status {response.status_code}")
        else:
            bookmark['dead_link'] = False
            bookmark['html_content'] = content
            bookmark['description'] = ''
            logger.info(f"Fetched information for {url}")
    except requests.exceptions.Timeout:
        bookmark['dead_link'] = False
        bookmark['etag'] = 'N/A'
        bookmark['status_code'] = 'Timeout'
        bookmark['description'] = ''
        bookmark['html_content'] = ''
        bookmark['slow_link'] = True
        logger.warning(f"Timeout while fetching {url}. Marking as 'Slow'.")
    except Exception as e:
        bookmark['dead_link'] = True
        bookmark['etag'] = 'N/A'
        bookmark['status_code'] = 'Error'
        bookmark['description'] = ''
        bookmark['html_content'] = ''
        logger.error(f"Error fetching URL info for {url}: {e}", exc_info=True)
    finally:
        with lock:
            fetch_cache[url] = {
                'etag': bookmark.get('etag'),
                'status_code': bookmark.get('status_code'),
                'dead_link': bookmark.get('dead_link'),
                'description': bookmark.get('description'),
                'html_content': bookmark.get('html_content', ''),
                'slow_link': bookmark.get('slow_link', False),
            }
def parse_bookmarks(file_content):
    """
    Parse bookmarks from an exported HTML bookmarks file.
    """
    logger.info("Parsing bookmarks")
    try:
        soup = BeautifulSoup(file_content, 'html.parser')
        extracted_bookmarks = []
        for link in soup.find_all('a'):
            url = link.get('href')
            title = link.text.strip()
            if url and title:
                if url.startswith('http://') or url.startswith('https://'):
                    extracted_bookmarks.append({'url': url, 'title': title})
                else:
                    logger.info(f"Skipping non-http/https URL: {url}")
        logger.info(f"Extracted {len(extracted_bookmarks)} bookmarks")
        return extracted_bookmarks
    except Exception as e:
        logger.error("Error parsing bookmarks: %s", e, exc_info=True)
        raise
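# A minimal sketch of the Netscape bookmark format this parser expects
# (attribute details vary by browser; only HREF and the link text are used):
#
#   <!DOCTYPE NETSCAPE-Bookmark-file-1>
#   <DL><p>
#       <DT><A HREF="https://example.com" ADD_DATE="1700000000">Example</A>
#   </DL><p>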
def vectorize_and_index(bookmarks_list):
    """
    Create vector embeddings for bookmark summaries and build a FAISS index with ID mapping.
    """
    global faiss_index
    logger.info("Vectorizing summaries and building FAISS index")
    try:
        summaries = [bookmark['summary'] for bookmark in bookmarks_list]
        embeddings = embedding_model.encode(summaries)
        dimension = embeddings.shape[1]
        index = faiss.IndexIDMap(faiss.IndexFlatL2(dimension))
        ids = np.array([bookmark['id'] for bookmark in bookmarks_list], dtype=np.int64)
        index.add_with_ids(np.array(embeddings).astype('float32'), ids)
        faiss_index = index
        logger.info("FAISS index built successfully with IDs")
        return index
    except Exception as e:
        logger.error(f"Error in vectorizing and indexing: {e}", exc_info=True)
        raise
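# IndexIDMap lets the index address vectors by our own bookmark ids: that is
# what delete_selected_bookmarks relies on via remove_ids(), and what lets
# chatbot_response map search hits back to bookmarks:
#   distances, ids = faiss_index.search(query_vector, k)  # ids are bookmark ids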
def display_bookmarks():
    """
    Generate an HTML display for the bookmarks.
    """
    logger.info("Generating HTML display for bookmarks")
    cards = ''
    for i, bookmark in enumerate(bookmarks):
        index = i + 1
        if bookmark.get('dead_link'):
            status = "❌ Dead Link"
            card_style = "border: 2px solid red;"
            text_style = "color: white;"
            summary = 'No summary available.'
        elif bookmark.get('slow_link'):
            status = "⏳ Slow Response"
            card_style = "border: 2px solid orange;"
            text_style = "color: white;"
            summary = bookmark.get('summary', 'No summary available.')
        else:
            status = "✅ Active"
            card_style = "border: 2px solid green;"
            text_style = "color: white;"
            summary = bookmark.get('summary', 'No summary available.')

        # Escape HTML content to prevent XSS attacks
        title = escape(bookmark['title'])
        url = escape(bookmark['url'])
        summary = escape(summary)
        category = escape(bookmark.get('category', 'Uncategorized'))
        etag = bookmark.get('etag', 'N/A')

        card_html = f'''
        <div class="card" style="{card_style} padding: 10px; margin: 10px; border-radius: 5px; background-color: #1e1e1e;">
            <div class="card-content">
                <h3 style="{text_style}">{index}. {title} {status}</h3>
                <p style="{text_style}"><strong>Category:</strong> {category}</p>
                <p style="{text_style}"><strong>URL:</strong> <a href="{url}" target="_blank" style="{text_style}">{url}</a></p>
                <p style="{text_style}"><strong>ETag:</strong> {etag}</p>
                <p style="{text_style}"><strong>Summary:</strong> {summary}</p>
            </div>
        </div>
        '''
        cards += card_html
    logger.info("HTML display generated")
    return cards
def generate_summary_and_assign_category(bookmark):
    """
    Deprecated: summaries and categories are now generated in batches by the
    llm_worker thread, so this per-bookmark function is kept only as a stub.
    """
    pass
def process_uploaded_file(file, state_bookmarks):
    """
    Process the uploaded bookmarks file.
    """
    global bookmarks, faiss_index
    logger.info("Processing uploaded file")
    if file is None:
        logger.warning("No file uploaded")
        return "Please upload a bookmarks HTML file.", '', state_bookmarks, display_bookmarks(), gr.update(choices=[])

    try:
        file_content = file.decode('utf-8')
    except UnicodeDecodeError as e:
        logger.error(f"Error decoding the file: {e}", exc_info=True)
        return "Error decoding the file. Please ensure it's a valid HTML file.", '', state_bookmarks, display_bookmarks(), gr.update(choices=[])

    try:
        bookmarks = parse_bookmarks(file_content)
    except Exception as e:
        logger.error(f"Error parsing bookmarks: {e}", exc_info=True)
        return "Error parsing the bookmarks HTML file.", '', state_bookmarks, display_bookmarks(), gr.update(choices=[])

    if not bookmarks:
        logger.warning("No bookmarks found in the uploaded file")
        return "No bookmarks found in the uploaded file.", '', state_bookmarks, display_bookmarks(), gr.update(choices=[])

    # Assign unique IDs to bookmarks
    for idx, bookmark in enumerate(bookmarks):
        bookmark['id'] = idx

    # Fetch bookmark info concurrently
    logger.info("Fetching URL info concurrently")
    with ThreadPoolExecutor(max_workers=10) as executor:
        executor.map(fetch_url_info, bookmarks)

    # Enqueue bookmarks for LLM processing
    logger.info("Enqueuing bookmarks for LLM processing")
    for bookmark in bookmarks:
        llm_queue.put(bookmark)

    # Block until all LLM tasks are completed
    llm_queue.join()
    logger.info("All LLM tasks have been processed")

    try:
        faiss_index = vectorize_and_index(bookmarks)
    except Exception as e:
        logger.error(f"Error building FAISS index: {e}", exc_info=True)
        return "Error building search index.", '', state_bookmarks, display_bookmarks(), gr.update(choices=[])

    message = f"✅ Successfully processed {len(bookmarks)} bookmarks."
    logger.info(message)

    # Generate displays and updates
    bookmark_html = display_bookmarks()
    choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
               for i, bookmark in enumerate(bookmarks)]

    # Update state
    state_bookmarks = bookmarks.copy()
    return message, bookmark_html, state_bookmarks, bookmark_html, gr.update(choices=choices)
def delete_selected_bookmarks(selected_indices, state_bookmarks):
    """
    Delete selected bookmarks and remove their vectors from the FAISS index.
    """
    global bookmarks, faiss_index
    if not selected_indices:
        return "⚠️ No bookmarks selected.", gr.update(choices=[]), display_bookmarks()

    ids_to_delete = []
    indices_to_delete = []
    for s in selected_indices:
        idx = int(s.split('.')[0]) - 1
        if 0 <= idx < len(bookmarks):
            bookmark_id = bookmarks[idx]['id']
            ids_to_delete.append(bookmark_id)
            indices_to_delete.append(idx)
            logger.info(f"Deleting bookmark at index {idx + 1}")

    # Remove vectors from the FAISS index
    if faiss_index is not None and ids_to_delete:
        faiss_index.remove_ids(np.array(ids_to_delete, dtype=np.int64))

    # Remove bookmarks from the list (reverse order to avoid index shifting)
    for idx in sorted(indices_to_delete, reverse=True):
        bookmarks.pop(idx)

    message = "🗑️ Selected bookmarks deleted successfully."
    logger.info(message)
    choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
               for i, bookmark in enumerate(bookmarks)]

    # Update state
    state_bookmarks = bookmarks.copy()
    return message, gr.update(choices=choices), display_bookmarks()
def edit_selected_bookmarks_category(selected_indices, new_category, state_bookmarks):
    """
    Edit the category of the selected bookmarks.
    """
    if not selected_indices:
        return "⚠️ No bookmarks selected.", gr.update(choices=[]), display_bookmarks(), state_bookmarks
    if not new_category:
        return "⚠️ No new category selected.", gr.update(choices=[]), display_bookmarks(), state_bookmarks

    indices = [int(s.split('.')[0]) - 1 for s in selected_indices]
    for idx in indices:
        if 0 <= idx < len(bookmarks):
            bookmarks[idx]['category'] = new_category
            logger.info(f"Updated category for bookmark {idx + 1} to {new_category}")

    message = "✏️ Category updated for selected bookmarks."
    logger.info(message)

    # Update choices and display
    choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
               for i, bookmark in enumerate(bookmarks)]

    # Update state
    state_bookmarks = bookmarks.copy()
    return message, gr.update(choices=choices), display_bookmarks(), state_bookmarks
def export_bookmarks():
    """
    Export bookmarks to a Netscape-format HTML file.
    """
    if not bookmarks:
        logger.warning("No bookmarks to export")
        return None
    try:
        logger.info("Exporting bookmarks to HTML")
        soup = BeautifulSoup("<!DOCTYPE NETSCAPE-Bookmark-file-1><Title>Bookmarks</Title><H1>Bookmarks</H1>", 'html.parser')
        dl = soup.new_tag('DL')
        for bookmark in bookmarks:
            dt = soup.new_tag('DT')
            a = soup.new_tag('A', href=bookmark['url'])
            a.string = bookmark['title']
            dt.append(a)
            dl.append(dt)
        soup.append(dl)
        html_content = str(soup)
        output_file = "exported_bookmarks.html"
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(html_content)
        logger.info("Bookmarks exported successfully")
        return output_file
    except Exception as e:
        logger.error(f"Error exporting bookmarks: {e}", exc_info=True)
        return None
def chatbot_response(user_query, chat_history):
    """
    Generate a chatbot response using the FAISS index and embeddings.
    """
    if not bookmarks or faiss_index is None:
        logger.warning("No bookmarks available for chatbot")
        chat_history.append({"role": "assistant", "content": "⚠️ No bookmarks available. Please upload and process your bookmarks first."})
        return chat_history

    logger.info(f"Chatbot received query: {user_query}")
    try:
        chat_history.append({"role": "user", "content": user_query})

        # Rate limiting: one request plus a rough token estimate
        # (max_tokens=300 for the response; prompt tokens are ignored)
        rpm_bucket.wait_for_token()
        total_tokens = 300
        tpm_bucket.wait_for_token(tokens=total_tokens)

        query_vector = embedding_model.encode([user_query]).astype('float32')
        k = 5
        distances, ids = faiss_index.search(query_vector, k)
        ids = ids.flatten()

        id_to_bookmark = {bookmark['id']: bookmark for bookmark in bookmarks}
        matching_bookmarks = [id_to_bookmark.get(id) for id in ids
                              if id in id_to_bookmark and id_to_bookmark.get(id).get('summary')]
        if not matching_bookmarks:
            answer = "No relevant bookmarks found for your query."
            chat_history.append({"role": "assistant", "content": answer})
            return chat_history

        bookmarks_info = "\n".join([
            f"Title: {bookmark['title']}\nURL: {bookmark['url']}\nSummary: {bookmark['summary']}"
            for bookmark in matching_bookmarks
        ])
        prompt = f"""
A user asked: "{user_query}"
Based on the bookmarks below, provide a helpful answer to the user's query, referencing the relevant bookmarks.
Bookmarks:
{bookmarks_info}
Provide a concise and helpful response.
"""
        response = openai.ChatCompletion.create(
            model='llama-3.1-70b-versatile',  # Ensure this model is available to your account
            messages=[
                {"role": "user", "content": prompt}
            ],
            max_tokens=300,
            temperature=0.7,
        )
        answer = response['choices'][0]['message']['content'].strip()
        logger.info("Chatbot response generated")
        chat_history.append({"role": "assistant", "content": answer})
        return chat_history
    except openai.error.RateLimitError as e:
        wait_time = int(e.headers.get("Retry-After", 5))
        logger.warning(f"Rate limit reached. Waiting for {wait_time} seconds before retrying...")
        time.sleep(wait_time)
        return chatbot_response(user_query, chat_history)
    except Exception as e:
        error_message = f"⚠️ Error processing your query: {str(e)}"
        logger.error(error_message, exc_info=True)
        chat_history.append({"role": "assistant", "content": error_message})
        return chat_history
def build_app():
    """
    Build and launch the Gradio app.
    """
    try:
        logger.info("Building Gradio app")
        with gr.Blocks(css="app.css") as demo:
            # Initialize state
            state_bookmarks = gr.State([])

            # General overview
            gr.Markdown("""
            # 📚 SmartMarks - AI Browser Bookmarks Manager

            Welcome to **SmartMarks**, your intelligent assistant for managing browser bookmarks. SmartMarks leverages AI to help you organize, search, and interact with your bookmarks seamlessly.

            ---

            ## 🚀 **How to Use SmartMarks**

            SmartMarks is divided into three main sections:

            1. **📂 Upload and Process Bookmarks:** Import your existing bookmarks and let SmartMarks analyze and categorize them for you.
            2. **💬 Chat with Bookmarks:** Interact with your bookmarks using natural language queries to find relevant links effortlessly.
            3. **🛠️ Manage Bookmarks:** View, edit, delete, and export your bookmarks with ease.

            Navigate through the tabs to explore each feature in detail.
            """)
            # Upload and Process Bookmarks tab
            with gr.Tab("Upload and Process Bookmarks"):
                gr.Markdown("""
                ## 📂 **Upload and Process Bookmarks**

                ### 📝 **Steps to Upload and Process:**

                1. **Upload Bookmarks File:**
                   - Click on the **"📁 Upload Bookmarks HTML File"** button.
                   - Select your browser's exported bookmarks HTML file from your device.

                2. **Process Bookmarks:**
                   - After uploading, click on the **"⚙️ Process Bookmarks"** button.
                   - SmartMarks will parse your bookmarks, fetch additional information, generate summaries, and categorize each link based on predefined categories.

                3. **View Processed Bookmarks:**
                   - Once processing is complete, your bookmarks will be displayed in an organized and visually appealing format below.
                """)

                upload = gr.File(label="📁 Upload Bookmarks HTML File", type='binary')
                process_button = gr.Button("⚙️ Process Bookmarks")
                output_text = gr.Textbox(label="✅ Output", interactive=False)
                bookmark_display = gr.HTML(label="📄 Processed Bookmarks")
            # Chat with Bookmarks tab
            with gr.Tab("Chat with Bookmarks"):
                gr.Markdown("""
                ## 💬 **Chat with Bookmarks**

                ### 🤖 **How to Interact:**

                1. **Enter Your Query:**
                   - In the **"✍️ Ask about your bookmarks"** textbox, type your question or keyword related to your bookmarks.

                2. **Submit Your Query:**
                   - Click the **"📨 Send"** button to submit your query.

                3. **Receive AI-Driven Responses:**
                   - SmartMarks will analyze your query and provide relevant bookmarks that match your request.

                4. **View Chat History:**
                   - All your queries and the corresponding AI responses are displayed in the chat history.
                """)

                chatbot = gr.Chatbot(label="💬 Chat with SmartMarks", type='messages')
                user_input = gr.Textbox(
                    label="✍️ Ask about your bookmarks",
                    placeholder="e.g., Do I have any bookmarks about AI?"
                )
                chat_button = gr.Button("📨 Send")

                chat_button.click(
                    chatbot_response,
                    inputs=[user_input, chatbot],
                    outputs=chatbot
                )
            # Manage Bookmarks tab
            with gr.Tab("Manage Bookmarks"):
                gr.Markdown("""
                ## 🛠️ **Manage Bookmarks**

                ### 🗂️ **Features:**

                1. **View Bookmarks:**
                   - All your processed bookmarks are displayed here with their respective categories and summaries.

                2. **Select Bookmarks:**
                   - Use the checkboxes next to each bookmark to select one, multiple, or all bookmarks you wish to manage.

                3. **Delete Selected Bookmarks:**
                   - After selecting the desired bookmarks, click the **"🗑️ Delete Selected"** button to remove them from your list.

                4. **Edit Categories:**
                   - Select the bookmarks you want to re-categorize.
                   - Choose a new category from the dropdown menu labeled **"🆕 New Category"**.
                   - Click the **"✏️ Edit Category"** button to update their categories.

                5. **Export Bookmarks:**
                   - Click the **"💾 Export"** button to download your updated bookmarks as an HTML file.

                6. **Refresh Bookmarks:**
                   - Click the **"🔄 Refresh Bookmarks"** button to ensure the latest state is reflected in the display.
                """)

                manage_output = gr.Textbox(label="🔔 Status", interactive=False)

                # CheckboxGroup for selecting bookmarks
                bookmark_selector = gr.CheckboxGroup(
                    label="✅ Select Bookmarks",
                    choices=[]
                )
                new_category = gr.Dropdown(
                    label="🆕 New Category",
                    choices=CATEGORIES,
                    value="Uncategorized"
                )
                bookmark_display_manage = gr.HTML(label="📚 Bookmarks")

                with gr.Row():
                    delete_button = gr.Button("🗑️ Delete Selected")
                    edit_category_button = gr.Button("✏️ Edit Category")
                    export_button = gr.Button("💾 Export")
                    refresh_button = gr.Button("🔄 Refresh Bookmarks")

                download_link = gr.File(label="📥 Download Exported Bookmarks")
            # Connect the button actions
            process_button.click(
                process_uploaded_file,
                inputs=[upload, state_bookmarks],
                outputs=[output_text, bookmark_display, state_bookmarks, bookmark_display, bookmark_selector]
            )
            delete_button.click(
                delete_selected_bookmarks,
                inputs=[bookmark_selector, state_bookmarks],
                outputs=[manage_output, bookmark_selector, bookmark_display_manage]
            )
            edit_category_button.click(
                edit_selected_bookmarks_category,
                inputs=[bookmark_selector, new_category, state_bookmarks],
                outputs=[manage_output, bookmark_selector, bookmark_display_manage, state_bookmarks]
            )
            export_button.click(
                export_bookmarks,
                outputs=download_link
            )
            # Refresh rebuilds the selector choices (wrapped in gr.update so the
            # list replaces the choices rather than being set as the selected
            # value) and regenerates the bookmark display.
            refresh_button.click(
                lambda state_bookmarks: (
                    gr.update(choices=[
                        f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
                        for i, bookmark in enumerate(state_bookmarks)
                    ]),
                    display_bookmarks()
                ),
                inputs=[state_bookmarks],
                outputs=[bookmark_selector, bookmark_display_manage]
            )

        logger.info("Launching Gradio app")
        demo.launch(debug=True)
    except Exception as e:
        logger.error(f"Error building Gradio app: {e}", exc_info=True)
        print(f"Error building Gradio app: {e}")
if __name__ == "__main__":
    # Start the LLM worker thread before launching the app
    llm_thread = threading.Thread(target=llm_worker, daemon=True)
    llm_thread.start()
    build_app()