# app.py
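"""
Bookmark Manager Space: parses an exported bookmarks HTML file, fetches each URL
asynchronously, summarizes and categorizes pages with a Groq-hosted LLM, indexes the
summaries with FAISS for semantic search, and serves everything through a Gradio UI
(processing, category editing, deletion, export, and a chatbot).
"""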

import gradio as gr
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import asyncio
import aiohttp
import re
import base64
import logging
import os
import sys
import urllib.parse

# Import OpenAI library
import openai
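# NOTE: the Space's requirements.txt is not shown here; based on these imports it
# presumably pins something like gradio (3.x), beautifulsoup4, sentence-transformers,
# faiss-cpu, numpy, aiohttp, and openai (a pre-1.0 release, see the Groq setup below).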

# Set up logging to output to the console
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Create a console handler
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(logging.INFO)

# Create a formatter and set it for the handler
formatter = logging.Formatter('%(asctime)s %(levelname)s %(name)s %(message)s')
console_handler.setFormatter(formatter)

# Add the handler to the logger
logger.addHandler(console_handler)

# Initialize models and variables
logger.info("Initializing models and variables")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
faiss_index = None
bookmarks = []
fetch_cache = {}
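# all-MiniLM-L6-v2 produces 384-dimensional sentence embeddings; the FAISS index built
# in vectorize_and_index() takes its dimension directly from these vectors.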

# Define the categories
CATEGORIES = [
    "Social Media",
    "News and Media",
    "Education and Learning",
    "Entertainment",
    "Shopping and E-commerce",
    "Finance and Banking",
    "Technology",
    "Health and Fitness",
    "Travel and Tourism",
    "Food and Recipes",
    "Sports",
    "Arts and Culture",
    "Government and Politics",
    "Business and Economy",
    "Science and Research",
    "Personal Blogs and Journals",
    "Job Search and Careers",
    "Music and Audio",
    "Videos and Movies",
    "Reference and Knowledge Bases",
    "Dead Link",
    "Uncategorized",
]

# Set up Groq Cloud API key and base URL
GROQ_API_KEY = os.getenv('GROQ_API_KEY')
if not GROQ_API_KEY:
    logger.error("GROQ_API_KEY environment variable not set.")

# Set OpenAI API key and base URL to use Groq Cloud API
openai.api_key = GROQ_API_KEY
openai.api_base = "https://api.groq.com/openai/v1"
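# This points the legacy OpenAI Python client (pre-1.0: openai.api_base plus
# openai.ChatCompletion.create) at Groq's OpenAI-compatible endpoint. With openai>=1.0
# the equivalent would be an explicit client, e.g.
#   client = openai.OpenAI(api_key=GROQ_API_KEY, base_url="https://api.groq.com/openai/v1")
# but this file assumes the older interface throughout.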

def determine_page_type(soup, url):
    """
    Determine the type of webpage for better content extraction.
    """
    url_lower = url.lower()

    # Check for common platforms
    if 'facebook.com' in url_lower:
        return 'social_media_profile'
    elif 'wikipedia.org' in url_lower:
        return 'wiki_article'
    elif any(domain in url_lower for domain in ['news', 'huffpost', 'times']):
        return 'news_article'
    elif 'youtube.com' in url_lower:
        return 'video_platform'
    elif '.gov' in url_lower or 'government' in url_lower:
        return 'government_site'
    elif 'x.com' in url_lower or 'twitter.com' in url_lower:
        return 'social_media_platform'

    # Check page structure (class-name hints; 'shop', 'forum', etc. are not HTML tags)
    if soup.find('article'):
        return 'article'
    elif soup.find(class_=lambda x: x and any(c in str(x).lower()
                                              for c in ['shop', 'product', 'price'])):
        return 'ecommerce'
    elif soup.find(class_=lambda x: x and any(c in str(x).lower()
                                              for c in ['forum', 'comment', 'discussion'])):
        return 'forum'
    return 'general'

def extract_main_content_by_type(soup, page_type):
    """
    Extract content based on page type for better relevance.
    """
    if not soup:
        return ""

    content = ""

    if page_type == 'news_article':
        # Try to find the main article content
        article_body = soup.find(['article', 'main', 'div'],
                                 class_=lambda x: x and any(c in str(x).lower()
                                                            for c in ['article', 'story', 'content', 'body']))
        if article_body:
            # Get first few paragraphs
            paragraphs = article_body.find_all('p')
            content = ' '.join(p.get_text() for p in paragraphs[:5])

    elif page_type == 'wiki_article':
        # For Wikipedia articles
        content_div = soup.find('div', {'id': 'mw-content-text'})
        if content_div:
            paragraphs = content_div.find_all('p')
            content = ' '.join(p.get_text() for p in paragraphs[:3])

    elif page_type in ['social_media_profile', 'social_media_platform']:
        # For social media pages
        about_section = soup.find(['div', 'section'],
                                  class_=lambda x: x and any(c in str(x).lower()
                                                             for c in ['about', 'bio', 'profile', 'description']))
        if about_section:
            content = about_section.get_text()

    else:
        # Try to get the main content area (guard against find() returning None)
        main_area = soup.find(['div', 'main'],
                              class_=lambda x: x and 'content' in str(x).lower())
        if main_area:
            content = main_area.get_text()

    # If no content found using specific extractors, use general extraction
    if not content.strip():
        # Remove unwanted elements
        for element in soup(['script', 'style', 'nav', 'footer', 'header']):
            element.decompose()
        # Try to find main content area
        main_content = soup.find(['main', 'article', 'div'],
                                 class_=lambda x: x and 'content' in str(x).lower())
        if main_content:
            # Get all text from paragraphs
            paragraphs = main_content.find_all('p')
            content = ' '.join(p.get_text() for p in paragraphs)
        else:
            # Fallback to body content
            content = soup.get_text()

    # Clean the extracted content
    content = clean_text(content)
    return content[:5000]  # Limit content length

def clean_text(text):
    """
    Clean extracted text content.
    """
    if not text:
        return ""

    # Convert to string if necessary
    text = str(text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    # Remove special characters but keep basic punctuation
    text = re.sub(r'[^\w\s.,!?-]', '', text)
    # Remove multiple punctuation
    text = re.sub(r'([.,!?])\1+', r'\1', text)
    # Remove very short words (likely garbage)
    text = ' '.join(word for word in text.split() if len(word) > 1)
    return text.strip()

def get_page_metadata(soup):
    """
    Extract metadata from the webpage including title, description, and keywords.
    """
    metadata = {
        'title': '',
        'description': '',
        'keywords': ''
    }
    if not soup:
        return metadata

    # Get title (try multiple sources)
    title_tag = soup.find('title')
    og_title = soup.find('meta', {'property': 'og:title'})
    twitter_title = soup.find('meta', {'name': 'twitter:title'})
    if title_tag and title_tag.string:
        metadata['title'] = title_tag.string.strip()
    elif og_title and og_title.get('content'):
        metadata['title'] = og_title.get('content').strip()
    elif twitter_title and twitter_title.get('content'):
        metadata['title'] = twitter_title.get('content').strip()

    # Get meta description (try multiple sources)
    desc_sources = [
        ('meta', {'name': 'description'}),
        ('meta', {'property': 'og:description'}),
        ('meta', {'name': 'twitter:description'}),
    ]
    for tag, attrs in desc_sources:
        desc = soup.find(tag, attrs=attrs)
        if desc and desc.get('content'):
            metadata['description'] = desc.get('content').strip()
            break

    # Get meta keywords
    keywords_tag = soup.find('meta', {'name': 'keywords'})
    if keywords_tag and keywords_tag.get('content'):
        metadata['keywords'] = keywords_tag.get('content').strip()
    return metadata

def generate_contextual_summary(context):
    """
    Generate summary with context awareness using LLM.
    """
    page_type = context['page_type']

    # Customize prompt based on page type
    type_specific_prompts = {
        'news_article': "This is a news article. Focus on the main news event, key facts, and significance.",
        'wiki_article': "This is a Wikipedia article. Focus on the main topic, key facts, and historical context.",
        'social_media_profile': "This is a social media profile. Focus on the platform's purpose and key features.",
        'social_media_platform': "This is a social media platform. Describe its main purpose and unique features.",
        'ecommerce': "This is an e-commerce site. Focus on what products/services are offered and target audience.",
        'government_site': "This is a government website. Focus on services offered and public information provided.",
        'video_platform': "This is a video platform. Describe its main purpose and content sharing features.",
        'general': "Describe the main purpose and key features of this webpage."
    }

    prompt = f"""
Analyze this webpage and create a clear, factual summary:

Title: {context['title']}
Type: {page_type}
Description: {context['description']}
Keywords: {context['keywords']}

Additional Content:
{context['content'][:3000]}

{type_specific_prompts.get(page_type, type_specific_prompts['general'])}

Create a natural, informative 2-3 sentence summary that:
1. States the primary purpose/main topic
2. Mentions key features or information
3. Indicates target audience or use case (if clear)

Keep the tone professional and factual.
"""

    try:
        response = openai.ChatCompletion.create(
            model='llama3-8b-8192',
            messages=[
                {"role": "system", "content": "You are a precise webpage summarizer that creates clear, accurate summaries."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=150,
            temperature=0.3,
        )
        return response['choices'][0]['message']['content'].strip()
    except Exception as e:
        logger.error(f"Error generating LLM summary: {e}")
        return None

def generate_summary(bookmark):
    """
    Generate a comprehensive summary for a bookmark using available content and LLM.
    """
    logger.info(f"Generating summary for {bookmark.get('url')}")
    try:
        soup = BeautifulSoup(bookmark.get('html_content', ''), 'html.parser')

        # 1. Extract all available metadata
        metadata = get_page_metadata(soup)

        # 2. Determine page type and context
        page_type = determine_page_type(soup, bookmark['url'])

        # 3. Extract relevant content based on page type
        main_content = extract_main_content_by_type(soup, page_type)

        # 4. Generate summary using LLM with contextual awareness
        try:
            context = {
                'title': metadata['title'] or bookmark.get('title', ''),
                'description': metadata['description'],
                'keywords': metadata['keywords'],
                'page_type': page_type,
                'content': main_content
            }
            summary = generate_contextual_summary(context)
            if summary:
                bookmark['summary'] = summary
                return bookmark
        except Exception as e:
            logger.error(f"Error in LLM summary generation: {e}")

        # Fallback mechanism
        if metadata['description']:
            bookmark['summary'] = metadata['description']
        elif main_content:
            bookmark['summary'] = ' '.join(main_content.split()[:50]) + '...'
        else:
            bookmark['summary'] = metadata['title'] or bookmark.get('title', 'No summary available.')
    except Exception as e:
        logger.error(f"Error in generate_summary: {e}")
        bookmark['summary'] = bookmark.get('title', 'No summary available.')
    return bookmark

async def fetch_url_info(session, bookmark):
    """
    Enhanced URL fetching with better error handling and request configuration.
    """
    url = bookmark['url']
    if url in fetch_cache:
        bookmark.update(fetch_cache[url])
        return bookmark

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Cache-Control': 'max-age=0'
    }

    try:
        logger.info(f"Fetching URL info for: {url}")
        timeout = aiohttp.ClientTimeout(total=30)
        async with session.get(
            url,
            timeout=timeout,
            headers=headers,
            ssl=False,
            allow_redirects=True
        ) as response:
            status = response.status
            bookmark['status_code'] = status
            bookmark['etag'] = response.headers.get('ETag', 'N/A')

            # Handle different status codes
            if status == 200:
                content = await response.text()
                bookmark['html_content'] = content
                bookmark['dead_link'] = False
                bookmark['description'] = ''  # Will be set by generate_summary
                logger.info(f"Successfully fetched content for {url}")
            elif status in [301, 302, 307, 308]:
                # Handle redirects manually if needed
                bookmark['dead_link'] = False
                bookmark['html_content'] = ''
                logger.info(f"Redirect detected for {url}")
            else:
                bookmark['dead_link'] = True
                bookmark['html_content'] = ''
                logger.warning(f"Non-success status {status} for {url}")
    except asyncio.TimeoutError:
        logger.warning(f"Timeout while fetching {url}")
        bookmark['dead_link'] = False  # Don't mark as dead just because of timeout
        bookmark['status_code'] = 'Timeout'
    except Exception as e:
        logger.error(f"Error fetching {url}: {str(e)}")
        bookmark['dead_link'] = False  # Don't mark as dead for other errors
        bookmark['status_code'] = str(e)
    finally:
        # Ensure all required fields are present
        bookmark.setdefault('html_content', '')
        bookmark.setdefault('description', '')
        bookmark.setdefault('etag', 'N/A')
        # Update cache
        fetch_cache[url] = {
            'etag': bookmark.get('etag'),
            'status_code': bookmark.get('status_code'),
            'dead_link': bookmark.get('dead_link'),
            'description': bookmark.get('description'),
            'html_content': bookmark.get('html_content', '')
        }
    return bookmark

async def process_bookmarks_async(bookmarks_list):
    """
    Process all bookmarks asynchronously with improved error handling.
    """
    logger.info("Processing bookmarks asynchronously")
    try:
        # Configure connection pool and timeout
        tcp_connector = aiohttp.TCPConnector(
            limit=5,                     # Limit concurrent connections
            force_close=True,            # Force close connections
            enable_cleanup_closed=True,  # Clean up closed connections
            ssl=False                    # Disable SSL verification
        )
        timeout = aiohttp.ClientTimeout(total=30)
        async with aiohttp.ClientSession(
            connector=tcp_connector,
            timeout=timeout,
            raise_for_status=False  # Don't raise exceptions for non-200 status
        ) as session:
            tasks = []
            for bookmark in bookmarks_list:
                task = asyncio.ensure_future(fetch_url_info(session, bookmark))
                tasks.append(task)

            # Process bookmarks in batches to avoid overwhelming servers
            batch_size = 5
            for i in range(0, len(tasks), batch_size):
                batch = tasks[i:i + batch_size]
                await asyncio.gather(*batch)
                await asyncio.sleep(1)  # Small delay between batches
        logger.info("Completed processing bookmarks asynchronously")
    except Exception as e:
        logger.error(f"Error in asynchronous processing of bookmarks: {e}")
        raise

def parse_bookmarks(file_content):
    """
    Parse bookmarks from HTML file with enhanced error handling.
    """
    logger.info("Parsing bookmarks")
    try:
        soup = BeautifulSoup(file_content, 'html.parser')
        extracted_bookmarks = []

        # Find all bookmark links
        for link in soup.find_all('a'):
            url = link.get('href', '').strip()
            title = link.text.strip()

            # Validate URL and title
            if url and title and url.startswith(('http://', 'https://')):
                # Clean and normalize URL
                parsed_url = urllib.parse.urlparse(url)
                normalized_url = urllib.parse.urlunparse(parsed_url)
                bookmark = {
                    'url': normalized_url,
                    'title': title,
                    'add_date': link.get('add_date', ''),
                    'icon': link.get('icon', '')
                }
                extracted_bookmarks.append(bookmark)

        logger.info(f"Extracted {len(extracted_bookmarks)} valid bookmarks")
        return extracted_bookmarks
    except Exception as e:
        logger.error(f"Error parsing bookmarks: {e}")
        raise

def vectorize_and_index(bookmarks_list):
    """
    Create vector embeddings for bookmarks and build FAISS index.
    """
    logger.info("Vectorizing summaries and building FAISS index")
    try:
        # Prepare summaries for vectorization
        summaries = []
        for bookmark in bookmarks_list:
            summary = bookmark.get('summary', '').strip()
            title = bookmark.get('title', '').strip()
            # Combine title and summary for better embedding
            text = f"{title} {summary}".strip()
            summaries.append(text if text else "No content available")

        # Generate embeddings
        embeddings = embedding_model.encode(summaries)

        # Create and configure FAISS index
        dimension = embeddings.shape[1]
        faiss_idx = faiss.IndexFlatL2(dimension)
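        # IndexFlatL2 performs exact (brute-force) L2 search, which is plenty fast for a
        # personal bookmark collection; approximate indexes only matter at far larger scale.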

        # Add vectors to index
        faiss_idx.add(np.array(embeddings))
        logger.info("FAISS index built successfully")
        return faiss_idx, embeddings
    except Exception as e:
        logger.error(f"Error in vectorizing and indexing: {e}")
        raise

def display_bookmarks():
    """
    Generate HTML display for bookmarks with enhanced styling.
    """
    logger.info("Generating HTML display for bookmarks")
    cards = ''
    for i, bookmark in enumerate(bookmarks):
        index = i + 1
        status = "❌ Dead Link" if bookmark.get('dead_link') else "✅ Active"
        title = bookmark['title']
        url = bookmark['url']
        etag = bookmark.get('etag', 'N/A')
        summary = bookmark.get('summary', '')
        category = bookmark.get('category', 'Uncategorized')
        status_code = bookmark.get('status_code', 'N/A')

        # Enhanced styling based on status
        if bookmark.get('dead_link'):
            card_style = "border: 2px solid #ff4444; background-color: rgba(255, 68, 68, 0.1);"
            text_style = "color: #ff4444;"
        else:
            card_style = "border: 2px solid #00C851; background-color: rgba(0, 200, 81, 0.1);"
            text_style = "color: var(--text-color);"

        # Properly escape any backslashes if present in summary or other fields
        # (Not strictly necessary here, but good practice)
        summary_escaped = summary.replace('\\', '\\\\')

        card_html = f'''
        <div class="card" style="{card_style} padding: 15px; margin: 15px 0; border-radius: 8px; box-shadow: 0 2px 5px rgba(0,0,0,0.1);">
            <div class="card-content">
                <h3 style="{text_style} margin-bottom: 10px; font-size: 1.2em;">
                    {index}. {title} {status}
                    {f'<span style="font-size: 0.8em; color: #666;">({status_code})</span>' if status_code != 'N/A' else ''}
                </h3>
                <p style="{text_style}"><strong>Category:</strong> {category}</p>
                <p style="{text_style}"><strong>URL:</strong> <a href="{url}" target="_blank" style="{text_style}">{url}</a></p>
                <p style="{text_style}"><strong>ETag:</strong> {etag}</p>
                <p style="{text_style}"><strong>Summary:</strong> {summary_escaped}</p>
            </div>
        </div>
        '''
        cards += card_html

    # Add container with max width and padding
    display_html = f'''
    <div style="max-width: 1200px; margin: 0 auto; padding: 20px;">
        {cards}
    </div>
    '''
    logger.info("HTML display generated")
    return display_html

def assign_category(bookmark):
    """
    Assign a category to a bookmark based on its title or summary.
    This is a simple implementation and can be enhanced with more sophisticated methods.
    """
    title = bookmark.get('title', '').lower()
    summary = bookmark.get('summary', '').lower()

    # Simple keyword-based categorization
    if any(keyword in title or keyword in summary for keyword in ['facebook', 'twitter', 'instagram']):
        bookmark['category'] = 'Social Media'
    elif any(keyword in title or keyword in summary for keyword in ['news', 'media', 'huffpost', 'times']):
        bookmark['category'] = 'News and Media'
    elif any(keyword in title or keyword in summary for keyword in ['course', 'learning', 'education']):
        bookmark['category'] = 'Education and Learning'
    elif any(keyword in title or keyword in summary for keyword in ['movie', 'music', 'audio', 'video']):
        bookmark['category'] = 'Entertainment'
    elif any(keyword in title or keyword in summary for keyword in ['shop', 'e-commerce', 'buy', 'purchase']):
        bookmark['category'] = 'Shopping and E-commerce'
    elif any(keyword in title or keyword in summary for keyword in ['finance', 'banking', 'investment']):
        bookmark['category'] = 'Finance and Banking'
    elif any(keyword in title or keyword in summary for keyword in ['tech', 'technology', 'software']):
        bookmark['category'] = 'Technology'
    elif any(keyword in title or keyword in summary for keyword in ['health', 'fitness', 'wellness']):
        bookmark['category'] = 'Health and Fitness'
    elif any(keyword in title or keyword in summary for keyword in ['travel', 'tourism', 'flight', 'hotel']):
        bookmark['category'] = 'Travel and Tourism'
    elif any(keyword in title or keyword in summary for keyword in ['recipe', 'food', 'cooking']):
        bookmark['category'] = 'Food and Recipes'
    elif any(keyword in title or keyword in summary for keyword in ['sport', 'game', 'fitness']):
        bookmark['category'] = 'Sports'
    elif any(keyword in title or keyword in summary for keyword in ['art', 'culture', 'museum']):
        bookmark['category'] = 'Arts and Culture'
    elif any(keyword in title or keyword in summary for keyword in ['gov', 'government', 'politics']):
        bookmark['category'] = 'Government and Politics'
    elif any(keyword in title or keyword in summary for keyword in ['business', 'economy', 'market']):
        bookmark['category'] = 'Business and Economy'
    elif any(keyword in title or keyword in summary for keyword in ['science', 'research', 'study']):
        bookmark['category'] = 'Science and Research'
    elif any(keyword in title or keyword in summary for keyword in ['blog', 'journal']):
        bookmark['category'] = 'Personal Blogs and Journals'
    elif any(keyword in title or keyword in summary for keyword in ['job', 'career', 'employment']):
        bookmark['category'] = 'Job Search and Careers'
    elif any(keyword in title or keyword in summary for keyword in ['audio', 'music']):
        bookmark['category'] = 'Music and Audio'
    elif any(keyword in title or keyword in summary for keyword in ['video', 'movie']):
        bookmark['category'] = 'Videos and Movies'
    elif any(keyword in title or keyword in summary for keyword in ['reference', 'knowledge', 'wiki']):
        bookmark['category'] = 'Reference and Knowledge Bases'
    elif bookmark.get('dead_link'):
        bookmark['category'] = 'Dead Link'
    else:
        bookmark['category'] = 'Uncategorized'

def process_uploaded_file(file, delete_checkbox, edit_checkbox):
    """
    Process the uploaded bookmarks file with enhanced error handling and user feedback.
    """
    global bookmarks, faiss_index
    logger.info("Processing uploaded file")

    # The last two outputs are wired to CheckboxGroups in build_app, so every return
    # path updates CheckboxGroups (not Dropdowns).
    if file is None:
        return "⚠️ Please upload a bookmarks HTML file.", '', gr.CheckboxGroup.update(choices=[]), gr.CheckboxGroup.update(choices=[])

    try:
        file_content = file.decode('utf-8')
    except UnicodeDecodeError as e:
        logger.error(f"Error decoding file: {e}")
        return "⚠️ Error decoding file. Please ensure it's a valid HTML file.", '', gr.CheckboxGroup.update(choices=[]), gr.CheckboxGroup.update(choices=[])

    try:
        bookmarks = parse_bookmarks(file_content)
    except Exception as e:
        logger.error(f"Error parsing bookmarks: {e}")
        return "⚠️ Error parsing the bookmarks file.", '', gr.CheckboxGroup.update(choices=[]), gr.CheckboxGroup.update(choices=[])

    if not bookmarks:
        return "⚠️ No valid bookmarks found in the file.", '', gr.CheckboxGroup.update(choices=[]), gr.CheckboxGroup.update(choices=[])

    try:
        logger.info("Processing bookmarks...")
        asyncio.run(process_bookmarks_async(bookmarks))

        # Process in batches for progress tracking
        total = len(bookmarks)
        for i, bookmark in enumerate(bookmarks, 1):
            generate_summary(bookmark)
            assign_category(bookmark)
            logger.info(f"Processed bookmark {i}/{total}")

        faiss_index, embeddings = vectorize_and_index(bookmarks)
        message = f"✅ Successfully processed {len(bookmarks)} bookmarks!"
        choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
                   for i, bookmark in enumerate(bookmarks)]
        bookmark_html = display_bookmarks()
        return message, bookmark_html, gr.CheckboxGroup.update(choices=choices), gr.CheckboxGroup.update(choices=choices)
    except Exception as e:
        logger.error(f"Error processing bookmarks: {e}")
        return "⚠️ Error processing bookmarks. Please try again.", '', gr.CheckboxGroup.update(choices=[]), gr.CheckboxGroup.update(choices=[])

def delete_selected_bookmarks(selected_indices, edit_checkbox):
    """
    Delete selected bookmarks with enhanced error handling.

    The click handler wires [delete_checkbox, edit_checkbox] as inputs, so this takes
    exactly two values; edit_checkbox is accepted but unused here.
    """
    global bookmarks, faiss_index
    if not selected_indices:
        return "⚠️ No bookmarks selected.", gr.CheckboxGroup.update(choices=[]), gr.CheckboxGroup.update(choices=[]), display_bookmarks()

    try:
        indices = [int(s.split('.')[0]) - 1 for s in selected_indices]
        indices = sorted(indices, reverse=True)
        deleted_count = 0
        for idx in indices:
            if 0 <= idx < len(bookmarks):
                logger.info(f"Deleting bookmark: {bookmarks[idx]['title']}")
                bookmarks.pop(idx)
                deleted_count += 1

        if bookmarks:
            faiss_index, embeddings = vectorize_and_index(bookmarks)
        else:
            faiss_index = None

        message = f"✅ Successfully deleted {deleted_count} bookmark{'s' if deleted_count != 1 else ''}."
        choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
                   for i, bookmark in enumerate(bookmarks)]
        return message, gr.CheckboxGroup.update(choices=choices), gr.CheckboxGroup.update(choices=choices), display_bookmarks()
    except Exception as e:
        logger.error(f"Error deleting bookmarks: {e}")
        return "⚠️ Error deleting bookmarks.", gr.CheckboxGroup.update(choices=[]), gr.CheckboxGroup.update(choices=[]), display_bookmarks()

def edit_selected_bookmarks_category(selected_indices, new_category, delete_checkbox, edit_checkbox):
    """
    Edit category of selected bookmarks with enhanced error handling.
    """
    if not selected_indices:
        return "⚠️ No bookmarks selected.", gr.CheckboxGroup.update(choices=[]), gr.CheckboxGroup.update(choices=[]), display_bookmarks()
    if not new_category:
        return "⚠️ No new category selected.", gr.CheckboxGroup.update(choices=[]), gr.CheckboxGroup.update(choices=[]), display_bookmarks()

    try:
        indices = [int(s.split('.')[0]) - 1 for s in selected_indices]
        updated_count = 0
        for idx in indices:
            if 0 <= idx < len(bookmarks):
                old_category = bookmarks[idx]['category']
                bookmarks[idx]['category'] = new_category
                logger.info(f"Updated category for '{bookmarks[idx]['title']}' from '{old_category}' to '{new_category}'")
                updated_count += 1

        message = f"✅ Updated category for {updated_count} bookmark{'s' if updated_count != 1 else ''}."
        choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
                   for i, bookmark in enumerate(bookmarks)]
        return message, gr.CheckboxGroup.update(choices=choices), gr.CheckboxGroup.update(choices=choices), display_bookmarks()
    except Exception as e:
        logger.error(f"Error updating categories: {e}")
        return "⚠️ Error updating categories.", gr.CheckboxGroup.update(choices=[]), gr.CheckboxGroup.update(choices=[]), display_bookmarks()

def export_bookmarks():
    """
    Export bookmarks to HTML file with enhanced formatting.
    """
    if not bookmarks:
        return "⚠️ No bookmarks to export."

    try:
        logger.info("Exporting bookmarks")
        soup = BeautifulSoup("<!DOCTYPE NETSCAPE-Bookmark-file-1>", 'html.parser')

        # Add metadata
        meta = soup.new_tag('META')
        meta['HTTP-EQUIV'] = 'Content-Type'
        meta['CONTENT'] = 'text/html; charset=UTF-8'
        soup.append(meta)

        # Add title
        title = soup.new_tag('TITLE')
        title.string = 'Bookmarks'
        soup.append(title)

        # Add heading
        h1 = soup.new_tag('H1')
        h1.string = 'Bookmarks'
        soup.append(h1)

        # Create main bookmark list
        dl = soup.new_tag('DL')
        soup.append(dl)

        # Add bookmarks with categories
        current_category = None
        for bookmark in bookmarks:
            category = bookmark.get('category', 'Uncategorized')

            # Create category folder if needed
            if category != current_category:
                current_category = category
                dt_cat = soup.new_tag('DT')
                h3_cat = soup.new_tag('H3')
                h3_cat.string = category
                dt_cat.append(h3_cat)
                dl_cat = soup.new_tag('DL')
                dt_cat.append(dl_cat)
                dl.append(dt_cat)

            # Add bookmark
            dt = soup.new_tag('DT')
            a = soup.new_tag('A', href=bookmark['url'])
            if 'add_date' in bookmark and bookmark['add_date']:
                a['ADD_DATE'] = bookmark['add_date']
            if 'icon' in bookmark and bookmark['icon']:
                a['ICON'] = bookmark['icon']
            a.string = bookmark['title']
            dt.append(a)
            dl_cat.append(dt)

        html_content = str(soup)
        b64 = base64.b64encode(html_content.encode()).decode()
        href = f'data:text/html;base64,{b64}'
        logger.info("Bookmarks exported successfully")
        return f'''
        <div style="text-align: center;">
            <a href="{href}"
               download="bookmarks.html"
               style="display: inline-block;
                      padding: 10px 20px;
                      background-color: #4CAF50;
                      color: white;
                      text-decoration: none;
                      border-radius: 5px;
                      margin: 10px;">
                💾 Download Exported Bookmarks
            </a>
        </div>
        '''
    except Exception as e:
        logger.error(f"Error exporting bookmarks: {e}")
        return "⚠️ Error exporting bookmarks."

def chatbot_response(user_query):
    """
    Generate chatbot response with enhanced context understanding.
    """
    if not GROQ_API_KEY:
        return "⚠️ API key not set. Please set the GROQ_API_KEY environment variable."
    if not bookmarks:
        return "⚠️ No bookmarks available. Please upload and process your bookmarks first."

    logger.info(f"Processing query: {user_query}")
    try:
        # Get relevant bookmarks using FAISS
        query_embedding = embedding_model.encode([user_query]).astype('float32')
        k = min(5, len(bookmarks))  # Get top 5, or all if fewer than 5
        D, I = faiss_index.search(query_embedding, k)

        relevant_bookmarks = []
        for idx in I[0]:
            if idx != -1:  # Valid index
                bookmark_data = bookmarks[idx]
                relevant_bookmarks.append({
                    'title': bookmark_data['title'],
                    'url': bookmark_data['url'],
                    'summary': bookmark_data['summary'],
                    'category': bookmark_data['category']
                })

        # Prepare context for LLM
        bookmark_descriptions = []
        for i, bm in enumerate(relevant_bookmarks, 1):
            desc = f"{i}. Title: {bm['title']}\n URL: {bm['url']}\n Category: {bm['category']}\n Summary: {bm['summary']}"
            bookmark_descriptions.append(desc)

        # Precompute the joined descriptions to avoid backslashes in f-string expressions
        joined_bookmark_descriptions = '\n\n'.join(bookmark_descriptions)

        prompt = f"""
User Query: {user_query}

Relevant Bookmarks:
{joined_bookmark_descriptions}

Please provide a helpful response that:
1. Identifies the most relevant bookmarks for the query
2. Explains why each bookmark might be useful
3. Suggests how the user might use these resources

Format the response in a clear, readable way with appropriate spacing and structure.
"""
        response = openai.ChatCompletion.create(
            model='llama3-8b-8192',
            messages=[
                {"role": "system", "content": "You are a helpful assistant that finds and explains relevant bookmarks."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=500,
            temperature=0.7,
        )
        answer = response['choices'][0]['message']['content'].strip()
        logger.info("Generated response successfully")
        return answer
    except Exception as e:
        error_message = f"⚠️ Error processing your query: {str(e)}"
        logger.error(error_message)
        return error_message

def build_app():
    """
    Build and launch the Gradio app with enhanced UI and functionality.
    """
    try:
        logger.info("Building Gradio app")
        with gr.Blocks(css="app.css") as demo:
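            # Assumes a Gradio 3.x runtime: the gr.CheckboxGroup.update(...) helpers used
            # by the callbacks above were removed in Gradio 4 (which uses gr.update(...)
            # instead). css="app.css" likewise assumes an app.css file exists in the repo.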
            gr.Markdown("# 📚 Bookmark Manager")

            with gr.Row():
                with gr.Column():
                    # type="binary" so process_uploaded_file receives bytes it can .decode()
                    file_input = gr.File(label="Upload Bookmarks HTML File", file_types=[".html"], type="binary")
                    process_button = gr.Button("Process Bookmarks")
                    process_message = gr.Markdown("")
                    category_dropdown = gr.Dropdown(choices=CATEGORIES, label="New Category")
                    edit_button = gr.Button("Edit Selected Bookmarks Category")
                    delete_button = gr.Button("Delete Selected Bookmarks")
                    export_button = gr.Button("Export Bookmarks")
                    # Define CheckboxGroups and assign to variables
                    delete_checkbox = gr.CheckboxGroup(label="Select Bookmarks to Delete", choices=[])
                    edit_checkbox = gr.CheckboxGroup(label="Select Bookmarks to Edit", choices=[])
                with gr.Column():
                    bookmarks_display = gr.HTML(label="Bookmarks")

            with gr.Row():
                chatbot_input = gr.Textbox(label="Ask about your bookmarks", placeholder="Enter your query here...")
                chatbot_output = gr.Textbox(label="Chatbot Response", interactive=False)

            # Processing File
            process_button.click(
                fn=process_uploaded_file,
                inputs=[file_input, delete_checkbox, edit_checkbox],
                outputs=[process_message, bookmarks_display, delete_checkbox, edit_checkbox]
            )

            # Deleting Bookmarks
            delete_button.click(
                fn=delete_selected_bookmarks,
                inputs=[delete_checkbox, edit_checkbox],
                outputs=[process_message, delete_checkbox, edit_checkbox, bookmarks_display]
            )

            # Editing Categories
            edit_button.click(
                fn=edit_selected_bookmarks_category,
                inputs=[edit_checkbox, category_dropdown, delete_checkbox, edit_checkbox],
                outputs=[process_message, delete_checkbox, edit_checkbox, bookmarks_display]
            )

            # Exporting Bookmarks
            export_button.click(
                fn=export_bookmarks,
                inputs=None,
                outputs=gr.HTML(label="Export")
            )

            # Chatbot
            chatbot_input.submit(
                fn=chatbot_response,
                inputs=chatbot_input,
                outputs=chatbot_output
            )

        logger.info("Launching Gradio app")
        demo.launch(debug=True)
    except Exception as e:
        logger.error(f"Error building the app: {e}")
        print(f"Error building the app: {e}")


if __name__ == "__main__":
    build_app()