# app.py

import gradio as gr
from bs4 import BeautifulSoup
import requests
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import asyncio
import aiohttp
import re
import base64
import logging
import os

# Import Hugging Face transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Set up logging
logging.basicConfig(filename='app.log', level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(name)s %(message)s')
logger = logging.getLogger(__name__)

# Initialize models and variables
logger.info("Initializing models and variables")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
faiss_index = None
bookmarks = []
fetch_cache = {}
# Define the categories
CATEGORIES = [
    "Social Media",
    "News and Media",
    "Education and Learning",
    "Entertainment",
    "Shopping and E-commerce",
    "Finance and Banking",
    "Technology",
    "Health and Fitness",
    "Travel and Tourism",
    "Food and Recipes",
    "Sports",
    "Arts and Culture",
    "Government and Politics",
    "Business and Economy",
    "Science and Research",
    "Personal Blogs and Journals",
    "Job Search and Careers",
    "Music and Audio",
    "Videos and Movies",
    "Reference and Knowledge Bases",
    "Dead Link",
    "Uncategorized",
]
# Load FLAN-T5 model and tokenizer
logger.info("Loading FLAN-T5 model and tokenizer")
tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-small')
model = AutoModelForSeq2SeqLM.from_pretrained('google/flan-t5-small')
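# The parser below expects the standard Netscape-style bookmark export that
# browsers produce ("Export bookmarks" in Chrome/Firefox). An illustrative entry:
#   <DT><A HREF="https://example.com" ADD_DATE="1700000000">Example Site</A>
# Only the href attribute and the link text are used; other attributes are ignored.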
# Function to parse bookmarks from HTML
def parse_bookmarks(file_content):
    logger.info("Parsing bookmarks")
    try:
        soup = BeautifulSoup(file_content, 'html.parser')
        extracted_bookmarks = []
        for link in soup.find_all('a'):
            url = link.get('href')
            title = link.text.strip()
            if url and title:
                extracted_bookmarks.append({'url': url, 'title': title})
        logger.info(f"Extracted {len(extracted_bookmarks)} bookmarks")
        return extracted_bookmarks
    except Exception as e:
        logger.error("Error parsing bookmarks: %s", e)
        raise
# Asynchronous function to fetch URL info
async def fetch_url_info(session, bookmark):
    url = bookmark['url']
    if url in fetch_cache:
        bookmark.update(fetch_cache[url])
        return bookmark
    try:
        logger.info(f"Fetching URL info for: {url}")
        async with session.get(url, timeout=5) as response:
            bookmark['etag'] = response.headers.get('ETag', 'N/A')
            bookmark['status_code'] = response.status
            if response.status >= 400:
                bookmark['dead_link'] = True
                bookmark['description'] = ''
                logger.warning(f"Dead link detected: {url} with status {response.status}")
            else:
                bookmark['dead_link'] = False
                content = await response.text()
                soup = BeautifulSoup(content, 'html.parser')
                # Extract meta description or Open Graph description
                meta_description = soup.find('meta', attrs={'name': 'description'})
                og_description = soup.find('meta', attrs={'property': 'og:description'})
                if og_description and og_description.get('content'):
                    description = og_description.get('content')
                elif meta_description and meta_description.get('content'):
                    description = meta_description.get('content')
                else:
                    description = ''
                bookmark['description'] = description
                logger.info(f"Fetched description for {url}")
    except Exception as e:
        bookmark['dead_link'] = True
        bookmark['etag'] = 'N/A'
        bookmark['status_code'] = 'N/A'
        bookmark['description'] = ''
        logger.error(f"Error fetching URL info for {url}: {e}")
    finally:
        fetch_cache[url] = {
            'etag': bookmark.get('etag'),
            'status_code': bookmark.get('status_code'),
            'dead_link': bookmark.get('dead_link'),
            'description': bookmark.get('description'),
        }
    return bookmark
# Asynchronous processing of bookmarks
async def process_bookmarks_async(bookmarks):
    logger.info("Processing bookmarks asynchronously")
    try:
        async with aiohttp.ClientSession() as session:
            tasks = []
            for bookmark in bookmarks:
                task = asyncio.ensure_future(fetch_url_info(session, bookmark))
                tasks.append(task)
            await asyncio.gather(*tasks)
        logger.info("Completed processing bookmarks asynchronously")
    except Exception as e:
        logger.error(f"Error in asynchronous processing of bookmarks: {e}")
        raise
# Generate summary for a bookmark
def generate_summary(bookmark):
    description = bookmark.get('description', '')
    if description:
        bookmark['summary'] = description
    else:
        title = bookmark.get('title', '')
        if title:
            bookmark['summary'] = title
        else:
            bookmark['summary'] = 'No summary available.'
    logger.info(f"Generated summary for bookmark: {bookmark.get('url')}")
    return bookmark
# Assign category to a bookmark
def assign_category(bookmark):
    if bookmark.get('dead_link'):
        bookmark['category'] = 'Dead Link'
        logger.info(f"Assigned category 'Dead Link' to bookmark: {bookmark.get('url')}")
        return bookmark

    summary = bookmark.get('summary', '').lower()
    assigned_category = 'Uncategorized'

    # Keywords associated with each category
    category_keywords = {
        "Social Media": ["social media", "networking", "friends", "connect", "posts", "profile"],
        "News and Media": ["news", "journalism", "media", "headlines", "breaking news"],
        "Education and Learning": ["education", "learning", "courses", "tutorial", "university", "academy", "study"],
        "Entertainment": ["entertainment", "movies", "tv shows", "games", "comics", "fun"],
        "Shopping and E-commerce": ["shopping", "e-commerce", "buy", "sell", "marketplace", "deals", "store"],
        "Finance and Banking": ["finance", "banking", "investment", "money", "economy", "stock", "trading"],
        "Technology": ["technology", "tech", "gadgets", "software", "computers", "innovation"],
        "Health and Fitness": ["health", "fitness", "medical", "wellness", "exercise", "diet"],
        "Travel and Tourism": ["travel", "tourism", "destinations", "hotels", "flights", "vacation"],
        "Food and Recipes": ["food", "recipes", "cooking", "cuisine", "restaurant", "dining"],
        "Sports": ["sports", "scores", "teams", "athletics", "matches", "leagues"],
        "Arts and Culture": ["arts", "culture", "museum", "gallery", "exhibition", "artistic"],
        "Government and Politics": ["government", "politics", "policy", "election", "public service"],
        "Business and Economy": ["business", "corporate", "industry", "economy", "markets"],
        "Science and Research": ["science", "research", "experiment", "laboratory", "study", "scientific"],
        "Personal Blogs and Journals": ["blog", "journal", "personal", "diary", "thoughts", "opinions"],
        "Job Search and Careers": ["jobs", "careers", "recruitment", "resume", "employment", "hiring"],
        "Music and Audio": ["music", "audio", "songs", "albums", "artists", "bands"],
        "Videos and Movies": ["video", "movies", "film", "clips", "trailers", "cinema"],
        "Reference and Knowledge Bases": ["reference", "encyclopedia", "dictionary", "wiki", "knowledge", "information"],
    }

    for category, keywords in category_keywords.items():
        for keyword in keywords:
            if re.search(r'\b' + re.escape(keyword) + r'\b', summary):
                assigned_category = category
                logger.info(f"Assigned category '{assigned_category}' to bookmark: {bookmark.get('url')}")
                break
        if assigned_category != 'Uncategorized':
            break

    bookmark['category'] = assigned_category
    if assigned_category == 'Uncategorized':
        logger.info(f"No matching category found for bookmark: {bookmark.get('url')}")
    return bookmark
# Vectorize summaries and build FAISS index
def vectorize_and_index(bookmarks):
    logger.info("Vectorizing summaries and building FAISS index")
    try:
        summaries = [bookmark['summary'] for bookmark in bookmarks]
        embeddings = embedding_model.encode(summaries)
        dimension = embeddings.shape[1]
        faiss_idx = faiss.IndexFlatL2(dimension)
        faiss_idx.add(np.array(embeddings))
        logger.info("FAISS index built successfully")
        return faiss_idx, embeddings
    except Exception as e:
        logger.error(f"Error in vectorizing and indexing: {e}")
        raise
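# Note: IndexFlatL2 performs exact (brute-force) L2 search, and all-MiniLM-L6-v2
# produces 384-dimensional embeddings. The index is built here but never queried
# elsewhere in this file; an illustrative query (not wired into the UI) would be:
#   query_vec = embedding_model.encode(["machine learning tutorials"])
#   distances, ids = faiss_index.search(np.array(query_vec, dtype='float32'), k=5)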
# Generate HTML display for bookmarks
def display_bookmarks():
    logger.info("Generating HTML display for bookmarks")
    cards = ''
    for i, bookmark in enumerate(bookmarks):
        index = i + 1  # Start index at 1
        status = "Dead Link" if bookmark.get('dead_link') else "Active"
        title = bookmark['title']
        url = bookmark['url']
        etag = bookmark.get('etag', 'N/A')
        summary = bookmark.get('summary', '')
        category = bookmark.get('category', 'Uncategorized')

        # Apply inline styles for dead links
        if bookmark.get('dead_link'):
            card_style = "border: 2px solid #D32F2F;"
            text_style = "color: #D32F2F;"
        else:
            card_style = ""
            text_style = ""

        card_html = f'''
        <div class="card" style="{card_style}">
            <div class="card-content">
                <h3 style="{text_style}">{index}. {title}</h3>
                <p style="{text_style}"><strong>Category:</strong> {category}</p>
                <p style="{text_style}"><strong>URL:</strong> <a href="{url}" target="_blank" style="{text_style}">{url}</a></p>
                <p style="{text_style}"><strong>Status:</strong> {status}</p>
                <p style="{text_style}"><strong>ETag:</strong> {etag}</p>
                <p style="{text_style}"><strong>Summary:</strong> {summary}</p>
            </div>
        </div>
        '''
        cards += card_html
    logger.info("HTML display generated")
    return cards
# Process the uploaded file
def process_uploaded_file(file):
    global bookmarks, faiss_index
    logger.info("Processing uploaded file")
    if file is None:
        logger.warning("No file uploaded")
        return "Please upload a bookmarks HTML file.", ''
    try:
        file_content = file.decode('utf-8')
    except UnicodeDecodeError as e:
        logger.error(f"Error decoding the file: {e}")
        return "Error decoding the file. Please ensure it's a valid HTML file.", ''
    try:
        bookmarks = parse_bookmarks(file_content)
    except Exception as e:
        logger.error(f"Error parsing bookmarks: {e}")
        return "Error parsing the bookmarks HTML file.", ''
    if not bookmarks:
        logger.warning("No bookmarks found in the uploaded file")
        return "No bookmarks found in the uploaded file.", ''

    # Asynchronously fetch bookmark info
    try:
        asyncio.run(process_bookmarks_async(bookmarks))
    except Exception as e:
        logger.error(f"Error processing bookmarks asynchronously: {e}")
        return "Error processing bookmarks.", ''

    # Generate summaries and assign categories
    for bookmark in bookmarks:
        generate_summary(bookmark)
        assign_category(bookmark)

    try:
        faiss_index, embeddings = vectorize_and_index(bookmarks)
    except Exception as e:
        logger.error(f"Error building FAISS index: {e}")
        return "Error building search index.", ''

    message = f"Successfully processed {len(bookmarks)} bookmarks."
    logger.info(message)
    bookmark_html = display_bookmarks()
    return message, bookmark_html
# Chatbot response using Hugging Face model
def chatbot_response(user_query):
    if not bookmarks:
        logger.warning("No bookmarks available for chatbot")
        return "No bookmarks available. Please upload and process your bookmarks first."
    logger.info(f"Chatbot received query: {user_query}")
    # Prepare the context
    try:
        # Combine bookmark summaries into context
        max_bookmarks = 50  # Adjust as needed
        bookmark_context = ""
        for idx, bookmark in enumerate(bookmarks[:max_bookmarks]):
            bookmark_context += f"{idx+1}. Title: {bookmark['title']}\nSummary: {bookmark['summary']}\n\n"

        # Construct the prompt
        prompt = f"Based on the following bookmarks, answer the user's query.\n\nUser query: {user_query}\n\nBookmarks:\n{bookmark_context}"

        # Generate response. Note: truncation=True with max_length=512 caps the prompt
        # at 512 tokens, so only the first several bookmark summaries actually reach
        # the model.
        inputs = tokenizer(prompt, return_tensors='pt', max_length=512, truncation=True)
        outputs = model.generate(**inputs, max_length=512)
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        logger.info("Chatbot response generated using FLAN-T5 model")
        return generated_text.strip()
    except Exception as e:
        logger.error(f"Error in chatbot response generation: {e}")
        return "Error processing your query."
# Edit a bookmark
def edit_bookmark(bookmark_idx, new_title, new_url, new_category):
    global faiss_index
    try:
        bookmark_idx = int(bookmark_idx) - 1  # Adjust index to match list (starting at 0)
        if bookmark_idx < 0 or bookmark_idx >= len(bookmarks):
            logger.warning(f"Invalid bookmark index for editing: {bookmark_idx + 1}")
            return "Invalid bookmark index.", display_bookmarks()
        logger.info(f"Editing bookmark at index {bookmark_idx + 1}")
        bookmarks[bookmark_idx]['title'] = new_title
        bookmarks[bookmark_idx]['url'] = new_url
        bookmarks[bookmark_idx]['category'] = new_category
        # Re-fetch bookmark info
        asyncio.run(process_bookmarks_async([bookmarks[bookmark_idx]]))
        generate_summary(bookmarks[bookmark_idx])
        # Rebuild the FAISS index
        faiss_index, embeddings = vectorize_and_index(bookmarks)
        message = "Bookmark updated successfully."
        logger.info(message)
        updated_html = display_bookmarks()
        return message, updated_html
    except Exception as e:
        logger.error(f"Error editing bookmark: {e}")
        return f"Error: {str(e)}", display_bookmarks()
# Delete selected bookmarks
def delete_bookmarks(indices_str):
    global faiss_index
    try:
        indices = [int(idx.strip()) - 1 for idx in indices_str.split(',') if idx.strip().isdigit()]
        indices = sorted(indices, reverse=True)
        logger.info(f"Deleting bookmarks at indices: {indices}")
        for idx in indices:
            if 0 <= idx < len(bookmarks):
                logger.info(f"Deleting bookmark at index {idx + 1}")
                bookmarks.pop(idx)
        # Rebuild the FAISS index
        if bookmarks:
            faiss_index, embeddings = vectorize_and_index(bookmarks)
        else:
            faiss_index = None
        message = "Selected bookmarks deleted successfully."
        logger.info(message)
        updated_html = display_bookmarks()
        return message, updated_html
    except Exception as e:
        logger.error(f"Error deleting bookmarks: {e}")
        return f"Error: {str(e)}", display_bookmarks()
# Export bookmarks to HTML
def export_bookmarks():
    if not bookmarks:
        logger.warning("No bookmarks to export")
        return None
    try:
        logger.info("Exporting bookmarks to HTML")
        # Create HTML content similar to the imported bookmarks file
        soup = BeautifulSoup("<!DOCTYPE NETSCAPE-Bookmark-file-1><Title>Bookmarks</Title><H1>Bookmarks</H1>", 'html.parser')
        dl = soup.new_tag('DL')
        for bookmark in bookmarks:
            dt = soup.new_tag('DT')
            a = soup.new_tag('A', href=bookmark['url'])
            a.string = bookmark['title']
            dt.append(a)
            dl.append(dt)
        soup.append(dl)
        html_content = str(soup)
        # Encode the HTML content to base64 for download
        b64 = base64.b64encode(html_content.encode()).decode()
        href = f'data:text/html;base64,{b64}'
        logger.info("Bookmarks exported successfully")
        return href
    except Exception as e:
        logger.error(f"Error exporting bookmarks: {e}")
        return None
# Build the Gradio app
def build_app():
    logger.info("Building Gradio app")
    with gr.Blocks(css="app.css") as demo:
        gr.Markdown("<h1>Bookmark Manager App</h1>")

        with gr.Tab("Upload and Process Bookmarks"):
            upload = gr.File(label="Upload Bookmarks HTML File", type='binary')
            process_button = gr.Button("Process Bookmarks")
            output_text = gr.Textbox(label="Output")
            bookmark_display = gr.HTML(label="Bookmarks")

            def update_bookmark_display(file):
                return process_uploaded_file(file)

            process_button.click(
                update_bookmark_display,
                inputs=upload,
                outputs=[output_text, bookmark_display]
            )

        with gr.Tab("Chat with Bookmarks"):
            user_input = gr.Textbox(label="Ask about your bookmarks")
            chat_output = gr.Textbox(label="Chatbot Response")
            chat_button = gr.Button("Send")

            chat_button.click(
                chatbot_response,
                inputs=user_input,
                outputs=chat_output
            )

        with gr.Tab("Manage Bookmarks"):
            manage_output = gr.Textbox(label="Manage Output")
            bookmark_display_manage = gr.HTML(label="Bookmarks")
            refresh_button = gr.Button("Refresh Bookmark List")
            indices_input = gr.Textbox(label="Bookmark Indices to Delete (comma-separated)")
            delete_button = gr.Button("Delete Selected Bookmarks")
            export_button = gr.Button("Export Bookmarks")
            download_link = gr.HTML(label="Download Exported Bookmarks")

            with gr.Row():
                index_input = gr.Number(label="Bookmark Index (Starting from 1)", precision=0)
                new_title_input = gr.Textbox(label="New Title")
                new_url_input = gr.Textbox(label="New URL")
                new_category_input = gr.Dropdown(label="New Category", choices=CATEGORIES)
            edit_button = gr.Button("Edit Bookmark")

            def update_manage_display():
                return display_bookmarks()

            refresh_button.click(
                update_manage_display,
                inputs=None,
                outputs=bookmark_display_manage
            )
            edit_button.click(
                edit_bookmark,
                inputs=[index_input, new_title_input, new_url_input, new_category_input],
                outputs=[manage_output, bookmark_display_manage]
            )
            delete_button.click(
                delete_bookmarks,
                inputs=indices_input,
                outputs=[manage_output, bookmark_display_manage]
            )

            def provide_download_link():
                href = export_bookmarks()
                if href:
                    return f'<a href="{href}" download="bookmarks.html">Download Exported Bookmarks</a>'
                else:
                    return "No bookmarks to export."

            export_button.click(
                provide_download_link,
                inputs=None,
                outputs=download_link
            )

            # Initial load of the bookmarks display
            bookmark_display_manage.value = update_manage_display()

    logger.info("Launching Gradio app")
    demo.launch()
if __name__ == "__main__":
    build_app()
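# Assumed dependencies for this Space, inferred from the imports above (package
# names are assumptions; the original code does not pin versions):
#   gradio, beautifulsoup4, requests, sentence-transformers, faiss-cpu,
#   numpy, aiohttp, transformers, torch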