# app.py
"""Gradio bookmark manager.

Parses a browser bookmarks HTML export, fetches each URL to collect its
status/ETag/description, assigns a keyword-based category, indexes the
summaries in FAISS and exposes upload / chat / manage tabs in a Gradio UI.
"""

import asyncio
import html
import re

import aiohttp
import faiss
import gradio as gr
import numpy as np
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer

# Initialize models and variables
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
faiss_index = None
bookmarks = []
fetch_cache = {}

# Total time budget (seconds) for fetching a single bookmark URL.
FETCH_TIMEOUT_SECONDS = 5

# Number of nearest bookmarks returned for a chat query.
TOP_K_MATCHES = 5

# Define the categories
CATEGORIES = [
    "Social Media",
    "News and Media",
    "Education and Learning",
    "Entertainment",
    "Shopping and E-commerce",
    "Finance and Banking",
    "Technology",
    "Health and Fitness",
    "Travel and Tourism",
    "Food and Recipes",
    "Sports",
    "Arts and Culture",
    "Government and Politics",
    "Business and Economy",
    "Science and Research",
    "Personal Blogs and Journals",
    "Job Search and Careers",
    "Music and Audio",
    "Videos and Movies",
    "Reference and Knowledge Bases",
]

# Keywords associated with each category. Insertion order matters: the first
# category with a matching keyword wins in assign_category().
_CATEGORY_KEYWORDS = {
    "Social Media": ["social media", "networking", "friends", "connect", "posts", "profile"],
    "News and Media": ["news", "journalism", "media", "headlines", "breaking news"],
    "Education and Learning": ["education", "learning", "courses", "tutorial", "university", "academy", "study"],
    "Entertainment": ["entertainment", "movies", "tv shows", "games", "comics", "fun"],
    "Shopping and E-commerce": ["shopping", "e-commerce", "buy", "sell", "marketplace", "deals", "store"],
    "Finance and Banking": ["finance", "banking", "investment", "money", "economy", "stock", "trading"],
    "Technology": ["technology", "tech", "gadgets", "software", "computers", "innovation"],
    "Health and Fitness": ["health", "fitness", "medical", "wellness", "exercise", "diet"],
    "Travel and Tourism": ["travel", "tourism", "destinations", "hotels", "flights", "vacation"],
    "Food and Recipes": ["food", "recipes", "cooking", "cuisine", "restaurant", "dining"],
    "Sports": ["sports", "scores", "teams", "athletics", "matches", "leagues"],
    "Arts and Culture": ["arts", "culture", "museum", "gallery", "exhibition", "artistic"],
    "Government and Politics": ["government", "politics", "policy", "election", "public service"],
    "Business and Economy": ["business", "corporate", "industry", "economy", "markets"],
    "Science and Research": ["science", "research", "experiment", "laboratory", "study", "scientific"],
    "Personal Blogs and Journals": ["blog", "journal", "personal", "diary", "thoughts", "opinions"],
    "Job Search and Careers": ["jobs", "careers", "recruitment", "resume", "employment", "hiring"],
    "Music and Audio": ["music", "audio", "songs", "albums", "artists", "bands"],
    "Videos and Movies": ["video", "movies", "film", "clips", "trailers", "cinema"],
    "Reference and Knowledge Bases": ["reference", "encyclopedia", "dictionary", "wiki", "knowledge", "information"],
}

# One precompiled whole-word alternation per category, built once at import
# instead of re-creating the table and patterns on every assign_category() call.
_CATEGORY_PATTERNS = {
    category: re.compile(r'\b(?:' + '|'.join(re.escape(word) for word in keywords) + r')\b')
    for category, keywords in _CATEGORY_KEYWORDS.items()
}


def parse_bookmarks(file_content):
    """Extract bookmarks from the HTML text of a bookmarks export.

    Args:
        file_content: HTML markup as a string.

    Returns:
        A list of ``{'url': ..., 'title': ...}`` dicts, one per ``<a>`` tag
        that has both an ``href`` and non-empty link text.
    """
    soup = BeautifulSoup(file_content, 'html.parser')
    extracted_bookmarks = []
    for link in soup.find_all('a'):
        url = link.get('href')
        title = link.text.strip()
        if url and title:
            extracted_bookmarks.append({'url': url, 'title': title})
    return extracted_bookmarks


def _extract_description(page_html):
    """Return the Open Graph description of a page, else its meta description, else ''."""
    soup = BeautifulSoup(page_html, 'html.parser')
    og_description = soup.find('meta', attrs={'property': 'og:description'})
    if og_description and og_description.get('content'):
        return og_description.get('content')
    meta_description = soup.find('meta', attrs={'name': 'description'})
    if meta_description and meta_description.get('content'):
        return meta_description.get('content')
    return ''


async def fetch_url_info(session, bookmark):
    """Fetch a bookmark's URL and record its status on the bookmark dict.

    Sets ``etag``, ``status_code``, ``dead_link`` and ``description`` on
    *bookmark* (in place). Results are memoised in ``fetch_cache`` keyed by
    URL, so a URL is requested at most once per completed fetch.

    Args:
        session: An ``aiohttp.ClientSession`` used to issue the request.
        bookmark: Dict with at least a ``'url'`` key.

    Returns:
        The same *bookmark* dict, updated.
    """
    url = bookmark['url']
    cached = fetch_cache.get(url)
    if cached is not None:
        bookmark.update(cached)
        return bookmark

    timeout = aiohttp.ClientTimeout(total=FETCH_TIMEOUT_SECONDS)
    try:
        async with session.get(url, timeout=timeout) as response:
            bookmark['etag'] = response.headers.get('ETag', 'N/A')
            bookmark['status_code'] = response.status
            if response.status >= 400:
                bookmark['dead_link'] = True
                bookmark['description'] = ''
            else:
                bookmark['dead_link'] = False
                content = await response.text()
                bookmark['description'] = _extract_description(content)
    except Exception:
        # Deliberate best-effort: any failure (bad scheme, DNS error, timeout,
        # undecodable body, ...) marks the bookmark as dead instead of
        # aborting the whole batch.
        bookmark['dead_link'] = True
        bookmark['etag'] = 'N/A'
        bookmark['status_code'] = 'N/A'
        bookmark['description'] = ''

    # Cache only after the fetch finished (successfully or as a handled
    # failure); a cancelled task must not leave a half-filled cache entry.
    fetch_cache[url] = {
        'etag': bookmark.get('etag'),
        'status_code': bookmark.get('status_code'),
        'dead_link': bookmark.get('dead_link'),
        'description': bookmark.get('description'),
    }
    return bookmark


async def process_bookmarks_async(bookmarks):
    """Fetch URL info for every bookmark concurrently over one shared session.

    Args:
        bookmarks: Iterable of bookmark dicts; each is updated in place.
    """
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(
            *(fetch_url_info(session, bookmark) for bookmark in bookmarks)
        )


def generate_summary(bookmark):
    """Set ``bookmark['summary']`` to its description, else title, else a placeholder.

    Returns:
        The same *bookmark* dict, updated in place.
    """
    bookmark['summary'] = (
        bookmark.get('description', '')
        or bookmark.get('title', '')
        or 'No summary available.'
    )
    return bookmark


def assign_category(bookmark):
    """Set ``bookmark['category']`` from whole-word keyword matches in its summary.

    Categories are tried in the order of ``_CATEGORY_KEYWORDS``; the first one
    with a keyword found in the lower-cased summary wins. When nothing
    matches the category is ``'Uncategorized'``.

    Returns:
        The same *bookmark* dict, updated in place.
    """
    summary = bookmark.get('summary', '').lower()
    bookmark['category'] = next(
        (
            category
            for category, pattern in _CATEGORY_PATTERNS.items()
            if pattern.search(summary)
        ),
        'Uncategorized',
    )
    return bookmark


def vectorize_and_index(bookmarks):
    """Embed every bookmark summary and build an L2 FAISS index over them.

    Args:
        bookmarks: Non-empty list of bookmark dicts that each have a
            ``'summary'`` key.

    Returns:
        ``(index, embeddings)`` where ``embeddings`` is the 2-D array that
        was added to ``index``; row *i* corresponds to ``bookmarks[i]``.
    """
    summaries = [bookmark['summary'] for bookmark in bookmarks]
    # FAISS expects float32 input; make that explicit rather than relying on
    # the encoder's output dtype.
    embeddings = np.asarray(embedding_model.encode(summaries), dtype=np.float32)
    dimension = embeddings.shape[1]
    faiss_idx = faiss.IndexFlatL2(dimension)
    faiss_idx.add(embeddings)
    return faiss_idx, embeddings


def display_bookmarks():
    """Render the global bookmark list as HTML cards.

    All bookmark fields come from an uploaded file or from fetched pages, so
    they are HTML-escaped before being interpolated into markup.

    Returns:
        A string of concatenated card markup ('' when there are no bookmarks).
    """
    cards = []
    for index, bookmark in enumerate(bookmarks, start=1):  # Start index at 1
        is_dead = bool(bookmark.get('dead_link'))
        status = "Dead Link" if is_dead else "Active"
        # Apply inline styles for dead links
        card_style = ""  # No background color
        text_style = "color: #D32F2F;" if is_dead else ""  # Red text when dead

        raw_url = str(bookmark['url'])
        title = html.escape(str(bookmark['title']))
        url = html.escape(raw_url)
        etag = html.escape(str(bookmark.get('etag', 'N/A')))
        summary = html.escape(str(bookmark.get('summary', '')))
        category = html.escape(str(bookmark.get('category', 'Uncategorized')))

        # Only http(s) URLs become clickable; other schemes (e.g. javascript:
        # bookmarklets) are shown as plain text.
        if raw_url.lower().startswith(('http://', 'https://')):
            url_html = (
                f'<a href="{url}" target="_blank" rel="noopener noreferrer" '
                f'style="{text_style}">{url}</a>'
            )
        else:
            url_html = url

        # NOTE(review): the class names below are assumed to match app.css --
        # confirm against the stylesheet.
        cards.append(f'''
<div class="card" style="{card_style}">
    <div class="card-content">
        <h3 style="{text_style}">{index}. {title}</h3>
        <p style="{text_style}"><strong>Category:</strong> {category}</p>
        <p style="{text_style}"><strong>URL:</strong> {url_html}</p>
        <p style="{text_style}"><strong>Status:</strong> {status}</p>
        <p style="{text_style}"><strong>ETag:</strong> {etag}</p>
        <p style="{text_style}"><strong>Summary:</strong> {summary}</p>
    </div>
</div>
''')
    return ''.join(cards)


def process_uploaded_file(file):
    """Parse, fetch, categorise and index an uploaded bookmarks file.

    Replaces the global ``bookmarks`` list and ``faiss_index``.

    Args:
        file: Raw bytes of the uploaded HTML file, or ``None``.

    Returns:
        ``(message, bookmarks_html)``; the HTML is '' on any failure.
    """
    global bookmarks, faiss_index

    if file is None:
        return "Please upload a bookmarks HTML file.", ''

    try:
        file_content = file.decode('utf-8')
    except UnicodeDecodeError:
        return "Error decoding the file. Please ensure it's a valid HTML file.", ''

    bookmarks = parse_bookmarks(file_content)
    if not bookmarks:
        return "No bookmarks found in the uploaded file.", ''

    # Asynchronously fetch bookmark info
    asyncio.run(process_bookmarks_async(bookmarks))

    # Generate summaries and assign categories
    for bookmark in bookmarks:
        generate_summary(bookmark)
        assign_category(bookmark)

    faiss_index, _ = vectorize_and_index(bookmarks)
    message = f"Successfully processed {len(bookmarks)} bookmarks."
    return message, display_bookmarks()


def chatbot_response(user_query):
    """Return the bookmarks whose summaries are closest to *user_query*.

    Args:
        user_query: Free-text question from the user.

    Returns:
        A plain-text listing of up to ``TOP_K_MATCHES`` bookmarks, or a
        message explaining why no search could be run.
    """
    if faiss_index is None or not bookmarks:
        return "No bookmarks available. Please upload and process your bookmarks first."
    if not user_query or not user_query.strip():
        return "Please enter a question about your bookmarks."

    # Vectorize user query
    user_embedding = np.asarray(embedding_model.encode([user_query]), dtype=np.float32)
    # Never ask for more neighbours than there are vectors in the index.
    k = min(TOP_K_MATCHES, len(bookmarks))
    _, neighbours = faiss_index.search(user_embedding, k)

    # Generate response
    entries = []
    for idx in neighbours[0]:
        idx = int(idx)
        # FAISS uses -1 for "no neighbour"; a bare `idx < len(...)` check
        # would let -1 through and silently pick the last bookmark.
        if not 0 <= idx < len(bookmarks):
            continue
        bookmark = bookmarks[idx]
        entries.append(
            f"{idx + 1}. Title: {bookmark['title']}\n"
            f"URL: {bookmark['url']}\n"
            f"Category: {bookmark.get('category', 'Uncategorized')}\n"
            f"Summary: {bookmark['summary']}"
        )
    return "\n\n".join(entries)


def _to_list_index(bookmark_idx):
    """Convert a 1-based UI index to a 0-based list index, or None if invalid."""
    try:
        position = int(bookmark_idx) - 1
    except (TypeError, ValueError):
        return None
    if 0 <= position < len(bookmarks):
        return position
    return None


def edit_bookmark(bookmark_idx, new_title, new_url):
    """Update a bookmark's title and/or URL, then re-fetch and re-index.

    Args:
        bookmark_idx: 1-based position of the bookmark in the list.
        new_title: Replacement title; blank keeps the current title.
        new_url: Replacement URL; blank keeps the current URL.

    Returns:
        ``(message, bookmarks_html)`` for the manage tab.
    """
    global faiss_index
    try:
        position = _to_list_index(bookmark_idx)
        if position is None:
            return "Invalid bookmark index.", display_bookmarks()

        new_title = (new_title or '').strip()
        new_url = (new_url or '').strip()
        if not new_title and not new_url:
            return "Please provide a new title and/or URL.", display_bookmarks()

        bookmark = bookmarks[position]
        # A blank field means "leave unchanged" rather than "erase".
        if new_title:
            bookmark['title'] = new_title
        if new_url:
            bookmark['url'] = new_url

        # Re-fetch bookmark info
        asyncio.run(process_bookmarks_async([bookmark]))
        generate_summary(bookmark)
        assign_category(bookmark)

        # Rebuild the FAISS index
        faiss_index, _ = vectorize_and_index(bookmarks)
        return "Bookmark updated successfully.", display_bookmarks()
    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising.
        return f"Error: {str(e)}", display_bookmarks()


def delete_bookmark(bookmark_idx):
    """Remove a bookmark and rebuild (or clear) the FAISS index.

    Args:
        bookmark_idx: 1-based position of the bookmark in the list.

    Returns:
        ``(message, bookmarks_html)`` for the manage tab.
    """
    global faiss_index
    try:
        position = _to_list_index(bookmark_idx)
        if position is None:
            return "Invalid bookmark index.", display_bookmarks()

        bookmarks.pop(position)

        # Rebuild the FAISS index
        if bookmarks:
            faiss_index, _ = vectorize_and_index(bookmarks)
        else:
            faiss_index = None
        return "Bookmark deleted successfully.", display_bookmarks()
    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising.
        return f"Error: {str(e)}", display_bookmarks()


def build_app():
    """Build the three-tab Gradio interface and launch it."""
    with gr.Blocks(css="app.css") as demo:
        gr.Markdown("# Bookmark Manager App")

        with gr.Tab("Upload and Process Bookmarks"):
            upload = gr.File(label="Upload Bookmarks HTML File", type='binary')
            process_button = gr.Button("Process Bookmarks")
            output_text = gr.Textbox(label="Output")
            bookmark_display = gr.HTML(label="Bookmarks")

            process_button.click(
                process_uploaded_file,
                inputs=upload,
                outputs=[output_text, bookmark_display],
            )

        with gr.Tab("Chat with Bookmarks"):
            user_input = gr.Textbox(label="Ask about your bookmarks")
            chat_output = gr.Textbox(label="Chatbot Response")
            chat_button = gr.Button("Send")

            chat_button.click(
                chatbot_response,
                inputs=user_input,
                outputs=chat_output,
            )

        with gr.Tab("Manage Bookmarks"):
            manage_output = gr.Textbox(label="Manage Output")
            bookmark_display_manage = gr.HTML(label="Bookmarks")
            refresh_button = gr.Button("Refresh Bookmark List")

            with gr.Row():
                index_input = gr.Number(label="Bookmark Index (Starting from 1)", precision=0)
                new_title_input = gr.Textbox(label="New Title")
                new_url_input = gr.Textbox(label="New URL")

            edit_button = gr.Button("Edit Bookmark")
            delete_button = gr.Button("Delete Bookmark")

            refresh_button.click(
                display_bookmarks,
                inputs=None,
                outputs=bookmark_display_manage,
            )

            edit_button.click(
                edit_bookmark,
                inputs=[index_input, new_title_input, new_url_input],
                outputs=[manage_output, bookmark_display_manage],
            )

            delete_button.click(
                delete_bookmark,
                inputs=index_input,
                outputs=[manage_output, bookmark_display_manage],
            )

            # Initial load of the bookmarks display
            bookmark_display_manage.value = display_bookmarks()

    demo.launch()


if __name__ == "__main__":
    build_app()