# app.py
"""Gradio bookmark-manager app: parse a browser bookmarks HTML export, fetch page
metadata, summarize and categorize each bookmark, and index the summaries with
FAISS for semantic search, editing, deletion, and export."""

import gradio as gr
from bs4 import BeautifulSoup
import requests
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import asyncio
import aiohttp
import re
import pandas as pd

# Initialize models and variables
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
faiss_index = None
bookmarks = []
fetch_cache = {}

# Define the categories
CATEGORIES = [
    "Social Media",
    "News and Media",
    "Education and Learning",
    "Entertainment",
    "Shopping and E-commerce",
    "Finance and Banking",
    "Technology",
    "Health and Fitness",
    "Travel and Tourism",
    "Food and Recipes",
    "Sports",
    "Arts and Culture",
    "Government and Politics",
    "Business and Economy",
    "Science and Research",
    "Personal Blogs and Journals",
    "Job Search and Careers",
    "Music and Audio",
    "Videos and Movies",
    "Reference and Knowledge Bases",
    "Dead Link",
    "Uncategorized",
]


def parse_bookmarks(file_content):
    soup = BeautifulSoup(file_content, 'html.parser')
    extracted_bookmarks = []
    for link in soup.find_all('a'):
        url = link.get('href')
        title = link.text.strip()
        if url and title:
            extracted_bookmarks.append({'url': url, 'title': title})
    return extracted_bookmarks


async def fetch_url_info(session, bookmark):
    url = bookmark['url']
    if url in fetch_cache:
        bookmark.update(fetch_cache[url])
        return bookmark
    try:
        async with session.get(url, timeout=5) as response:
            bookmark['etag'] = response.headers.get('ETag', 'N/A')
            bookmark['status_code'] = response.status
            if response.status >= 400:
                bookmark['dead_link'] = True
                bookmark['description'] = ''
            else:
                bookmark['dead_link'] = False
                content = await response.text()
                soup = BeautifulSoup(content, 'html.parser')
                # Extract meta description or Open Graph description
                meta_description = soup.find('meta', attrs={'name': 'description'})
                og_description = soup.find('meta', attrs={'property': 'og:description'})
                if og_description and og_description.get('content'):
                    description = og_description.get('content')
                elif meta_description and meta_description.get('content'):
                    description = meta_description.get('content')
                else:
                    description = ''
                bookmark['description'] = description
    except Exception:
        bookmark['dead_link'] = True
        bookmark['etag'] = 'N/A'
        bookmark['status_code'] = 'N/A'
        bookmark['description'] = ''
    finally:
        fetch_cache[url] = {
            'etag': bookmark.get('etag'),
            'status_code': bookmark.get('status_code'),
            'dead_link': bookmark.get('dead_link'),
            'description': bookmark.get('description'),
        }
    return bookmark


async def process_bookmarks_async(bookmarks):
    async with aiohttp.ClientSession() as session:
        tasks = []
        for bookmark in bookmarks:
            task = asyncio.ensure_future(fetch_url_info(session, bookmark))
            tasks.append(task)
        await asyncio.gather(*tasks)


def generate_summary(bookmark):
    description = bookmark.get('description', '')
    if description:
        bookmark['summary'] = description
    else:
        title = bookmark.get('title', '')
        if title:
            bookmark['summary'] = title
        else:
            bookmark['summary'] = 'No summary available.'
    return bookmark


def assign_category(bookmark):
    if bookmark.get('dead_link'):
        bookmark['category'] = 'Dead Link'
        return bookmark

    summary = bookmark.get('summary', '').lower()
    assigned_category = 'Uncategorized'

    # Keywords associated with each category
    category_keywords = {
        "Social Media": ["social media", "networking", "friends", "connect", "posts", "profile"],
        "News and Media": ["news", "journalism", "media", "headlines", "breaking news"],
        "Education and Learning": ["education", "learning", "courses", "tutorial", "university", "academy", "study"],
        "Entertainment": ["entertainment", "movies", "tv shows", "games", "comics", "fun"],
        "Shopping and E-commerce": ["shopping", "e-commerce", "buy", "sell", "marketplace", "deals", "store"],
        "Finance and Banking": ["finance", "banking", "investment", "money", "economy", "stock", "trading"],
        "Technology": ["technology", "tech", "gadgets", "software", "computers", "innovation"],
        "Health and Fitness": ["health", "fitness", "medical", "wellness", "exercise", "diet"],
        "Travel and Tourism": ["travel", "tourism", "destinations", "hotels", "flights", "vacation"],
        "Food and Recipes": ["food", "recipes", "cooking", "cuisine", "restaurant", "dining"],
        "Sports": ["sports", "scores", "teams", "athletics", "matches", "leagues"],
        "Arts and Culture": ["arts", "culture", "museum", "gallery", "exhibition", "artistic"],
        "Government and Politics": ["government", "politics", "policy", "election", "public service"],
        "Business and Economy": ["business", "corporate", "industry", "economy", "markets"],
        "Science and Research": ["science", "research", "experiment", "laboratory", "study", "scientific"],
        "Personal Blogs and Journals": ["blog", "journal", "personal", "diary", "thoughts", "opinions"],
        "Job Search and Careers": ["jobs", "careers", "recruitment", "resume", "employment", "hiring"],
        "Music and Audio": ["music", "audio", "songs", "albums", "artists", "bands"],
        "Videos and Movies": ["video", "movies", "film", "clips", "trailers", "cinema"],
        "Reference and Knowledge Bases": ["reference", "encyclopedia", "dictionary", "wiki", "knowledge", "information"],
    }

    for category, keywords in category_keywords.items():
        for keyword in keywords:
            if re.search(r'\b' + re.escape(keyword) + r'\b', summary):
                assigned_category = category
                break
        if assigned_category != 'Uncategorized':
            break

    bookmark['category'] = assigned_category
    return bookmark


def vectorize_and_index(bookmarks):
    summaries = [bookmark['summary'] for bookmark in bookmarks]
    embeddings = embedding_model.encode(summaries)
    dimension = embeddings.shape[1]
    faiss_idx = faiss.IndexFlatL2(dimension)
    faiss_idx.add(np.array(embeddings))
    return faiss_idx, embeddings


def bookmarks_to_dataframe():
    data = []
    for i, bookmark in enumerate(bookmarks):
        index = i + 1
        status = "Dead Link" if bookmark.get('dead_link') else "Active"
        data.append({
            'Index': index,
            'Title': bookmark['title'],
            'URL': bookmark['url'],
            'Category': bookmark.get('category', 'Uncategorized'),
            'Status': status,
            'Summary': bookmark.get('summary', ''),
        })
    df = pd.DataFrame(data)
    return df


def process_uploaded_file(file):
    global bookmarks, faiss_index

    if file is None:
        return "Please upload a bookmarks HTML file.", pd.DataFrame()

    try:
        file_content = file.decode('utf-8')
    except UnicodeDecodeError:
        return "Error decoding the file. Please ensure it's a valid HTML file.", pd.DataFrame()

    bookmarks = parse_bookmarks(file_content)
    if not bookmarks:
        return "No bookmarks found in the uploaded file.", pd.DataFrame()

    # Asynchronously fetch bookmark info
    asyncio.run(process_bookmarks_async(bookmarks))

    # Generate summaries and assign categories
    for bookmark in bookmarks:
        generate_summary(bookmark)
        assign_category(bookmark)

    faiss_index, embeddings = vectorize_and_index(bookmarks)

    message = f"Successfully processed {len(bookmarks)} bookmarks."
    bookmark_df = bookmarks_to_dataframe()
    return message, bookmark_df


def chatbot_response(user_query):
    if faiss_index is None or not bookmarks:
        return "No bookmarks available. Please upload and process your bookmarks first."

    # Vectorize user query
    user_embedding = embedding_model.encode([user_query])
    D, I = faiss_index.search(np.array(user_embedding), k=5)  # Retrieve top 5 matches

    # Generate response
    response = ""
    for idx in I[0]:
        # FAISS returns -1 for missing neighbors when fewer than k vectors are indexed
        if 0 <= idx < len(bookmarks):
            bookmark = bookmarks[idx]
            index = idx + 1  # Start index at 1
            response += f"{index}. Title: {bookmark['title']}\nURL: {bookmark['url']}\nCategory: {bookmark.get('category', 'Uncategorized')}\nSummary: {bookmark['summary']}\n\n"
    return response.strip()


def edit_bookmark(row):
    global faiss_index
    try:
        bookmark_idx = int(row['Index']) - 1  # Adjust index to match list (starting at 0)
        if bookmark_idx < 0 or bookmark_idx >= len(bookmarks):
            return "Invalid bookmark index.", bookmarks_to_dataframe()

        bookmarks[bookmark_idx]['title'] = row['Title']
        bookmarks[bookmark_idx]['url'] = row['URL']
        bookmarks[bookmark_idx]['category'] = row['Category']

        # Re-fetch bookmark info
        asyncio.run(process_bookmarks_async([bookmarks[bookmark_idx]]))
        generate_summary(bookmarks[bookmark_idx])

        # Rebuild the FAISS index
        faiss_index, embeddings = vectorize_and_index(bookmarks)

        message = "Bookmark updated successfully."
        updated_df = bookmarks_to_dataframe()
        return message, updated_df
    except Exception as e:
        return f"Error: {str(e)}", bookmarks_to_dataframe()


def delete_bookmarks(selected_indices):
    global faiss_index
    try:
        indices = sorted([int(idx) - 1 for idx in selected_indices], reverse=True)
        for idx in indices:
            if 0 <= idx < len(bookmarks):
                bookmarks.pop(idx)

        # Rebuild the FAISS index
        if bookmarks:
            faiss_index, embeddings = vectorize_and_index(bookmarks)
        else:
            faiss_index = None

        message = "Selected bookmarks deleted successfully."
        updated_df = bookmarks_to_dataframe()
        return message, updated_df
    except Exception as e:
        return f"Error: {str(e)}", bookmarks_to_dataframe()


def export_bookmarks():
    if not bookmarks:
        return None

    # Create an HTML content similar to the imported bookmarks file
    soup = BeautifulSoup("