# app.py import gradio as gr from bs4 import BeautifulSoup import requests from sentence_transformers import SentenceTransformer import faiss import numpy as np import asyncio import aiohttp import re # Initialize models and variables embedding_model = SentenceTransformer('all-MiniLM-L6-v2') faiss_index = None bookmarks = [] fetch_cache = {} # Define the categories CATEGORIES = [ "Social Media", "News and Media", "Education and Learning", "Entertainment", "Shopping and E-commerce", "Finance and Banking", "Technology", "Health and Fitness", "Travel and Tourism", "Food and Recipes", "Sports", "Arts and Culture", "Government and Politics", "Business and Economy", "Science and Research", "Personal Blogs and Journals", "Job Search and Careers", "Music and Audio", "Videos and Movies", "Reference and Knowledge Bases", ] def parse_bookmarks(file_content): soup = BeautifulSoup(file_content, 'html.parser') extracted_bookmarks = [] for link in soup.find_all('a'): url = link.get('href') title = link.text.strip() if url and title: extracted_bookmarks.append({'url': url, 'title': title}) return extracted_bookmarks async def fetch_url_info(session, bookmark): url = bookmark['url'] if url in fetch_cache: bookmark.update(fetch_cache[url]) return bookmark try: async with session.get(url, timeout=5) as response: bookmark['etag'] = response.headers.get('ETag', 'N/A') bookmark['status_code'] = response.status if response.status >= 400: bookmark['dead_link'] = True bookmark['description'] = '' else: bookmark['dead_link'] = False content = await response.text() soup = BeautifulSoup(content, 'html.parser') # Extract meta description or Open Graph description meta_description = soup.find('meta', attrs={'name': 'description'}) og_description = soup.find('meta', attrs={'property': 'og:description'}) if og_description and og_description.get('content'): description = og_description.get('content') elif meta_description and meta_description.get('content'): description = meta_description.get('content') else: description = '' bookmark['description'] = description except Exception as e: bookmark['dead_link'] = True bookmark['etag'] = 'N/A' bookmark['status_code'] = 'N/A' bookmark['description'] = '' finally: fetch_cache[url] = { 'etag': bookmark.get('etag'), 'status_code': bookmark.get('status_code'), 'dead_link': bookmark.get('dead_link'), 'description': bookmark.get('description'), } return bookmark async def process_bookmarks_async(bookmarks): async with aiohttp.ClientSession() as session: tasks = [] for bookmark in bookmarks: task = asyncio.ensure_future(fetch_url_info(session, bookmark)) tasks.append(task) await asyncio.gather(*tasks) def generate_summary(bookmark): description = bookmark.get('description', '') if description: bookmark['summary'] = description else: title = bookmark.get('title', '') if title: bookmark['summary'] = title else: bookmark['summary'] = 'No summary available.' return bookmark def assign_category(bookmark): summary = bookmark.get('summary', '').lower() assigned_category = 'Uncategorized' # Keywords associated with each category category_keywords = { "Social Media": ["social media", "networking", "friends", "connect", "posts", "profile"], "News and Media": ["news", "journalism", "media", "headlines", "breaking news"], "Education and Learning": ["education", "learning", "courses", "tutorial", "university", "academy", "study"], "Entertainment": ["entertainment", "movies", "tv shows", "games", "comics", "fun"], "Shopping and E-commerce": ["shopping", "e-commerce", "buy", "sell", "marketplace", "deals", "store"], "Finance and Banking": ["finance", "banking", "investment", "money", "economy", "stock", "trading"], "Technology": ["technology", "tech", "gadgets", "software", "computers", "innovation"], "Health and Fitness": ["health", "fitness", "medical", "wellness", "exercise", "diet"], "Travel and Tourism": ["travel", "tourism", "destinations", "hotels", "flights", "vacation"], "Food and Recipes": ["food", "recipes", "cooking", "cuisine", "restaurant", "dining"], "Sports": ["sports", "scores", "teams", "athletics", "matches", "leagues"], "Arts and Culture": ["arts", "culture", "museum", "gallery", "exhibition", "artistic"], "Government and Politics": ["government", "politics", "policy", "election", "public service"], "Business and Economy": ["business", "corporate", "industry", "economy", "markets"], "Science and Research": ["science", "research", "experiment", "laboratory", "study", "scientific"], "Personal Blogs and Journals": ["blog", "journal", "personal", "diary", "thoughts", "opinions"], "Job Search and Careers": ["jobs", "careers", "recruitment", "resume", "employment", "hiring"], "Music and Audio": ["music", "audio", "songs", "albums", "artists", "bands"], "Videos and Movies": ["video", "movies", "film", "clips", "trailers", "cinema"], "Reference and Knowledge Bases": ["reference", "encyclopedia", "dictionary", "wiki", "knowledge", "information"], } for category, keywords in category_keywords.items(): for keyword in keywords: if re.search(r'\b' + re.escape(keyword) + r'\b', summary): assigned_category = category break if assigned_category != 'Uncategorized': break bookmark['category'] = assigned_category return bookmark def vectorize_and_index(bookmarks): summaries = [bookmark['summary'] for bookmark in bookmarks] embeddings = embedding_model.encode(summaries) dimension = embeddings.shape[1] faiss_idx = faiss.IndexFlatL2(dimension) faiss_idx.add(np.array(embeddings)) return faiss_idx, embeddings def display_bookmarks(): cards = '' for i, bookmark in enumerate(bookmarks): index = i + 1 # Start index at 1 status = "Dead Link" if bookmark.get('dead_link') else "Active" title = bookmark['title'] url = bookmark['url'] etag = bookmark.get('etag', 'N/A') summary = bookmark.get('summary', '') category = bookmark.get('category', 'Uncategorized') # Apply inline styles for dead links if bookmark.get('dead_link'): card_style = "" # No background color text_style = "color: #D32F2F;" # Red text else: card_style = "" text_style = "" card_html = f'''