import os
import logging
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import gradio as gr

# Suppress InsecureRequestWarning (requests below are made with verify=False)
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Logging setup
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Environment variable keys for API access
GROQ_API_KEY_BASIC = os.getenv('GROQ_API_KEY_BASIC')
GROQ_API_KEY_ADVANCED = os.getenv('GROQ_API_KEY_ADVANCED')

# LLM models
MODEL_BASIC = 'llama-3.1-8b-instant'
MODEL_ADVANCED = 'llama-3.1-70b-versatile'

# Verify API keys
if not GROQ_API_KEY_BASIC or not GROQ_API_KEY_ADVANCED:
    logger.error("Both GROQ_API_KEY_BASIC and GROQ_API_KEY_ADVANCED must be set.")
    raise SystemExit(1)

# Embedding model and FAISS index initialization
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
faiss_index = None
bookmarks = []

# Categories a bookmark can be assigned to
CATEGORIES = [
    "Social Media", "News and Media", "Education and Learning", "Entertainment",
    "Shopping and E-commerce", "Finance and Banking", "Technology",
    "Health and Fitness", "Travel and Tourism", "Food and Recipes", "Sports",
    "Arts and Culture", "Government and Politics", "Business and Economy",
    "Science and Research", "Personal Blogs and Journals",
    "Job Search and Careers", "Music and Audio", "Videos and Movies",
    "Reference and Knowledge Bases", "Dead Link", "Uncategorized",
]


# Task routing logic
def select_model_for_task(content_length):
    """Choose the basic or advanced LLM based on task complexity."""
    if content_length < 500:  # Simple tasks
        return GROQ_API_KEY_BASIC, MODEL_BASIC
    return GROQ_API_KEY_ADVANCED, MODEL_ADVANCED  # Complex tasks


# Fetch URL info function
def fetch_url_info(bookmark):
    """Fetch a bookmark's page and record its HTML and HTTP status code."""
    try:
        response = requests.get(bookmark['url'], timeout=10, verify=False)
        bookmark['html_content'] = response.text
        bookmark['status_code'] = response.status_code
    except Exception as e:
        logger.error(f"Failed to fetch URL info for {bookmark['url']}: {e}")
        bookmark['html_content'] = ''
        bookmark['status_code'] = 'Error'


# Generate summary and assign category
def generate_summary_and_assign_category(bookmark):
    """Summarize a bookmark's page and assign one of CATEGORIES via the LLM."""
    content_length = len(bookmark.get('html_content', ''))
    api_key, model_name = select_model_for_task(content_length)

    # Send the visible page text rather than raw HTML, truncated so the
    # prompt stays within the model's context window.
    page_text = BeautifulSoup(bookmark.get('html_content', ''),
                              'html.parser').get_text(separator=' ', strip=True)[:4000]

    # Prepare the prompt
    prompt = f"""
You are an assistant. Summarize the following webpage content:
{page_text}
Assign one category from this list: {', '.join(CATEGORIES)}.
Respond in the format:
Summary: [Your summary]
Category: [One category]
"""
    try:
        # Groq exposes an OpenAI-compatible chat completions endpoint;
        # the keys above are Groq keys, so the request must go to Groq.
        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers={"Authorization": f"Bearer {api_key}"},
            json={
                "model": model_name,
                "messages": [{"role": "user", "content": prompt}],
                "max_tokens": 150,
                "temperature": 0.7,
            },
            timeout=30,
        )
        response.raise_for_status()
        result = response.json()
        content = result['choices'][0]['message']['content']

        # Extract summary and category from the formatted response
        summary_start = content.find("Summary:")
        category_start = content.find("Category:")
        bookmark['summary'] = content[summary_start + len("Summary:"):category_start].strip()
        bookmark['category'] = content[category_start + len("Category:"):].strip()
    except Exception as e:
        logger.error(f"Error processing LLM response for {bookmark['url']}: {e}")
        bookmark['summary'] = 'No summary available.'
        bookmark['category'] = 'Uncategorized'
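
# --- Illustrative helper (an assumption, not part of the original script) ---
# The FAISS index built below is never queried in this file; the sketch that
# follows shows how it could back semantic search over bookmark summaries.
# The function name `semantic_search` and the default k are hypothetical.
def semantic_search(query, k=5):
    """Return the top-k bookmarks whose summaries are closest to the query."""
    if faiss_index is None or not bookmarks:
        return []
    # Embed the query with the same model used for the summaries
    query_embedding = embedding_model.encode([query]).astype('float32')
    distances, ids = faiss_index.search(query_embedding, min(k, len(bookmarks)))
    # FAISS pads missing results with -1; skip those
    return [bookmarks[i] for i in ids[0] if i != -1]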


# Vectorize summaries and build FAISS index
def vectorize_and_index(bookmarks):
    """Embed all bookmark summaries and build a FAISS index over them."""
    global faiss_index
    summaries = [b['summary'] for b in bookmarks]
    embeddings = embedding_model.encode(summaries).astype('float32')
    dimension = embeddings.shape[1]
    index = faiss.IndexIDMap(faiss.IndexFlatL2(dimension))
    ids = np.arange(len(bookmarks)).astype('int64')
    index.add_with_ids(embeddings, ids)
    faiss_index = index


# Gradio interface setup
def process_bookmarks(file_bytes):
    """Parse an uploaded bookmarks HTML export and process every link."""
    global bookmarks
    # gr.File(type="binary") passes the upload as raw bytes, not a file object
    file_content = file_bytes.decode('utf-8')
    soup = BeautifulSoup(file_content, 'html.parser')

    # Parse bookmarks from the <a> tags of the export file
    bookmarks = [
        {'url': link.get('href'), 'title': link.text, 'html_content': ''}
        for link in soup.find_all('a')
        if link.get('href')
    ]

    # Fetch URLs concurrently
    with ThreadPoolExecutor() as executor:
        executor.map(fetch_url_info, bookmarks)

    # Summarize and categorize concurrently with the LLM
    with ThreadPoolExecutor() as executor:
        executor.map(generate_summary_and_assign_category, bookmarks)

    # Build FAISS index over the generated summaries
    vectorize_and_index(bookmarks)
    return bookmarks


# Build Gradio app
with gr.Blocks() as demo:
    gr.Markdown("# Smart Bookmark Manager")
    file_input = gr.File(label="Upload Bookmark File", type="binary")
    submit_button = gr.Button("Process")
    output = gr.Textbox(label="Output")

    def handle_submit(file_bytes):
        processed = process_bookmarks(file_bytes)
        return "\n".join(f"{b['title']} - {b['category']}" for b in processed)

    submit_button.click(handle_submit, inputs=file_input, outputs=output)

demo.launch()
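
# Usage note (an assumption about deployment, not stated in the original):
# run the script directly, e.g. `python app.py`, with GROQ_API_KEY_BASIC and
# GROQ_API_KEY_ADVANCED exported in the environment; Gradio serves the UI
# locally at http://127.0.0.1:7860 by default.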