import os
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import gradio as gr
from concurrent.futures import ThreadPoolExecutor
import logging

# Suppress InsecureRequestWarning from urllib3 (URLs are fetched with verify=False below)
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Environment variable keys for API access
GROQ_API_KEY_BASIC = os.getenv('GROQ_API_KEY_BASIC')
GROQ_API_KEY_ADVANCED = os.getenv('GROQ_API_KEY_ADVANCED')
# LLM models (Groq-hosted Llama variants)
MODEL_BASIC = 'llama-3.1-8b-instant'
MODEL_ADVANCED = 'llama-3.1-70b-versatile'

# Verify API keys
if not GROQ_API_KEY_BASIC or not GROQ_API_KEY_ADVANCED:
    logger.error("Both GROQ_API_KEY_BASIC and GROQ_API_KEY_ADVANCED must be set.")
    raise SystemExit(1)
# Embedding model and FAISS index initialization
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
faiss_index = None
bookmarks = []

# Define categories
CATEGORIES = [
    "Social Media", "News and Media", "Education and Learning", "Entertainment",
    "Shopping and E-commerce", "Finance and Banking", "Technology", "Health and Fitness",
    "Travel and Tourism", "Food and Recipes", "Sports", "Arts and Culture",
    "Government and Politics", "Business and Economy", "Science and Research",
    "Personal Blogs and Journals", "Job Search and Careers", "Music and Audio",
    "Videos and Movies", "Reference and Knowledge Bases", "Dead Link", "Uncategorized"
]
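
# Note: 'Uncategorized' is the fallback assigned below when the LLM response
# cannot be parsed; 'Dead Link' is presumably meant for URLs that fail to fetch.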
# Task routing logic
def select_model_for_task(content_length):
    """Route to the basic or advanced model, using content length as a rough proxy for task complexity."""
    if content_length < 500:  # Short pages go to the cheaper, faster model
        return GROQ_API_KEY_BASIC, MODEL_BASIC
    else:  # Longer pages go to the larger model
        return GROQ_API_KEY_ADVANCED, MODEL_ADVANCED
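
# Illustrative routing (example values, not from the original):
#   select_model_for_task(300)   -> (GROQ_API_KEY_BASIC, MODEL_BASIC)        # short page, 8B model
#   select_model_for_task(5000)  -> (GROQ_API_KEY_ADVANCED, MODEL_ADVANCED)  # long page, 70B model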
# Fetch URL info function
def fetch_url_info(bookmark):
    try:
        # verify=False skips SSL certificate checks (hence the suppressed
        # urllib3 warning above); dead or slow links time out after 10s
        response = requests.get(bookmark['url'], timeout=10, verify=False)
        bookmark['html_content'] = response.text
        bookmark['status_code'] = response.status_code
    except Exception as e:
        logger.error(f"Failed to fetch URL info for {bookmark['url']}: {e}")
        bookmark['html_content'] = ''
        bookmark['status_code'] = 'Error'
# Generate summary and assign category
def generate_summary_and_assign_category(bookmark):
    content_length = len(bookmark.get('html_content', ''))
    api_key, model_name = select_model_for_task(content_length)
    # Truncate the raw HTML so the prompt stays within the model's context window
    page_content = bookmark.get('html_content', '')[:4000]
    prompt = f"""
You are an assistant. Summarize the following webpage content:
{page_content}
Assign one category from this list: {', '.join(CATEGORIES)}.
Respond in the format:
Summary: [Your summary]
Category: [One category]
"""
    try:
        # Groq exposes an OpenAI-compatible chat completions endpoint
        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers={"Authorization": f"Bearer {api_key}"},
            json={
                "model": model_name,
                "messages": [{"role": "user", "content": prompt}],
                "max_tokens": 150,
                "temperature": 0.7,
            },
            timeout=30,
        )
        response.raise_for_status()
        result = response.json()
        content = result['choices'][0]['message']['content']
        # Extract summary and category; fail over to defaults if markers are missing
        summary_start = content.find("Summary:")
        category_start = content.find("Category:")
        if summary_start == -1 or category_start == -1:
            raise ValueError("Response missing 'Summary:' or 'Category:' marker")
        bookmark['summary'] = content[summary_start + len("Summary:"):category_start].strip()
        bookmark['category'] = content[category_start + len("Category:"):].strip()
    except Exception as e:
        logger.error(f"Error processing LLM response for {bookmark['url']}: {e}")
        bookmark['summary'] = 'No summary available.'
        bookmark['category'] = 'Uncategorized'
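
# Expected LLM response shape, per the prompt above (illustrative values):
#   Summary: A developer news aggregator covering startups and programming.
#   Category: Technology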
# Vectorize summaries and build FAISS index
def vectorize_and_index(bookmarks):
    global faiss_index
    summaries = [b['summary'] for b in bookmarks]
    # FAISS expects float32 vectors and int64 ids
    embeddings = np.asarray(embedding_model.encode(summaries), dtype='float32')
    dimension = embeddings.shape[1]
    index = faiss.IndexIDMap(faiss.IndexFlatL2(dimension))
    ids = np.arange(len(bookmarks), dtype='int64')
    index.add_with_ids(embeddings, ids)
    faiss_index = index
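
# A minimal retrieval sketch (not in the original app): nearest-neighbour
# search over the summary embeddings. The function name and default k are
# illustrative assumptions; FAISS returns -1 for unfilled result slots.
def search_bookmarks(query, k=5):
    """Return up to k bookmarks whose summaries are closest to the query."""
    if faiss_index is None:
        return []
    query_vec = np.asarray(embedding_model.encode([query]), dtype='float32')
    _, ids = faiss_index.search(query_vec, k)
    return [bookmarks[i] for i in ids[0] if i != -1]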
# Gradio interface setup
def process_bookmarks(file):
    global bookmarks
    # gr.File(type="binary") passes raw bytes, not a file object
    file_content = file.decode('utf-8')
    soup = BeautifulSoup(file_content, 'html.parser')
    # Parse bookmarks from the exported HTML (one <a href=...> per bookmark)
    bookmarks = [
        {'url': link.get('href'), 'title': link.text, 'html_content': ''}
        for link in soup.find_all('a') if link.get('href')
    ]
    if not bookmarks:
        return []
    # Fetch URLs concurrently
    with ThreadPoolExecutor() as executor:
        executor.map(fetch_url_info, bookmarks)
    # Summarize and categorize with the LLM, also concurrently
    with ThreadPoolExecutor() as executor:
        executor.map(generate_summary_and_assign_category, bookmarks)
    # Build FAISS index over the generated summaries
    vectorize_and_index(bookmarks)
    return bookmarks
# Build Gradio app
with gr.Blocks() as demo:
    gr.Markdown("# Smart Bookmark Manager")
    file_input = gr.File(label="Upload Bookmark File", type="binary")
    submit_button = gr.Button("Process")
    output = gr.Textbox(label="Output")

    def handle_submit(file):
        if file is None:
            return "Please upload a bookmark export file."
        processed = process_bookmarks(file)
        return "\n".join(f"{b['title']} - {b['category']}" for b in processed)

    submit_button.click(handle_submit, inputs=file_input, outputs=output)

demo.launch()
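
# Usage note: on Hugging Face Spaces the app is served automatically; to run
# locally, execute this file (e.g. `python app.py`, filename assumed) and open
# the URL Gradio prints.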