import os
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import gradio as gr
from concurrent.futures import ThreadPoolExecutor
import logging
# Suppress urllib3 warnings about unverified HTTPS requests (pages are fetched with verify=False below)
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Environment variable keys for API access
GROQ_API_KEY_BASIC = os.getenv('GROQ_API_KEY_BASIC')
GROQ_API_KEY_ADVANCED = os.getenv('GROQ_API_KEY_ADVANCED')
# LLM Models
MODEL_BASIC = 'llama-3.1-8b-instant'
MODEL_ADVANCED = 'llama-3.1-70b-versatile'
# Verify API keys
if not GROQ_API_KEY_BASIC or not GROQ_API_KEY_ADVANCED:
    logger.error("Both GROQ_API_KEY_BASIC and GROQ_API_KEY_ADVANCED must be set.")
    raise SystemExit(1)
# Embedding model and FAISS index initialization
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
faiss_index = None
bookmarks = []
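# Both globals are populated at runtime by process_bookmarks() below.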
# Define categories
CATEGORIES = [
    "Social Media", "News and Media", "Education and Learning", "Entertainment",
    "Shopping and E-commerce", "Finance and Banking", "Technology", "Health and Fitness",
    "Travel and Tourism", "Food and Recipes", "Sports", "Arts and Culture",
    "Government and Politics", "Business and Economy", "Science and Research",
    "Personal Blogs and Journals", "Job Search and Careers", "Music and Audio",
    "Videos and Movies", "Reference and Knowledge Bases", "Dead Link", "Uncategorized"
]
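# "Uncategorized" doubles as the fallback assigned when the LLM call fails;
# "Dead Link" is defined but not yet assigned anywhere in this version of the script.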
# Task routing logic
def select_model_for_task(content_length):
    """Choose an LLM model, using content length as a proxy for task complexity."""
    if content_length < 500:  # Simple tasks
        return GROQ_API_KEY_BASIC, MODEL_BASIC
    else:  # Complex tasks
        return GROQ_API_KEY_ADVANCED, MODEL_ADVANCED
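# For example, a near-empty page routes to the 8B model, while any full page
# routes to the 70B model:
#   select_model_for_task(200)   -> (GROQ_API_KEY_BASIC, MODEL_BASIC)
#   select_model_for_task(12000) -> (GROQ_API_KEY_ADVANCED, MODEL_ADVANCED)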
# Fetch URL info function
def fetch_url_info(bookmark):
    try:
        response = requests.get(bookmark['url'], timeout=10, verify=False)
        bookmark['html_content'] = response.text
        bookmark['status_code'] = response.status_code
    except Exception as e:
        logger.error(f"Failed to fetch URL info for {bookmark['url']}: {e}")
        bookmark['html_content'] = ''
        bookmark['status_code'] = 'Error'
# Generate summary and assign category
def generate_summary_and_assign_category(bookmark):
    content_length = len(bookmark.get('html_content', ''))
    api_key, model_name = select_model_for_task(content_length)
    # Truncate the page so the prompt stays within the model's context window
    page_content = bookmark.get('html_content', '')[:4000]
    # Prepare the prompt
    prompt = f"""
You are an assistant. Summarize the following webpage content:
{page_content}
Assign one category from this list: {', '.join(CATEGORIES)}.
Respond in the format:
Summary: [Your summary]
Category: [One category]
"""
    try:
        # Groq's OpenAI-compatible chat completions endpoint
        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers={"Authorization": f"Bearer {api_key}"},
            json={
                "model": model_name,
                "messages": [{"role": "user", "content": prompt}],
                "max_tokens": 150,
                "temperature": 0.7,
            },
        )
        response.raise_for_status()
        result = response.json()
        content = result['choices'][0]['message']['content']
        # Extract summary and category from the structured reply
        summary_start = content.find("Summary:")
        category_start = content.find("Category:")
        if summary_start == -1 or category_start == -1:
            raise ValueError("Unexpected response format from LLM")
        bookmark['summary'] = content[summary_start + len("Summary:"):category_start].strip()
        bookmark['category'] = content[category_start + len("Category:"):].strip()
    except Exception as e:
        logger.error(f"Error processing LLM response for {bookmark['url']}: {e}")
        bookmark['summary'] = 'No summary available.'
        bookmark['category'] = 'Uncategorized'
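# The parser above expects a reply shaped like:
#   Summary: A short description of the page.
#   Category: Technology
# Any other shape raises and falls through to the fallback values.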
# Vectorize summaries and build FAISS index
def vectorize_and_index(bookmarks):
    global faiss_index
    summaries = [b['summary'] for b in bookmarks]
    # FAISS expects float32 vectors and int64 ids
    embeddings = embedding_model.encode(summaries).astype('float32')
    dimension = embeddings.shape[1]
    index = faiss.IndexIDMap(faiss.IndexFlatL2(dimension))
    ids = np.arange(len(bookmarks), dtype=np.int64)
    index.add_with_ids(embeddings, ids)
    faiss_index = index
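# Illustrative sketch (not wired into the UI): once the index is built, the
# nearest bookmarks to a free-text query could be retrieved roughly like this:
#
#   def search_bookmarks(query, k=5):
#       query_vec = embedding_model.encode([query]).astype('float32')
#       distances, ids = faiss_index.search(query_vec, k)
#       return [bookmarks[i] for i in ids[0] if i != -1]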
# Gradio interface setup
def process_bookmarks(file):
    global bookmarks
    # gr.File(type="binary") delivers the upload as raw bytes in recent Gradio
    # versions; older versions pass a file-like object, so handle both
    file_content = file.decode('utf-8') if isinstance(file, bytes) else file.read().decode('utf-8')
    soup = BeautifulSoup(file_content, 'html.parser')
    # Parse bookmarks from the exported HTML
    bookmarks = [
        {'url': link.get('href'), 'title': link.text, 'html_content': ''}
        for link in soup.find_all('a') if link.get('href')
    ]
    # Fetch URLs concurrently (list() consumes the iterator so worker errors surface)
    with ThreadPoolExecutor() as executor:
        list(executor.map(fetch_url_info, bookmarks))
    # Process bookmarks with the LLM
    with ThreadPoolExecutor() as executor:
        list(executor.map(generate_summary_and_assign_category, bookmarks))
    # Build FAISS index
    vectorize_and_index(bookmarks)
    return bookmarks
# Build Gradio app
with gr.Blocks() as demo:
    gr.Markdown("# Smart Bookmark Manager")
    file_input = gr.File(label="Upload Bookmark File", type="binary")
    submit_button = gr.Button("Process")
    output = gr.Textbox(label="Output")

    def handle_submit(file):
        processed = process_bookmarks(file)
        return "\n".join(f"{b['title']} - {b['category']}" for b in processed)

    submit_button.click(handle_submit, inputs=file_input, outputs=output)

demo.launch()
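# To run locally: set GROQ_API_KEY_BASIC and GROQ_API_KEY_ADVANCED in the
# environment, then start the app with `python app.py`.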