import os
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import gradio as gr
from concurrent.futures import ThreadPoolExecutor
import logging
# Suppress urllib3 warnings about unverified HTTPS requests (pages are fetched with verify=False below)
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Environment variable keys for API access
GROQ_API_KEY_BASIC = os.getenv('GROQ_API_KEY_BASIC')
GROQ_API_KEY_ADVANCED = os.getenv('GROQ_API_KEY_ADVANCED')
# LLM Models
MODEL_BASIC = 'llama-3.1-8b-instant'
MODEL_ADVANCED = 'llama-3.1-70b-versatile'
# Verify API keys
if not GROQ_API_KEY_BASIC or not GROQ_API_KEY_ADVANCED:
    logger.error("Both GROQ_API_KEY_BASIC and GROQ_API_KEY_ADVANCED must be set.")
    raise SystemExit(1)
# Embedding model and FAISS index initialization
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
faiss_index = None
bookmarks = []
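# Both globals are populated at runtime by process_bookmarks() below.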
# Define categories
CATEGORIES = [
    "Social Media", "News and Media", "Education and Learning", "Entertainment",
    "Shopping and E-commerce", "Finance and Banking", "Technology", "Health and Fitness",
    "Travel and Tourism", "Food and Recipes", "Sports", "Arts and Culture",
    "Government and Politics", "Business and Economy", "Science and Research",
    "Personal Blogs and Journals", "Job Search and Careers", "Music and Audio",
    "Videos and Movies", "Reference and Knowledge Bases", "Dead Link", "Uncategorized"
]
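# "Uncategorized" doubles as the fallback assigned when the LLM call fails;
# "Dead Link" is defined but not yet assigned anywhere in this version of the script.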
# Task routing logic
def select_model_for_task(content_length):
    """Choose an LLM model, using content length as a proxy for task complexity."""
    if content_length < 500:  # Simple tasks
        return GROQ_API_KEY_BASIC, MODEL_BASIC
    else:  # Complex tasks
        return GROQ_API_KEY_ADVANCED, MODEL_ADVANCED
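# For example, a near-empty page routes to the 8B model, while any full page
# routes to the 70B model:
#   select_model_for_task(200)   -> (GROQ_API_KEY_BASIC, MODEL_BASIC)
#   select_model_for_task(12000) -> (GROQ_API_KEY_ADVANCED, MODEL_ADVANCED)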
# Fetch URL info function
def fetch_url_info(bookmark):
    try:
        response = requests.get(bookmark['url'], timeout=10, verify=False)
        bookmark['html_content'] = response.text
        bookmark['status_code'] = response.status_code
    except Exception as e:
        logger.error(f"Failed to fetch URL info for {bookmark['url']}: {e}")
        bookmark['html_content'] = ''
        bookmark['status_code'] = 'Error'
# Generate summary and assign category
def generate_summary_and_assign_category(bookmark):
    content_length = len(bookmark.get('html_content', ''))
    api_key, model_name = select_model_for_task(content_length)
    # Truncate the page so the prompt stays within the model's context window
    page_content = bookmark.get('html_content', '')[:4000]
    # Prepare the prompt
    prompt = f"""
You are an assistant. Summarize the following webpage content:
{page_content}
Assign one category from this list: {', '.join(CATEGORIES)}.
Respond in the format:
Summary: [Your summary]
Category: [One category]
"""
    try:
        # Groq's OpenAI-compatible chat completions endpoint
        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers={"Authorization": f"Bearer {api_key}"},
            json={
                "model": model_name,
                "messages": [{"role": "user", "content": prompt}],
                "max_tokens": 150,
                "temperature": 0.7,
            },
        )
        response.raise_for_status()
        result = response.json()
        content = result['choices'][0]['message']['content']
        # Extract summary and category from the structured reply
        summary_start = content.find("Summary:")
        category_start = content.find("Category:")
        if summary_start == -1 or category_start == -1:
            raise ValueError("Unexpected response format from LLM")
        bookmark['summary'] = content[summary_start + len("Summary:"):category_start].strip()
        bookmark['category'] = content[category_start + len("Category:"):].strip()
    except Exception as e:
        logger.error(f"Error processing LLM response for {bookmark['url']}: {e}")
        bookmark['summary'] = 'No summary available.'
        bookmark['category'] = 'Uncategorized'
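# The parser above expects a reply shaped like:
#   Summary: A short description of the page.
#   Category: Technology
# Any other shape raises and falls through to the fallback values.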
# Vectorize summaries and build FAISS index
def vectorize_and_index(bookmarks):
    global faiss_index
    summaries = [b['summary'] for b in bookmarks]
    # FAISS expects float32 vectors and int64 ids
    embeddings = embedding_model.encode(summaries).astype('float32')
    dimension = embeddings.shape[1]
    index = faiss.IndexIDMap(faiss.IndexFlatL2(dimension))
    ids = np.arange(len(bookmarks), dtype=np.int64)
    index.add_with_ids(embeddings, ids)
    faiss_index = index
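# Illustrative sketch (not wired into the UI): once the index is built, the
# nearest bookmarks to a free-text query could be retrieved roughly like this:
#
#   def search_bookmarks(query, k=5):
#       query_vec = embedding_model.encode([query]).astype('float32')
#       distances, ids = faiss_index.search(query_vec, k)
#       return [bookmarks[i] for i in ids[0] if i != -1]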
# Gradio interface setup
def process_bookmarks(file):
    global bookmarks
    # gr.File(type="binary") delivers the upload as raw bytes in recent Gradio
    # versions; older versions pass a file-like object, so handle both
    file_content = file.decode('utf-8') if isinstance(file, bytes) else file.read().decode('utf-8')
    soup = BeautifulSoup(file_content, 'html.parser')
    # Parse bookmarks from the exported HTML
    bookmarks = [
        {'url': link.get('href'), 'title': link.text, 'html_content': ''}
        for link in soup.find_all('a') if link.get('href')
    ]
    # Fetch URLs concurrently (list() consumes the iterator so worker errors surface)
    with ThreadPoolExecutor() as executor:
        list(executor.map(fetch_url_info, bookmarks))
    # Process bookmarks with the LLM
    with ThreadPoolExecutor() as executor:
        list(executor.map(generate_summary_and_assign_category, bookmarks))
    # Build FAISS index
    vectorize_and_index(bookmarks)
    return bookmarks
# Build Gradio app
with gr.Blocks() as demo:
    gr.Markdown("# Smart Bookmark Manager")
    file_input = gr.File(label="Upload Bookmark File", type="binary")
    submit_button = gr.Button("Process")
    output = gr.Textbox(label="Output")

    def handle_submit(file):
        processed = process_bookmarks(file)
        return "\n".join(f"{b['title']} - {b['category']}" for b in processed)

    submit_button.click(handle_submit, inputs=file_input, outputs=output)

demo.launch()
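# To run locally: set GROQ_API_KEY_BASIC and GROQ_API_KEY_ADVANCED in the
# environment, then start the app with `python app.py`.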