Spaces:
Sleeping
Sleeping
siddhartharya
committed on
Commit
•
3f6cb23
1
Parent(s):
813f784
Update app.py
Browse files
app.py
CHANGED
@@ -80,11 +80,11 @@ def extract_main_content(soup):
|
|
80 |
"""
|
81 |
if not soup:
|
82 |
return ""
|
83 |
-
|
84 |
# Remove script and style elements
|
85 |
for element in soup(['script', 'style', 'header', 'footer', 'nav', 'ads', 'sidebar']):
|
86 |
element.decompose()
|
87 |
-
|
88 |
# First try to find content in main content areas
|
89 |
main_content_tags = soup.find_all(['article', 'main', 'div.content', 'div.post', 'div.entry-content'])
|
90 |
if main_content_tags:
|
@@ -97,14 +97,14 @@ def extract_main_content(soup):
|
|
97 |
else:
|
98 |
# Fallback to body content
|
99 |
content = soup.body.get_text(strip=True, separator=' ') if soup.body else soup.get_text(strip=True, separator=' ')
|
100 |
-
|
101 |
# Clean up the text
|
102 |
content = ' '.join(content.split())
|
103 |
content = re.sub(r'\s+', ' ', content) # Remove multiple spaces
|
104 |
content = re.sub(r'[\n\r\t]', ' ', content) # Remove newlines and tabs
|
105 |
-
|
106 |
-
#
|
107 |
-
return content
|
108 |
|
109 |
def get_page_metadata(soup):
|
110 |
"""
|
@@ -115,15 +115,15 @@ def get_page_metadata(soup):
|
|
115 |
'description': '',
|
116 |
'keywords': ''
|
117 |
}
|
118 |
-
|
119 |
if not soup:
|
120 |
return metadata
|
121 |
-
|
122 |
# Get title
|
123 |
title_tag = soup.find('title')
|
124 |
if title_tag and title_tag.string:
|
125 |
metadata['title'] = title_tag.string.strip()
|
126 |
-
|
127 |
# Get meta description (try multiple variants)
|
128 |
meta_desc = (
|
129 |
soup.find('meta', attrs={'name': 'description'}) or
|
@@ -132,99 +132,108 @@ def get_page_metadata(soup):
|
|
132 |
)
|
133 |
if meta_desc:
|
134 |
metadata['description'] = meta_desc.get('content', '').strip()
|
135 |
-
|
136 |
# Get meta keywords
|
137 |
meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
|
138 |
if meta_keywords:
|
139 |
metadata['keywords'] = meta_keywords.get('content', '').strip()
|
140 |
-
|
141 |
# Get OG title if main title is empty
|
142 |
if not metadata['title']:
|
143 |
og_title = soup.find('meta', attrs={'property': 'og:title'})
|
144 |
if og_title:
|
145 |
metadata['title'] = og_title.get('content', '').strip()
|
146 |
-
|
147 |
return metadata
|
148 |
|
149 |
def generate_summary(bookmark):
|
150 |
"""
|
151 |
-
Generate a comprehensive summary for a bookmark using available content and LLM.
|
152 |
"""
|
153 |
logger.info(f"Generating summary for bookmark: {bookmark.get('url')}")
|
154 |
-
|
155 |
try:
|
156 |
-
# Get the HTML soup object from the bookmark
|
157 |
soup = BeautifulSoup(bookmark.get('html_content', ''), 'html.parser')
|
158 |
-
|
159 |
-
#
|
160 |
metadata = get_page_metadata(soup)
|
161 |
main_content = extract_main_content(soup)
|
162 |
-
|
163 |
-
#
|
164 |
-
|
165 |
-
|
166 |
-
available_content
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
logger.warning("No content available for summary generation")
|
178 |
-
bookmark['summary'] = bookmark.get('title', 'No summary available.')
|
179 |
-
return bookmark
|
180 |
-
|
181 |
-
prompt = f"""
|
182 |
-
Analyze and summarize this webpage based on the following information:
|
183 |
-
|
184 |
-
{' | '.join(available_content)}
|
185 |
-
|
186 |
-
Please provide a concise summary (2-3 sentences) focusing on:
|
187 |
-
1. The main purpose or topic of the page
|
188 |
-
2. Key information or features
|
189 |
-
3. Target audience or use case (if apparent)
|
190 |
-
|
191 |
-
Be factual and objective.
|
192 |
-
"""
|
193 |
-
|
194 |
-
response = openai.ChatCompletion.create(
|
195 |
-
model='llama3-8b-8192',
|
196 |
-
messages=[
|
197 |
-
{"role": "system", "content": "You are a helpful assistant that creates concise webpage summaries."},
|
198 |
-
{"role": "user", "content": prompt}
|
199 |
-
],
|
200 |
-
max_tokens=150,
|
201 |
-
temperature=0.5,
|
202 |
-
)
|
203 |
-
|
204 |
-
summary = response['choices'][0]['message']['content'].strip()
|
205 |
-
logger.info("Successfully generated LLM summary")
|
206 |
-
bookmark['summary'] = summary
|
207 |
-
return bookmark
|
208 |
-
|
209 |
-
except Exception as e:
|
210 |
-
logger.error(f"Error generating LLM summary: {e}")
|
211 |
-
# Fallback mechanisms in order of preference
|
212 |
-
if metadata['description']:
|
213 |
-
logger.info("Falling back to meta description")
|
214 |
-
bookmark['summary'] = metadata['description']
|
215 |
-
elif main_content:
|
216 |
-
logger.info("Falling back to truncated main content")
|
217 |
-
bookmark['summary'] = ' '.join(main_content.split()[:50]) + '...'
|
218 |
-
elif metadata['title']:
|
219 |
-
logger.info("Falling back to title")
|
220 |
-
bookmark['summary'] = metadata['title']
|
221 |
-
else:
|
222 |
-
bookmark['summary'] = bookmark.get('title', 'No summary available.')
|
223 |
return bookmark
|
224 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
225 |
except Exception as e:
|
226 |
-
logger.error(f"Error
|
227 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
228 |
return bookmark
|
229 |
|
230 |
def parse_bookmarks(file_content):
|
@@ -313,67 +322,75 @@ async def process_bookmarks_async(bookmarks_list):
|
|
313 |
|
314 |
def assign_category(bookmark):
|
315 |
"""
|
316 |
-
Assign a category to a bookmark based on its
|
317 |
"""
|
318 |
if bookmark.get('dead_link'):
|
319 |
bookmark['category'] = 'Dead Link'
|
320 |
logger.info(f"Assigned category 'Dead Link' to bookmark: {bookmark.get('url')}")
|
321 |
return bookmark
|
322 |
|
323 |
-
summary = bookmark.get('summary', '')
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
category_keywords = {
|
328 |
-
"Social Media": ["social media", "networking", "friends", "connect", "posts", "profile"],
|
329 |
-
"News and Media": ["news", "journalism", "media", "headlines", "breaking news"],
|
330 |
-
"Education and Learning": ["education", "learning", "courses", "tutorial", "university", "academy", "study"],
|
331 |
-
"Entertainment": ["entertainment", "movies", "tv shows", "games", "comics", "fun"],
|
332 |
-
"Shopping and E-commerce": ["shopping", "e-commerce", "buy", "sell", "marketplace", "deals", "store"],
|
333 |
-
"Finance and Banking": ["finance", "banking", "investment", "money", "economy", "stock", "trading"],
|
334 |
-
"Technology": ["technology", "tech", "gadgets", "software", "computers", "innovation"],
|
335 |
-
"Health and Fitness": ["health", "fitness", "medical", "wellness", "exercise", "diet"],
|
336 |
-
"Travel and Tourism": ["travel", "tourism", "destinations", "hotels", "flights", "vacation"],
|
337 |
-
"Food and Recipes": ["food", "recipes", "cooking", "cuisine", "restaurant", "dining"],
|
338 |
-
"Sports": ["sports", "scores", "teams", "athletics", "matches", "leagues"],
|
339 |
-
"Arts and Culture": ["arts", "culture", "museum", "gallery", "exhibition", "artistic"],
|
340 |
-
"Government and Politics": ["government", "politics", "policy", "election", "public service"],
|
341 |
-
"Business and Economy": ["business", "corporate", "industry", "economy", "markets"],
|
342 |
-
"Science and Research": ["science", "research", "experiment", "laboratory", "study", "scientific"],
|
343 |
-
"Personal Blogs and Journals": ["blog", "journal", "personal", "diary", "thoughts", "opinions"],
|
344 |
-
"Job Search and Careers": ["jobs", "careers", "recruitment", "resume", "employment", "hiring"],
|
345 |
-
"Music and Audio": ["music", "audio", "songs", "albums", "artists", "bands"],
|
346 |
-
"Videos and Movies": ["video", "movies", "film", "clips", "trailers", "cinema"],
|
347 |
-
"Reference and Knowledge Bases": ["reference", "encyclopedia", "dictionary", "wiki", "knowledge", "information"],
|
348 |
-
}
|
349 |
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
363 |
|
364 |
def vectorize_and_index(bookmarks_list):
|
365 |
"""
|
366 |
-
Create vector embeddings for bookmarks and build FAISS index.
|
367 |
"""
|
368 |
logger.info("Vectorizing summaries and building FAISS index")
|
369 |
try:
|
370 |
summaries = [bookmark['summary'] for bookmark in bookmarks_list]
|
371 |
embeddings = embedding_model.encode(summaries)
|
372 |
dimension = embeddings.shape[1]
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
|
|
|
|
|
377 |
except Exception as e:
|
378 |
logger.error(f"Error in vectorizing and indexing: {e}")
|
379 |
raise
|
@@ -400,6 +417,13 @@ def display_bookmarks():
|
|
400 |
card_style = "border: 2px solid var(--success-color);"
|
401 |
text_style = "color: var(--text-color);"
|
402 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
403 |
card_html = f'''
|
404 |
<div class="card" style="{card_style}; padding: 10px; margin: 10px; border-radius: 5px;">
|
405 |
<div class="card-content">
|
@@ -421,7 +445,7 @@ def process_uploaded_file(file):
|
|
421 |
"""
|
422 |
global bookmarks, faiss_index
|
423 |
logger.info("Processing uploaded file")
|
424 |
-
|
425 |
if file is None:
|
426 |
logger.warning("No file uploaded")
|
427 |
return "Please upload a bookmarks HTML file.", '', gr.update(choices=[]), display_bookmarks()
|
@@ -442,6 +466,10 @@ def process_uploaded_file(file):
|
|
442 |
logger.warning("No bookmarks found in the uploaded file")
|
443 |
return "No bookmarks found in the uploaded file.", '', gr.update(choices=[]), display_bookmarks()
|
444 |
|
|
|
|
|
|
|
|
|
445 |
# Asynchronously fetch bookmark info
|
446 |
try:
|
447 |
asyncio.run(process_bookmarks_async(bookmarks))
|
@@ -455,48 +483,52 @@ def process_uploaded_file(file):
|
|
455 |
assign_category(bookmark)
|
456 |
|
457 |
try:
|
458 |
-
faiss_index
|
459 |
except Exception as e:
|
460 |
logger.error(f"Error building FAISS index: {e}")
|
461 |
return "Error building search index.", '', gr.update(choices=[]), display_bookmarks()
|
462 |
|
463 |
message = f"✅ Successfully processed {len(bookmarks)} bookmarks."
|
464 |
logger.info(message)
|
465 |
-
|
466 |
# Generate displays and updates
|
467 |
bookmark_html = display_bookmarks()
|
468 |
-
choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
|
469 |
for i, bookmark in enumerate(bookmarks)]
|
470 |
|
471 |
return message, bookmark_html, gr.update(choices=choices), bookmark_html
|
472 |
|
473 |
def delete_selected_bookmarks(selected_indices):
|
474 |
"""
|
475 |
-
Delete selected bookmarks.
|
476 |
"""
|
477 |
global bookmarks, faiss_index
|
478 |
if not selected_indices:
|
479 |
return "⚠️ No bookmarks selected.", gr.update(choices=[]), display_bookmarks()
|
480 |
|
481 |
-
|
482 |
-
|
483 |
-
for
|
|
|
484 |
if 0 <= idx < len(bookmarks):
|
|
|
|
|
|
|
485 |
logger.info(f"Deleting bookmark at index {idx + 1}")
|
486 |
-
bookmarks.pop(idx)
|
487 |
|
488 |
-
|
489 |
-
|
490 |
-
|
491 |
-
|
|
|
|
|
|
|
492 |
|
493 |
message = "🗑️ Selected bookmarks deleted successfully."
|
494 |
logger.info(message)
|
495 |
-
|
496 |
-
|
497 |
-
|
498 |
-
for i, bookmark in enumerate(bookmarks)]
|
499 |
-
|
500 |
return message, gr.update(choices=choices), display_bookmarks()
|
501 |
|
502 |
def edit_selected_bookmarks_category(selected_indices, new_category):
|
@@ -516,11 +548,11 @@ def edit_selected_bookmarks_category(selected_indices, new_category):
|
|
516 |
|
517 |
message = "✏️ Category updated for selected bookmarks."
|
518 |
logger.info(message)
|
519 |
-
|
520 |
# Update choices and display
|
521 |
-
choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
|
522 |
for i, bookmark in enumerate(bookmarks)]
|
523 |
-
|
524 |
return message, gr.update(choices=choices), display_bookmarks()
|
525 |
|
526 |
def export_bookmarks():
|
@@ -553,40 +585,52 @@ def export_bookmarks():
|
|
553 |
|
554 |
def chatbot_response(user_query):
|
555 |
"""
|
556 |
-
Generate chatbot response using
|
557 |
"""
|
558 |
-
if not
|
559 |
-
logger.warning("GROQ_API_KEY not set.")
|
560 |
-
return "⚠️ API key not set. Please set the GROQ_API_KEY environment variable."
|
561 |
-
|
562 |
-
if not bookmarks:
|
563 |
logger.warning("No bookmarks available for chatbot")
|
564 |
return "⚠️ No bookmarks available. Please upload and process your bookmarks first."
|
565 |
|
566 |
logger.info(f"Chatbot received query: {user_query}")
|
567 |
|
568 |
try:
|
569 |
-
|
570 |
-
|
571 |
-
|
572 |
-
|
|
|
|
|
|
|
573 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
574 |
prompt = f"""
|
575 |
-
|
576 |
|
577 |
-
|
578 |
-
{user_query}
|
579 |
|
580 |
-
|
581 |
-
|
582 |
|
583 |
-
|
584 |
-
|
585 |
|
586 |
response = openai.ChatCompletion.create(
|
587 |
-
model='llama3-8b-8192',
|
588 |
messages=[
|
589 |
-
{"role": "system", "content": "You
|
590 |
{"role": "user", "content": prompt}
|
591 |
],
|
592 |
max_tokens=500,
|
@@ -680,7 +724,7 @@ def build_app():
|
|
680 |
delete_button = gr.Button("🗑️ Delete Selected")
|
681 |
edit_category_button = gr.Button("✏️ Edit Category")
|
682 |
export_button = gr.Button("💾 Export")
|
683 |
-
|
684 |
download_link = gr.HTML(label="📥 Download")
|
685 |
|
686 |
# Set up event handlers
|
@@ -720,4 +764,4 @@ def build_app():
|
|
720 |
print(f"Error building the app: {e}")
|
721 |
|
722 |
if __name__ == "__main__":
|
723 |
-
build_app()
|
|
|
80 |
"""
|
81 |
if not soup:
|
82 |
return ""
|
83 |
+
|
84 |
# Remove script and style elements
|
85 |
for element in soup(['script', 'style', 'header', 'footer', 'nav', 'ads', 'sidebar']):
|
86 |
element.decompose()
|
87 |
+
|
88 |
# First try to find content in main content areas
|
89 |
main_content_tags = soup.find_all(['article', 'main', 'div.content', 'div.post', 'div.entry-content'])
|
90 |
if main_content_tags:
|
|
|
97 |
else:
|
98 |
# Fallback to body content
|
99 |
content = soup.body.get_text(strip=True, separator=' ') if soup.body else soup.get_text(strip=True, separator=' ')
|
100 |
+
|
101 |
# Clean up the text
|
102 |
content = ' '.join(content.split())
|
103 |
content = re.sub(r'\s+', ' ', content) # Remove multiple spaces
|
104 |
content = re.sub(r'[\n\r\t]', ' ', content) # Remove newlines and tabs
|
105 |
+
|
106 |
+
# Return the content
|
107 |
+
return content
|
108 |
|
109 |
def get_page_metadata(soup):
|
110 |
"""
|
|
|
115 |
'description': '',
|
116 |
'keywords': ''
|
117 |
}
|
118 |
+
|
119 |
if not soup:
|
120 |
return metadata
|
121 |
+
|
122 |
# Get title
|
123 |
title_tag = soup.find('title')
|
124 |
if title_tag and title_tag.string:
|
125 |
metadata['title'] = title_tag.string.strip()
|
126 |
+
|
127 |
# Get meta description (try multiple variants)
|
128 |
meta_desc = (
|
129 |
soup.find('meta', attrs={'name': 'description'}) or
|
|
|
132 |
)
|
133 |
if meta_desc:
|
134 |
metadata['description'] = meta_desc.get('content', '').strip()
|
135 |
+
|
136 |
# Get meta keywords
|
137 |
meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
|
138 |
if meta_keywords:
|
139 |
metadata['keywords'] = meta_keywords.get('content', '').strip()
|
140 |
+
|
141 |
# Get OG title if main title is empty
|
142 |
if not metadata['title']:
|
143 |
og_title = soup.find('meta', attrs={'property': 'og:title'})
|
144 |
if og_title:
|
145 |
metadata['title'] = og_title.get('content', '').strip()
|
146 |
+
|
147 |
return metadata
|
148 |
|
149 |
def generate_summary(bookmark):
    """
    Generate a summary for a bookmark using its page content and the LLM via the Groq Cloud API.

    The bookmark's ``html_content`` is parsed, the page title, description,
    keywords and main text are collected into a prompt, and the LLM reply is
    stored in ``bookmark['summary']``.  If any step raises, the summary falls
    back (in order) to the meta description, the first 50 words of the main
    content, the page title, and finally a fixed placeholder.

    Args:
        bookmark: Mutable mapping describing the bookmark.  ``url``,
            ``html_content`` and ``title`` are read; ``summary`` is written.

    Returns:
        The same ``bookmark`` object with ``summary`` set.
    """
    logger.info(f"Generating summary for bookmark: {bookmark.get('url')}")

    # Bound before the try block so the fallback branch can always read them,
    # even when parsing itself is the step that raised.
    metadata = {'title': '', 'description': '', 'keywords': ''}
    main_content = ''

    try:
        # A stored None must not reach the parser, so coerce it to an empty string.
        soup = BeautifulSoup(bookmark.get('html_content') or '', 'html.parser')

        # Extract metadata and main content
        metadata = get_page_metadata(soup)
        main_content = extract_main_content(soup)

        # Prepare the short, labelled metadata entries for the prompt
        available_content = []
        if metadata['title']:
            available_content.append(f"Title: {metadata['title']}")
        if metadata['description']:
            available_content.append(f"Description: {metadata['description']}")
        if metadata['keywords']:
            available_content.append(f"Keywords: {metadata['keywords']}")

        # Rough budget: whitespace-separated words stand in for tokens.
        max_total_tokens = 8000  # Adjust based on model's maximum context length
        reserved_tokens = 200  # Reserved for the response
        label_words = 2  # The "Main Content:" label itself
        metadata_words = sum(len(entry.split()) for entry in available_content)
        content_budget = max(
            max_total_tokens - reserved_tokens - metadata_words - label_words, 0
        )

        # Trim a copy so the fallback below still sees the untrimmed text.
        prompt_content = main_content
        content_words = main_content.split()
        if len(content_words) > content_budget:
            prompt_content = ' '.join(content_words[:content_budget])
            logger.info("Trimmed main content to fit within token limits.")

        if prompt_content:
            available_content.append(f"Main Content: {prompt_content}")

        if not available_content:
            logger.warning("No content available for summary generation")
            bookmark['summary'] = bookmark.get('title') or 'No summary available.'
            return bookmark

        # Construct the prompt
        prompt = f"""
        Analyze and summarize the following webpage content:

        {' '.join(available_content)}

        Provide a concise summary (2-3 sentences) focusing on:
        - The main purpose or topic of the page.
        - Key information or features.
        - Target audience or use case (if apparent).

        Be factual and objective.
        """

        # Call the LLM via Groq Cloud API
        response = openai.ChatCompletion.create(
            model='llama3-8b-8192',  # Use the model as per your Groq Cloud API configuration
            messages=[
                {"role": "system", "content": "You are a helpful assistant that creates concise webpage summaries."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=200,  # Adjust as necessary to accommodate longer summaries
            temperature=0.5,
        )

        summary = response['choices'][0]['message']['content'].strip()
        if not summary:
            # An empty reply is useless downstream; route it to the fallbacks.
            raise ValueError("LLM returned an empty summary")

        logger.info("Successfully generated LLM summary")
        bookmark['summary'] = summary
        return bookmark

    except Exception as e:
        logger.error(f"Error generating summary: {e}")
        # Fallback mechanisms, in order of preference
        if metadata.get('description'):
            logger.info("Falling back to meta description")
            bookmark['summary'] = metadata['description']
        elif main_content:
            logger.info("Falling back to main content")
            bookmark['summary'] = ' '.join(main_content.split()[:50]) + '...'
        elif metadata.get('title'):
            logger.info("Falling back to title")
            bookmark['summary'] = metadata['title']
        else:
            bookmark['summary'] = 'No summary available.'
        return bookmark
|
238 |
|
239 |
def parse_bookmarks(file_content):
|
|
|
322 |
|
323 |
def assign_category(bookmark):
    """
    Assign a category to a bookmark using the LLM based on its summary via the Groq Cloud API.

    Dead links are labelled 'Dead Link' without calling the LLM.  Bookmarks
    with no usable summary, an unrecognised LLM reply, or a failed API call
    are labelled 'Uncategorized'.

    Args:
        bookmark: Mutable mapping describing the bookmark.  ``dead_link``,
            ``summary`` and ``url`` are read; ``category`` is written.

    Returns:
        The same ``bookmark`` object with ``category`` set.
    """
    if bookmark.get('dead_link'):
        bookmark['category'] = 'Dead Link'
        logger.info(f"Assigned category 'Dead Link' to bookmark: {bookmark.get('url')}")
        return bookmark

    # Treat a missing, None, or whitespace-only summary as "nothing to classify".
    summary = (bookmark.get('summary') or '').strip()
    if not summary:
        bookmark['category'] = 'Uncategorized'
        return bookmark

    # 'Dead Link' is assigned only by the check above; it is neither offered
    # to the LLM nor accepted back from it.
    # NOTE(review): assumes CATEGORIES is an iterable of category-name strings.
    allowed_categories = [cat for cat in CATEGORIES if cat != 'Dead Link']

    # Prepare the prompt
    categories_str = ', '.join([f'"{cat}"' for cat in allowed_categories])
    prompt = f"""
    Based on the following summary, assign the most appropriate category from the list below.

    Summary:
    {summary}

    Categories:
    {categories_str}

    Respond with only the category name.
    """

    try:
        response = openai.ChatCompletion.create(
            model='llama3-8b-8192',  # Use the model as per your Groq Cloud API configuration
            messages=[
                {"role": "system", "content": "You categorize webpages based on their content."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=10,
            temperature=0,
        )

        raw_category = response['choices'][0]['message']['content'].strip()

        # Validate the category.  Compare case-insensitively and ignore
        # surrounding quotes/punctuation so replies such as `"technology".`
        # still resolve to the canonical name.
        canonical_by_key = {cat.casefold(): cat for cat in allowed_categories}
        lookup_key = raw_category.strip(' \t\r\n"\'`.:;,').casefold()
        category = canonical_by_key.get(lookup_key)

        if category is not None:
            bookmark['category'] = category
            logger.info(f"Assigned category '{category}' to bookmark: {bookmark.get('url')}")
        else:
            bookmark['category'] = 'Uncategorized'
            logger.warning(f"Invalid category '{raw_category}' returned by LLM for bookmark: {bookmark.get('url')}")

        return bookmark

    except Exception as e:
        logger.error(f"Error assigning category: {e}")
        bookmark['category'] = 'Uncategorized'
        return bookmark
|
378 |
|
379 |
def vectorize_and_index(bookmarks_list):
    """
    Create vector embeddings for bookmarks and build FAISS index with ID mapping.

    Each bookmark's summary is encoded with ``embedding_model`` and added to
    an L2 index under the bookmark's own ``id``.

    Args:
        bookmarks_list: Non-empty list of bookmark dicts.  ``summary`` is read
            (a missing or None summary is embedded as an empty string) and
            ``id`` is required.

    Returns:
        The populated ``faiss.IndexIDMap``.

    Raises:
        ValueError: If ``bookmarks_list`` is empty.
        Exception: Any error from encoding or indexing is logged and re-raised.
    """
    logger.info("Vectorizing summaries and building FAISS index")
    try:
        if not bookmarks_list:
            # Without at least one embedding the vector dimension is unknown.
            raise ValueError("Cannot build a FAISS index from an empty bookmark list")

        summaries = [bookmark.get('summary') or '' for bookmark in bookmarks_list]

        # FAISS expects a C-contiguous float32 matrix.
        embeddings = np.ascontiguousarray(
            embedding_model.encode(summaries), dtype=np.float32
        )
        dimension = embeddings.shape[1]
        index = faiss.IndexIDMap(faiss.IndexFlatL2(dimension))

        # Use each bookmark's own ID so vectors can later be removed by ID
        ids = np.array([bookmark['id'] for bookmark in bookmarks_list], dtype=np.int64)
        index.add_with_ids(embeddings, ids)
        logger.info("FAISS index built successfully with IDs")
        return index
    except Exception as e:
        logger.error(f"Error in vectorizing and indexing: {e}")
        raise
|
|
|
417 |
card_style = "border: 2px solid var(--success-color);"
|
418 |
text_style = "color: var(--text-color);"
|
419 |
|
420 |
+
# Escape HTML content to prevent XSS attacks
|
421 |
+
from html import escape
|
422 |
+
title = escape(title)
|
423 |
+
url = escape(url)
|
424 |
+
summary = escape(summary)
|
425 |
+
category = escape(category)
|
426 |
+
|
427 |
card_html = f'''
|
428 |
<div class="card" style="{card_style}; padding: 10px; margin: 10px; border-radius: 5px;">
|
429 |
<div class="card-content">
|
|
|
445 |
"""
|
446 |
global bookmarks, faiss_index
|
447 |
logger.info("Processing uploaded file")
|
448 |
+
|
449 |
if file is None:
|
450 |
logger.warning("No file uploaded")
|
451 |
return "Please upload a bookmarks HTML file.", '', gr.update(choices=[]), display_bookmarks()
|
|
|
466 |
logger.warning("No bookmarks found in the uploaded file")
|
467 |
return "No bookmarks found in the uploaded file.", '', gr.update(choices=[]), display_bookmarks()
|
468 |
|
469 |
+
# Assign unique IDs to bookmarks
|
470 |
+
for idx, bookmark in enumerate(bookmarks):
|
471 |
+
bookmark['id'] = idx
|
472 |
+
|
473 |
# Asynchronously fetch bookmark info
|
474 |
try:
|
475 |
asyncio.run(process_bookmarks_async(bookmarks))
|
|
|
483 |
assign_category(bookmark)
|
484 |
|
485 |
try:
|
486 |
+
faiss_index = vectorize_and_index(bookmarks)
|
487 |
except Exception as e:
|
488 |
logger.error(f"Error building FAISS index: {e}")
|
489 |
return "Error building search index.", '', gr.update(choices=[]), display_bookmarks()
|
490 |
|
491 |
message = f"✅ Successfully processed {len(bookmarks)} bookmarks."
|
492 |
logger.info(message)
|
493 |
+
|
494 |
# Generate displays and updates
|
495 |
bookmark_html = display_bookmarks()
|
496 |
+
choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
|
497 |
for i, bookmark in enumerate(bookmarks)]
|
498 |
|
499 |
return message, bookmark_html, gr.update(choices=choices), bookmark_html
|
500 |
|
501 |
def _format_bookmark_choices(bookmark_items):
    """Build the numbered "N. title (Category: c)" labels used by the selector."""
    return [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
            for i, bookmark in enumerate(bookmark_items)]


def delete_selected_bookmarks(selected_indices):
    """
    Delete selected bookmarks and remove their vectors from the FAISS index.

    Args:
        selected_indices: Selected choice labels of the form
            "N. title (Category: c)", where N is the 1-based position of the
            bookmark.  Malformed, out-of-range and duplicate entries are
            ignored.

    Returns:
        A ``(message, choices_update, bookmarks_html)`` tuple for the UI.
    """
    global bookmarks, faiss_index

    # A set keeps a duplicated selection from popping the same position twice,
    # which would delete a neighbouring bookmark.
    indices_to_delete = set()
    for s in selected_indices or []:
        try:
            idx = int(str(s).split('.', 1)[0]) - 1
        except ValueError:
            logger.warning(f"Ignoring malformed selection: {s!r}")
            continue
        if 0 <= idx < len(bookmarks):
            indices_to_delete.add(idx)

    if not indices_to_delete:
        # Nothing to delete: keep the existing choices instead of clearing them.
        return (
            "⚠️ No bookmarks selected.",
            gr.update(choices=_format_bookmark_choices(bookmarks)),
            display_bookmarks(),
        )

    ids_to_delete = []
    for idx in sorted(indices_to_delete):
        ids_to_delete.append(bookmarks[idx]['id'])
        logger.info(f"Deleting bookmark at index {idx + 1}")

    # Remove vectors from FAISS index
    if faiss_index is not None and ids_to_delete:
        faiss_index.remove_ids(np.array(ids_to_delete, dtype=np.int64))

    # Remove bookmarks from the list (reverse order to avoid index shifting)
    for idx in sorted(indices_to_delete, reverse=True):
        bookmarks.pop(idx)

    message = "🗑️ Selected bookmarks deleted successfully."
    logger.info(message)
    choices = _format_bookmark_choices(bookmarks)

    return message, gr.update(choices=choices), display_bookmarks()
|
533 |
|
534 |
def edit_selected_bookmarks_category(selected_indices, new_category):
|
|
|
548 |
|
549 |
message = "✏️ Category updated for selected bookmarks."
|
550 |
logger.info(message)
|
551 |
+
|
552 |
# Update choices and display
|
553 |
+
choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
|
554 |
for i, bookmark in enumerate(bookmarks)]
|
555 |
+
|
556 |
return message, gr.update(choices=choices), display_bookmarks()
|
557 |
|
558 |
def export_bookmarks():
|
|
|
585 |
|
586 |
def chatbot_response(user_query):
|
587 |
"""
|
588 |
+
Generate chatbot response using the FAISS index and embeddings.
|
589 |
"""
|
590 |
+
if not bookmarks or faiss_index is None:
|
|
|
|
|
|
|
|
|
591 |
logger.warning("No bookmarks available for chatbot")
|
592 |
return "⚠️ No bookmarks available. Please upload and process your bookmarks first."
|
593 |
|
594 |
logger.info(f"Chatbot received query: {user_query}")
|
595 |
|
596 |
try:
|
597 |
+
# Encode the user query
|
598 |
+
query_vector = embedding_model.encode([user_query]).astype('float32')
|
599 |
+
|
600 |
+
# Search the FAISS index
|
601 |
+
k = 5 # Number of results to return
|
602 |
+
distances, ids = faiss_index.search(query_vector, k)
|
603 |
+
ids = ids.flatten()
|
604 |
|
605 |
+
# Retrieve the bookmarks
|
606 |
+
id_to_bookmark = {bookmark['id']: bookmark for bookmark in bookmarks}
|
607 |
+
matching_bookmarks = [id_to_bookmark.get(id) for id in ids if id in id_to_bookmark]
|
608 |
+
|
609 |
+
if not matching_bookmarks:
|
610 |
+
return "No relevant bookmarks found for your query."
|
611 |
+
|
612 |
+
# Format the response
|
613 |
+
bookmarks_info = "\n".join([
|
614 |
+
f"Title: {bookmark['title']}\nURL: {bookmark['url']}\nSummary: {bookmark['summary']}"
|
615 |
+
for bookmark in matching_bookmarks
|
616 |
+
])
|
617 |
+
|
618 |
+
# Use the LLM via Groq Cloud API to generate a response
|
619 |
prompt = f"""
|
620 |
+
A user asked: "{user_query}"
|
621 |
|
622 |
+
Based on the bookmarks below, provide a helpful answer to the user's query, referencing the relevant bookmarks.
|
|
|
623 |
|
624 |
+
Bookmarks:
|
625 |
+
{bookmarks_info}
|
626 |
|
627 |
+
Provide a concise and helpful response.
|
628 |
+
"""
|
629 |
|
630 |
response = openai.ChatCompletion.create(
|
631 |
+
model='llama3-8b-8192', # Use the model as per your Groq Cloud API configuration
|
632 |
messages=[
|
633 |
+
{"role": "system", "content": "You assist users by finding relevant information from their bookmarks."},
|
634 |
{"role": "user", "content": prompt}
|
635 |
],
|
636 |
max_tokens=500,
|
|
|
724 |
delete_button = gr.Button("🗑️ Delete Selected")
|
725 |
edit_category_button = gr.Button("✏️ Edit Category")
|
726 |
export_button = gr.Button("💾 Export")
|
727 |
+
|
728 |
download_link = gr.HTML(label="📥 Download")
|
729 |
|
730 |
# Set up event handlers
|
|
|
764 |
print(f"Error building the app: {e}")
|
765 |
|
766 |
if __name__ == "__main__":
|
767 |
+
build_app()
|