siddhartharya committed on
Commit
3f6cb23
1 Parent(s): 813f784

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +212 -168
app.py CHANGED
@@ -80,11 +80,11 @@ def extract_main_content(soup):
80
  """
81
  if not soup:
82
  return ""
83
-
84
  # Remove script and style elements
85
  for element in soup(['script', 'style', 'header', 'footer', 'nav', 'ads', 'sidebar']):
86
  element.decompose()
87
-
88
  # First try to find content in main content areas
89
  main_content_tags = soup.find_all(['article', 'main', 'div.content', 'div.post', 'div.entry-content'])
90
  if main_content_tags:
@@ -97,14 +97,14 @@ def extract_main_content(soup):
97
  else:
98
  # Fallback to body content
99
  content = soup.body.get_text(strip=True, separator=' ') if soup.body else soup.get_text(strip=True, separator=' ')
100
-
101
  # Clean up the text
102
  content = ' '.join(content.split())
103
  content = re.sub(r'\s+', ' ', content) # Remove multiple spaces
104
  content = re.sub(r'[\n\r\t]', ' ', content) # Remove newlines and tabs
105
-
106
- # Limit content length to avoid token limits (adjust as needed)
107
- return content[:5000]
108
 
109
  def get_page_metadata(soup):
110
  """
@@ -115,15 +115,15 @@ def get_page_metadata(soup):
115
  'description': '',
116
  'keywords': ''
117
  }
118
-
119
  if not soup:
120
  return metadata
121
-
122
  # Get title
123
  title_tag = soup.find('title')
124
  if title_tag and title_tag.string:
125
  metadata['title'] = title_tag.string.strip()
126
-
127
  # Get meta description (try multiple variants)
128
  meta_desc = (
129
  soup.find('meta', attrs={'name': 'description'}) or
@@ -132,99 +132,108 @@ def get_page_metadata(soup):
132
  )
133
  if meta_desc:
134
  metadata['description'] = meta_desc.get('content', '').strip()
135
-
136
  # Get meta keywords
137
  meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
138
  if meta_keywords:
139
  metadata['keywords'] = meta_keywords.get('content', '').strip()
140
-
141
  # Get OG title if main title is empty
142
  if not metadata['title']:
143
  og_title = soup.find('meta', attrs={'property': 'og:title'})
144
  if og_title:
145
  metadata['title'] = og_title.get('content', '').strip()
146
-
147
  return metadata
148
 
149
  def generate_summary(bookmark):
150
  """
151
- Generate a comprehensive summary for a bookmark using available content and LLM.
152
  """
153
  logger.info(f"Generating summary for bookmark: {bookmark.get('url')}")
154
-
155
  try:
156
- # Get the HTML soup object from the bookmark if it exists
157
  soup = BeautifulSoup(bookmark.get('html_content', ''), 'html.parser')
158
-
159
- # Step 1: Extract all available information
160
  metadata = get_page_metadata(soup)
161
  main_content = extract_main_content(soup)
162
-
163
- # Step 2: Generate summary using LLM with all available content
164
- try:
165
- # Prepare comprehensive context for LLM
166
- available_content = []
167
- if metadata['title']:
168
- available_content.append(f"Title: {metadata['title']}")
169
- if metadata['description']:
170
- available_content.append(f"Description: {metadata['description']}")
171
- if metadata['keywords']:
172
- available_content.append(f"Keywords: {metadata['keywords']}")
173
- if main_content:
174
- available_content.append(f"Main Content: {main_content}")
175
-
176
- if not available_content:
177
- logger.warning("No content available for summary generation")
178
- bookmark['summary'] = bookmark.get('title', 'No summary available.')
179
- return bookmark
180
-
181
- prompt = f"""
182
- Analyze and summarize this webpage based on the following information:
183
-
184
- {' | '.join(available_content)}
185
-
186
- Please provide a concise summary (2-3 sentences) focusing on:
187
- 1. The main purpose or topic of the page
188
- 2. Key information or features
189
- 3. Target audience or use case (if apparent)
190
-
191
- Be factual and objective.
192
- """
193
-
194
- response = openai.ChatCompletion.create(
195
- model='llama3-8b-8192',
196
- messages=[
197
- {"role": "system", "content": "You are a helpful assistant that creates concise webpage summaries."},
198
- {"role": "user", "content": prompt}
199
- ],
200
- max_tokens=150,
201
- temperature=0.5,
202
- )
203
-
204
- summary = response['choices'][0]['message']['content'].strip()
205
- logger.info("Successfully generated LLM summary")
206
- bookmark['summary'] = summary
207
- return bookmark
208
-
209
- except Exception as e:
210
- logger.error(f"Error generating LLM summary: {e}")
211
- # Fallback mechanisms in order of preference
212
- if metadata['description']:
213
- logger.info("Falling back to meta description")
214
- bookmark['summary'] = metadata['description']
215
- elif main_content:
216
- logger.info("Falling back to truncated main content")
217
- bookmark['summary'] = ' '.join(main_content.split()[:50]) + '...'
218
- elif metadata['title']:
219
- logger.info("Falling back to title")
220
- bookmark['summary'] = metadata['title']
221
- else:
222
- bookmark['summary'] = bookmark.get('title', 'No summary available.')
223
  return bookmark
224
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  except Exception as e:
226
- logger.error(f"Error in generate_summary: {e}")
227
- bookmark['summary'] = bookmark.get('title', 'No summary available.')
 
 
 
 
 
 
 
 
 
 
 
228
  return bookmark
229
 
230
  def parse_bookmarks(file_content):
@@ -313,67 +322,75 @@ async def process_bookmarks_async(bookmarks_list):
313
 
314
  def assign_category(bookmark):
315
  """
316
- Assign a category to a bookmark based on its content.
317
  """
318
  if bookmark.get('dead_link'):
319
  bookmark['category'] = 'Dead Link'
320
  logger.info(f"Assigned category 'Dead Link' to bookmark: {bookmark.get('url')}")
321
  return bookmark
322
 
323
- summary = bookmark.get('summary', '').lower()
324
- assigned_category = 'Uncategorized'
325
-
326
- # Keywords associated with each category
327
- category_keywords = {
328
- "Social Media": ["social media", "networking", "friends", "connect", "posts", "profile"],
329
- "News and Media": ["news", "journalism", "media", "headlines", "breaking news"],
330
- "Education and Learning": ["education", "learning", "courses", "tutorial", "university", "academy", "study"],
331
- "Entertainment": ["entertainment", "movies", "tv shows", "games", "comics", "fun"],
332
- "Shopping and E-commerce": ["shopping", "e-commerce", "buy", "sell", "marketplace", "deals", "store"],
333
- "Finance and Banking": ["finance", "banking", "investment", "money", "economy", "stock", "trading"],
334
- "Technology": ["technology", "tech", "gadgets", "software", "computers", "innovation"],
335
- "Health and Fitness": ["health", "fitness", "medical", "wellness", "exercise", "diet"],
336
- "Travel and Tourism": ["travel", "tourism", "destinations", "hotels", "flights", "vacation"],
337
- "Food and Recipes": ["food", "recipes", "cooking", "cuisine", "restaurant", "dining"],
338
- "Sports": ["sports", "scores", "teams", "athletics", "matches", "leagues"],
339
- "Arts and Culture": ["arts", "culture", "museum", "gallery", "exhibition", "artistic"],
340
- "Government and Politics": ["government", "politics", "policy", "election", "public service"],
341
- "Business and Economy": ["business", "corporate", "industry", "economy", "markets"],
342
- "Science and Research": ["science", "research", "experiment", "laboratory", "study", "scientific"],
343
- "Personal Blogs and Journals": ["blog", "journal", "personal", "diary", "thoughts", "opinions"],
344
- "Job Search and Careers": ["jobs", "careers", "recruitment", "resume", "employment", "hiring"],
345
- "Music and Audio": ["music", "audio", "songs", "albums", "artists", "bands"],
346
- "Videos and Movies": ["video", "movies", "film", "clips", "trailers", "cinema"],
347
- "Reference and Knowledge Bases": ["reference", "encyclopedia", "dictionary", "wiki", "knowledge", "information"],
348
- }
349
 
350
- for category, keywords in category_keywords.items():
351
- for keyword in keywords:
352
- if re.search(r'\b' + re.escape(keyword) + r'\b', summary):
353
- assigned_category = category
354
- logger.info(f"Assigned category '{assigned_category}' to bookmark: {bookmark.get('url')}")
355
- break
356
- if assigned_category != 'Uncategorized':
357
- break
358
-
359
- bookmark['category'] = assigned_category
360
- if assigned_category == 'Uncategorized':
361
- logger.info(f"No matching category found for bookmark: {bookmark.get('url')}")
362
- return bookmark
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
363
 
364
  def vectorize_and_index(bookmarks_list):
365
  """
366
- Create vector embeddings for bookmarks and build FAISS index.
367
  """
368
  logger.info("Vectorizing summaries and building FAISS index")
369
  try:
370
  summaries = [bookmark['summary'] for bookmark in bookmarks_list]
371
  embeddings = embedding_model.encode(summaries)
372
  dimension = embeddings.shape[1]
373
- faiss_idx = faiss.IndexFlatL2(dimension)
374
- faiss_idx.add(np.array(embeddings))
375
- logger.info("FAISS index built successfully")
376
- return faiss_idx, embeddings
 
 
377
  except Exception as e:
378
  logger.error(f"Error in vectorizing and indexing: {e}")
379
  raise
@@ -400,6 +417,13 @@ def display_bookmarks():
400
  card_style = "border: 2px solid var(--success-color);"
401
  text_style = "color: var(--text-color);"
402
 
 
 
 
 
 
 
 
403
  card_html = f'''
404
  <div class="card" style="{card_style}; padding: 10px; margin: 10px; border-radius: 5px;">
405
  <div class="card-content">
@@ -421,7 +445,7 @@ def process_uploaded_file(file):
421
  """
422
  global bookmarks, faiss_index
423
  logger.info("Processing uploaded file")
424
-
425
  if file is None:
426
  logger.warning("No file uploaded")
427
  return "Please upload a bookmarks HTML file.", '', gr.update(choices=[]), display_bookmarks()
@@ -442,6 +466,10 @@ def process_uploaded_file(file):
442
  logger.warning("No bookmarks found in the uploaded file")
443
  return "No bookmarks found in the uploaded file.", '', gr.update(choices=[]), display_bookmarks()
444
 
 
 
 
 
445
  # Asynchronously fetch bookmark info
446
  try:
447
  asyncio.run(process_bookmarks_async(bookmarks))
@@ -455,48 +483,52 @@ def process_uploaded_file(file):
455
  assign_category(bookmark)
456
 
457
  try:
458
- faiss_index, embeddings = vectorize_and_index(bookmarks)
459
  except Exception as e:
460
  logger.error(f"Error building FAISS index: {e}")
461
  return "Error building search index.", '', gr.update(choices=[]), display_bookmarks()
462
 
463
  message = f"✅ Successfully processed {len(bookmarks)} bookmarks."
464
  logger.info(message)
465
-
466
  # Generate displays and updates
467
  bookmark_html = display_bookmarks()
468
- choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
469
  for i, bookmark in enumerate(bookmarks)]
470
 
471
  return message, bookmark_html, gr.update(choices=choices), bookmark_html
472
 
473
  def delete_selected_bookmarks(selected_indices):
474
  """
475
- Delete selected bookmarks.
476
  """
477
  global bookmarks, faiss_index
478
  if not selected_indices:
479
  return "⚠️ No bookmarks selected.", gr.update(choices=[]), display_bookmarks()
480
 
481
- indices = [int(s.split('.')[0])-1 for s in selected_indices]
482
- indices = sorted(indices, reverse=True)
483
- for idx in indices:
 
484
  if 0 <= idx < len(bookmarks):
 
 
 
485
  logger.info(f"Deleting bookmark at index {idx + 1}")
486
- bookmarks.pop(idx)
487
 
488
- if bookmarks:
489
- faiss_index, embeddings = vectorize_and_index(bookmarks)
490
- else:
491
- faiss_index = None
 
 
 
492
 
493
  message = "🗑️ Selected bookmarks deleted successfully."
494
  logger.info(message)
495
-
496
- # Update choices and display
497
- choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
498
- for i, bookmark in enumerate(bookmarks)]
499
-
500
  return message, gr.update(choices=choices), display_bookmarks()
501
 
502
  def edit_selected_bookmarks_category(selected_indices, new_category):
@@ -516,11 +548,11 @@ def edit_selected_bookmarks_category(selected_indices, new_category):
516
 
517
  message = "✏️ Category updated for selected bookmarks."
518
  logger.info(message)
519
-
520
  # Update choices and display
521
- choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
522
  for i, bookmark in enumerate(bookmarks)]
523
-
524
  return message, gr.update(choices=choices), display_bookmarks()
525
 
526
  def export_bookmarks():
@@ -553,40 +585,52 @@ def export_bookmarks():
553
 
554
  def chatbot_response(user_query):
555
  """
556
- Generate chatbot response using Groq Cloud API.
557
  """
558
- if not GROQ_API_KEY:
559
- logger.warning("GROQ_API_KEY not set.")
560
- return "⚠️ API key not set. Please set the GROQ_API_KEY environment variable."
561
-
562
- if not bookmarks:
563
  logger.warning("No bookmarks available for chatbot")
564
  return "⚠️ No bookmarks available. Please upload and process your bookmarks first."
565
 
566
  logger.info(f"Chatbot received query: {user_query}")
567
 
568
  try:
569
- max_bookmarks = 50
570
- bookmark_data = ""
571
- for idx, bookmark in enumerate(bookmarks[:max_bookmarks]):
572
- bookmark_data += f"{idx+1}. Title: {bookmark['title']}\nURL: {bookmark['url']}\nSummary: {bookmark['summary']}\n\n"
 
 
 
573
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
574
  prompt = f"""
575
- You are an assistant that helps users find relevant bookmarks from their collection based on their queries.
576
 
577
- User Query:
578
- {user_query}
579
 
580
- Bookmarks:
581
- {bookmark_data}
582
 
583
- Please identify the most relevant bookmarks that match the user's query. Provide a concise list including the index, title, URL, and a brief summary.
584
- """
585
 
586
  response = openai.ChatCompletion.create(
587
- model='llama3-8b-8192',
588
  messages=[
589
- {"role": "system", "content": "You help users find relevant bookmarks based on their queries."},
590
  {"role": "user", "content": prompt}
591
  ],
592
  max_tokens=500,
@@ -680,7 +724,7 @@ def build_app():
680
  delete_button = gr.Button("🗑️ Delete Selected")
681
  edit_category_button = gr.Button("✏️ Edit Category")
682
  export_button = gr.Button("💾 Export")
683
-
684
  download_link = gr.HTML(label="📥 Download")
685
 
686
  # Set up event handlers
@@ -720,4 +764,4 @@ def build_app():
720
  print(f"Error building the app: {e}")
721
 
722
  if __name__ == "__main__":
723
- build_app()
 
80
  """
81
  if not soup:
82
  return ""
83
+
84
  # Remove script and style elements
85
  for element in soup(['script', 'style', 'header', 'footer', 'nav', 'ads', 'sidebar']):
86
  element.decompose()
87
+
88
  # First try to find content in main content areas
89
  main_content_tags = soup.find_all(['article', 'main', 'div.content', 'div.post', 'div.entry-content'])
90
  if main_content_tags:
 
97
  else:
98
  # Fallback to body content
99
  content = soup.body.get_text(strip=True, separator=' ') if soup.body else soup.get_text(strip=True, separator=' ')
100
+
101
  # Clean up the text
102
  content = ' '.join(content.split())
103
  content = re.sub(r'\s+', ' ', content) # Remove multiple spaces
104
  content = re.sub(r'[\n\r\t]', ' ', content) # Remove newlines and tabs
105
+
106
+ # Return the content
107
+ return content
108
 
109
  def get_page_metadata(soup):
110
  """
 
115
  'description': '',
116
  'keywords': ''
117
  }
118
+
119
  if not soup:
120
  return metadata
121
+
122
  # Get title
123
  title_tag = soup.find('title')
124
  if title_tag and title_tag.string:
125
  metadata['title'] = title_tag.string.strip()
126
+
127
  # Get meta description (try multiple variants)
128
  meta_desc = (
129
  soup.find('meta', attrs={'name': 'description'}) or
 
132
  )
133
  if meta_desc:
134
  metadata['description'] = meta_desc.get('content', '').strip()
135
+
136
  # Get meta keywords
137
  meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
138
  if meta_keywords:
139
  metadata['keywords'] = meta_keywords.get('content', '').strip()
140
+
141
  # Get OG title if main title is empty
142
  if not metadata['title']:
143
  og_title = soup.find('meta', attrs={'property': 'og:title'})
144
  if og_title:
145
  metadata['title'] = og_title.get('content', '').strip()
146
+
147
  return metadata
148
 
149
  def generate_summary(bookmark):
150
  """
151
+ Generate a comprehensive summary for a bookmark using available content and LLM via the Groq Cloud API.
152
  """
153
  logger.info(f"Generating summary for bookmark: {bookmark.get('url')}")
154
+
155
  try:
156
+ # Get the HTML soup object from the bookmark
157
  soup = BeautifulSoup(bookmark.get('html_content', ''), 'html.parser')
158
+
159
+ # Extract metadata and main content
160
  metadata = get_page_metadata(soup)
161
  main_content = extract_main_content(soup)
162
+
163
+ # Prepare content for the prompt
164
+ available_content = []
165
+ if metadata['title']:
166
+ available_content.append(f"Title: {metadata['title']}")
167
+ if metadata['description']:
168
+ available_content.append(f"Description: {metadata['description']}")
169
+ if metadata['keywords']:
170
+ available_content.append(f"Keywords: {metadata['keywords']}")
171
+ if main_content:
172
+ available_content.append(f"Main Content: {main_content}")
173
+
174
+ if not available_content:
175
+ logger.warning("No content available for summary generation")
176
+ bookmark['summary'] = bookmark.get('title', 'No summary available.')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  return bookmark
178
+
179
+ # Estimate token count and trim content if necessary
180
+ max_total_tokens = 8000 # Adjust based on model's maximum context length
181
+ prompt_tokens_estimate = len(' '.join(available_content).split()) + 200 # 200 tokens reserved for response
182
+ if prompt_tokens_estimate > max_total_tokens:
183
+ # Trim main content
184
+ allowable_content_tokens = max_total_tokens - 200 # Reserve 200 tokens for response
185
+ main_content_tokens = len(main_content.split())
186
+ if main_content_tokens > allowable_content_tokens:
187
+ main_content = ' '.join(main_content.split()[:allowable_content_tokens])
188
+ logger.info("Trimmed main content to fit within token limits.")
189
+
190
+ # Update available content
191
+ available_content[-1] = f"Main Content: {main_content}"
192
+
193
+ # Construct the prompt
194
+ prompt = f"""
195
+ Analyze and summarize the following webpage content:
196
+
197
+ {' '.join(available_content)}
198
+
199
+ Provide a concise summary (2-3 sentences) focusing on:
200
+ - The main purpose or topic of the page.
201
+ - Key information or features.
202
+ - Target audience or use case (if apparent).
203
+
204
+ Be factual and objective.
205
+ """
206
+
207
+ # Call the LLM via Groq Cloud API
208
+ response = openai.ChatCompletion.create(
209
+ model='llama3-8b-8192', # Use the model as per your Groq Cloud API configuration
210
+ messages=[
211
+ {"role": "system", "content": "You are a helpful assistant that creates concise webpage summaries."},
212
+ {"role": "user", "content": prompt}
213
+ ],
214
+ max_tokens=200, # Adjust as necessary to accommodate longer summaries
215
+ temperature=0.5,
216
+ )
217
+
218
+ summary = response['choices'][0]['message']['content'].strip()
219
+ logger.info("Successfully generated LLM summary")
220
+ bookmark['summary'] = summary
221
+ return bookmark
222
+
223
  except Exception as e:
224
+ logger.error(f"Error generating summary: {e}")
225
+ # Fallback mechanisms
226
+ if metadata['description']:
227
+ logger.info("Falling back to meta description")
228
+ bookmark['summary'] = metadata['description']
229
+ elif main_content:
230
+ logger.info("Falling back to main content")
231
+ bookmark['summary'] = ' '.join(main_content.split()[:50]) + '...'
232
+ elif metadata['title']:
233
+ logger.info("Falling back to title")
234
+ bookmark['summary'] = metadata['title']
235
+ else:
236
+ bookmark['summary'] = 'No summary available.'
237
  return bookmark
238
 
239
  def parse_bookmarks(file_content):
 
322
 
323
  def assign_category(bookmark):
324
  """
325
+ Assign a category to a bookmark using the LLM based on its summary via the Groq Cloud API.
326
  """
327
  if bookmark.get('dead_link'):
328
  bookmark['category'] = 'Dead Link'
329
  logger.info(f"Assigned category 'Dead Link' to bookmark: {bookmark.get('url')}")
330
  return bookmark
331
 
332
+ summary = bookmark.get('summary', '')
333
+ if not summary:
334
+ bookmark['category'] = 'Uncategorized'
335
+ return bookmark
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
336
 
337
+ # Prepare the prompt
338
+ categories_str = ', '.join([f'"{cat}"' for cat in CATEGORIES if cat != 'Dead Link'])
339
+ prompt = f"""
340
+ Based on the following summary, assign the most appropriate category from the list below.
341
+
342
+ Summary:
343
+ {summary}
344
+
345
+ Categories:
346
+ {categories_str}
347
+
348
+ Respond with only the category name.
349
+ """
350
+
351
+ try:
352
+ response = openai.ChatCompletion.create(
353
+ model='llama3-8b-8192', # Use the model as per your Groq Cloud API configuration
354
+ messages=[
355
+ {"role": "system", "content": "You categorize webpages based on their content."},
356
+ {"role": "user", "content": prompt}
357
+ ],
358
+ max_tokens=10,
359
+ temperature=0,
360
+ )
361
+
362
+ category = response['choices'][0]['message']['content'].strip().strip('"')
363
+
364
+ # Validate the category
365
+ if category in CATEGORIES:
366
+ bookmark['category'] = category
367
+ logger.info(f"Assigned category '{category}' to bookmark: {bookmark.get('url')}")
368
+ else:
369
+ bookmark['category'] = 'Uncategorized'
370
+ logger.warning(f"Invalid category '{category}' returned by LLM for bookmark: {bookmark.get('url')}")
371
+
372
+ return bookmark
373
+
374
+ except Exception as e:
375
+ logger.error(f"Error assigning category: {e}")
376
+ bookmark['category'] = 'Uncategorized'
377
+ return bookmark
378
 
379
  def vectorize_and_index(bookmarks_list):
380
  """
381
+ Create vector embeddings for bookmarks and build FAISS index with ID mapping.
382
  """
383
  logger.info("Vectorizing summaries and building FAISS index")
384
  try:
385
  summaries = [bookmark['summary'] for bookmark in bookmarks_list]
386
  embeddings = embedding_model.encode(summaries)
387
  dimension = embeddings.shape[1]
388
+ index = faiss.IndexIDMap(faiss.IndexFlatL2(dimension))
389
+ # Assign unique IDs to each bookmark
390
+ ids = np.array([bookmark['id'] for bookmark in bookmarks_list], dtype=np.int64)
391
+ index.add_with_ids(np.array(embeddings).astype('float32'), ids)
392
+ logger.info("FAISS index built successfully with IDs")
393
+ return index
394
  except Exception as e:
395
  logger.error(f"Error in vectorizing and indexing: {e}")
396
  raise
 
417
  card_style = "border: 2px solid var(--success-color);"
418
  text_style = "color: var(--text-color);"
419
 
420
+ # Escape HTML content to prevent XSS attacks
421
+ from html import escape
422
+ title = escape(title)
423
+ url = escape(url)
424
+ summary = escape(summary)
425
+ category = escape(category)
426
+
427
  card_html = f'''
428
  <div class="card" style="{card_style}; padding: 10px; margin: 10px; border-radius: 5px;">
429
  <div class="card-content">
 
445
  """
446
  global bookmarks, faiss_index
447
  logger.info("Processing uploaded file")
448
+
449
  if file is None:
450
  logger.warning("No file uploaded")
451
  return "Please upload a bookmarks HTML file.", '', gr.update(choices=[]), display_bookmarks()
 
466
  logger.warning("No bookmarks found in the uploaded file")
467
  return "No bookmarks found in the uploaded file.", '', gr.update(choices=[]), display_bookmarks()
468
 
469
+ # Assign unique IDs to bookmarks
470
+ for idx, bookmark in enumerate(bookmarks):
471
+ bookmark['id'] = idx
472
+
473
  # Asynchronously fetch bookmark info
474
  try:
475
  asyncio.run(process_bookmarks_async(bookmarks))
 
483
  assign_category(bookmark)
484
 
485
  try:
486
+ faiss_index = vectorize_and_index(bookmarks)
487
  except Exception as e:
488
  logger.error(f"Error building FAISS index: {e}")
489
  return "Error building search index.", '', gr.update(choices=[]), display_bookmarks()
490
 
491
  message = f"✅ Successfully processed {len(bookmarks)} bookmarks."
492
  logger.info(message)
493
+
494
  # Generate displays and updates
495
  bookmark_html = display_bookmarks()
496
+ choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
497
  for i, bookmark in enumerate(bookmarks)]
498
 
499
  return message, bookmark_html, gr.update(choices=choices), bookmark_html
500
 
501
  def delete_selected_bookmarks(selected_indices):
502
  """
503
+ Delete selected bookmarks and remove their vectors from the FAISS index.
504
  """
505
  global bookmarks, faiss_index
506
  if not selected_indices:
507
  return "⚠️ No bookmarks selected.", gr.update(choices=[]), display_bookmarks()
508
 
509
+ ids_to_delete = []
510
+ indices_to_delete = []
511
+ for s in selected_indices:
512
+ idx = int(s.split('.')[0]) - 1
513
  if 0 <= idx < len(bookmarks):
514
+ bookmark_id = bookmarks[idx]['id']
515
+ ids_to_delete.append(bookmark_id)
516
+ indices_to_delete.append(idx)
517
  logger.info(f"Deleting bookmark at index {idx + 1}")
 
518
 
519
+ # Remove vectors from FAISS index
520
+ if faiss_index is not None and ids_to_delete:
521
+ faiss_index.remove_ids(np.array(ids_to_delete, dtype=np.int64))
522
+
523
+ # Remove bookmarks from the list (reverse order to avoid index shifting)
524
+ for idx in sorted(indices_to_delete, reverse=True):
525
+ bookmarks.pop(idx)
526
 
527
  message = "🗑️ Selected bookmarks deleted successfully."
528
  logger.info(message)
529
+ choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
530
+ for i, bookmark in enumerate(bookmarks)]
531
+
 
 
532
  return message, gr.update(choices=choices), display_bookmarks()
533
 
534
  def edit_selected_bookmarks_category(selected_indices, new_category):
 
548
 
549
  message = "✏️ Category updated for selected bookmarks."
550
  logger.info(message)
551
+
552
  # Update choices and display
553
+ choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
554
  for i, bookmark in enumerate(bookmarks)]
555
+
556
  return message, gr.update(choices=choices), display_bookmarks()
557
 
558
  def export_bookmarks():
 
585
 
586
  def chatbot_response(user_query):
587
  """
588
+ Generate chatbot response using the FAISS index and embeddings.
589
  """
590
+ if not bookmarks or faiss_index is None:
 
 
 
 
591
  logger.warning("No bookmarks available for chatbot")
592
  return "⚠️ No bookmarks available. Please upload and process your bookmarks first."
593
 
594
  logger.info(f"Chatbot received query: {user_query}")
595
 
596
  try:
597
+ # Encode the user query
598
+ query_vector = embedding_model.encode([user_query]).astype('float32')
599
+
600
+ # Search the FAISS index
601
+ k = 5 # Number of results to return
602
+ distances, ids = faiss_index.search(query_vector, k)
603
+ ids = ids.flatten()
604
 
605
+ # Retrieve the bookmarks
606
+ id_to_bookmark = {bookmark['id']: bookmark for bookmark in bookmarks}
607
+ matching_bookmarks = [id_to_bookmark.get(id) for id in ids if id in id_to_bookmark]
608
+
609
+ if not matching_bookmarks:
610
+ return "No relevant bookmarks found for your query."
611
+
612
+ # Format the response
613
+ bookmarks_info = "\n".join([
614
+ f"Title: {bookmark['title']}\nURL: {bookmark['url']}\nSummary: {bookmark['summary']}"
615
+ for bookmark in matching_bookmarks
616
+ ])
617
+
618
+ # Use the LLM via Groq Cloud API to generate a response
619
  prompt = f"""
620
+ A user asked: "{user_query}"
621
 
622
+ Based on the bookmarks below, provide a helpful answer to the user's query, referencing the relevant bookmarks.
 
623
 
624
+ Bookmarks:
625
+ {bookmarks_info}
626
 
627
+ Provide a concise and helpful response.
628
+ """
629
 
630
  response = openai.ChatCompletion.create(
631
+ model='llama3-8b-8192', # Use the model as per your Groq Cloud API configuration
632
  messages=[
633
+ {"role": "system", "content": "You assist users by finding relevant information from their bookmarks."},
634
  {"role": "user", "content": prompt}
635
  ],
636
  max_tokens=500,
 
724
  delete_button = gr.Button("🗑️ Delete Selected")
725
  edit_category_button = gr.Button("✏️ Edit Category")
726
  export_button = gr.Button("💾 Export")
727
+
728
  download_link = gr.HTML(label="📥 Download")
729
 
730
  # Set up event handlers
 
764
  print(f"Error building the app: {e}")
765
 
766
  if __name__ == "__main__":
767
+ build_app()