siddhartharya committed on
Commit
813f784
•
1 Parent(s): 3c4e128

Update app.py

Files changed (1)
  1. app.py +399 -650
app.py CHANGED
@@ -12,7 +12,6 @@ import base64
12
  import logging
13
  import os
14
  import sys
15
- import urllib.parse
16
 
17
  # Import OpenAI library
18
  import openai
@@ -75,121 +74,37 @@ if not GROQ_API_KEY:
75
  openai.api_key = GROQ_API_KEY
76
  openai.api_base = "https://api.groq.com/openai/v1"
77
 
78
- def determine_page_type(soup, url):
79
  """
80
- Determine the type of webpage for better content extraction.
81
- """
82
- url_lower = url.lower()
83
-
84
- # Check for common platforms
85
- if 'facebook.com' in url_lower:
86
- return 'social_media_profile'
87
- elif 'wikipedia.org' in url_lower:
88
- return 'wiki_article'
89
- elif any(domain in url_lower for domain in ['news', 'huffpost', 'times']):
90
- return 'news_article'
91
- elif 'youtube.com' in url_lower:
92
- return 'video_platform'
93
- elif '.gov' in url_lower or 'government' in url_lower:
94
- return 'government_site'
95
- elif 'x.com' in url_lower or 'twitter.com' in url_lower:
96
- return 'social_media_platform'
97
-
98
- # Check page structure
99
- if soup.find('article'):
100
- return 'article'
101
- elif soup.find(['shop', 'product', 'price']):
102
- return 'ecommerce'
103
- elif soup.find(['forum', 'comment', 'discussion']):
104
- return 'forum'
105
-
106
- return 'general'
107
-
108
- def extract_main_content_by_type(soup, page_type):
109
- """
110
- Extract content based on page type for better relevance.
111
  """
112
  if not soup:
113
  return ""
114
 
115
- content = ""
116
-
117
- if page_type == 'news_article':
118
- # Try to find the main article content
119
- article_body = soup.find(['article', 'main', 'div'],
120
- class_=lambda x: x and any(c in str(x).lower()
121
- for c in ['article', 'story', 'content', 'body']))
122
- if article_body:
123
- # Get first few paragraphs
124
- paragraphs = article_body.find_all('p')
125
- content = ' '.join(p.get_text() for p in paragraphs[:5])
126
-
127
- elif page_type == 'wiki_article':
128
- # For Wikipedia articles
129
- content_div = soup.find('div', {'id': 'mw-content-text'})
130
- if content_div:
131
- paragraphs = content_div.find_all('p')
132
- content = ' '.join(p.get_text() for p in paragraphs[:3])
133
-
134
- elif page_type in ['social_media_profile', 'social_media_platform']:
135
- # For social media pages
136
- about_section = soup.find(['div', 'section'],
137
- class_=lambda x: x and any(c in str(x).lower()
138
- for c in ['about', 'bio', 'profile', 'description']))
139
- if about_section:
140
- content = about_section.get_text()
141
- else:
142
- # Try to get main content area
143
- content = soup.find(['div', 'main'],
144
- class_=lambda x: x and 'content' in str(x).lower())
145
- if content:
146
- content = content.get_text()
147
-
148
- # If no content found using specific extractors, use general extraction
149
- if not content.strip():
150
- # Remove unwanted elements
151
- for element in soup(['script', 'style', 'nav', 'footer', 'header']):
152
- element.decompose()
153
-
154
- # Try to find main content area
155
- main_content = soup.find(['main', 'article', 'div'],
156
- class_=lambda x: x and 'content' in str(x).lower())
157
- if main_content:
158
- # Get all text from paragraphs
159
- paragraphs = main_content.find_all('p')
160
- content = ' '.join(p.get_text() for p in paragraphs)
161
  else:
162
  # Fallback to body content
163
- content = soup.get_text()
164
 
165
- # Clean the extracted content
166
- content = clean_text(content)
 
 
167
 
168
- return content[:5000] # Limit content length
169
-
170
- def clean_text(text):
171
- """
172
- Clean extracted text content.
173
- """
174
- if not text:
175
- return ""
176
-
177
- # Convert to string if necessary
178
- text = str(text)
179
-
180
- # Remove extra whitespace
181
- text = re.sub(r'\s+', ' ', text)
182
-
183
- # Remove special characters but keep basic punctuation
184
- text = re.sub(r'[^\w\s.,!?-]', '', text)
185
-
186
- # Remove multiple punctuation
187
- text = re.sub(r'([.,!?])\1+', r'\1', text)
188
-
189
- # Remove very short words (likely garbage)
190
- text = ' '.join(word for word in text.split() if len(word) > 1)
191
-
192
- return text.strip()
193
 
194
  def get_page_metadata(soup):
195
  """
@@ -204,213 +119,170 @@ def get_page_metadata(soup):
204
  if not soup:
205
  return metadata
206
 
207
- # Get title (try multiple sources)
208
  title_tag = soup.find('title')
209
- og_title = soup.find('meta', {'property': 'og:title'})
210
- twitter_title = soup.find('meta', {'name': 'twitter:title'})
211
-
212
  if title_tag and title_tag.string:
213
  metadata['title'] = title_tag.string.strip()
214
- elif og_title and og_title.get('content'):
215
- metadata['title'] = og_title.get('content').strip()
216
- elif twitter_title and twitter_title.get('content'):
217
- metadata['title'] = twitter_title.get('content').strip()
218
-
219
- # Get meta description (try multiple sources)
220
- desc_sources = [
221
- ('meta', {'name': 'description'}),
222
- ('meta', {'property': 'og:description'}),
223
- ('meta', {'name': 'twitter:description'}),
224
- ]
225
 
226
- for tag, attrs in desc_sources:
227
- desc = soup.find(tag, attrs=attrs)
228
- if desc and desc.get('content'):
229
- metadata['description'] = desc.get('content').strip()
230
- break
 
 
 
231
 
232
  # Get meta keywords
233
- keywords_tag = soup.find('meta', {'name': 'keywords'})
234
- if keywords_tag and keywords_tag.get('content'):
235
- metadata['keywords'] = keywords_tag.get('content').strip()
236
 
237
- return metadata
238
-
239
- def generate_contextual_summary(context):
240
- """
241
- Generate summary with context awareness using LLM.
242
- """
243
- page_type = context['page_type']
244
-
245
- # Customize prompt based on page type
246
- type_specific_prompts = {
247
- 'news_article': "This is a news article. Focus on the main news event, key facts, and significance.",
248
- 'wiki_article': "This is a Wikipedia article. Focus on the main topic, key facts, and historical context.",
249
- 'social_media_profile': "This is a social media profile. Focus on the platform's purpose and key features.",
250
- 'social_media_platform': "This is a social media platform. Describe its main purpose and unique features.",
251
- 'ecommerce': "This is an e-commerce site. Focus on what products/services are offered and target audience.",
252
- 'government_site': "This is a government website. Focus on services offered and public information provided.",
253
- 'video_platform': "This is a video platform. Describe its main purpose and content sharing features.",
254
- 'general': "Describe the main purpose and key features of this webpage."
255
- }
256
-
257
- prompt = f"""
258
- Analyze this webpage and create a clear, factual summary:
259
-
260
- Title: {context['title']}
261
- Type: {page_type}
262
- Description: {context['description']}
263
- Keywords: {context['keywords']}
264
-
265
- Additional Content:
266
- {context['content'][:3000]}
267
-
268
- {type_specific_prompts.get(page_type, type_specific_prompts['general'])}
269
-
270
- Create a natural, informative 2-3 sentence summary that:
271
- 1. States the primary purpose/main topic
272
- 2. Mentions key features or information
273
- 3. Indicates target audience or use case (if clear)
274
-
275
- Keep the tone professional and factual.
276
- """
277
 
278
- try:
279
- response = openai.ChatCompletion.create(
280
- model='llama3-8b-8192',
281
- messages=[
282
- {"role": "system", "content": "You are a precise webpage summarizer that creates clear, accurate summaries."},
283
- {"role": "user", "content": prompt}
284
- ],
285
- max_tokens=150,
286
- temperature=0.3,
287
- )
288
-
289
- return response['choices'][0]['message']['content'].strip()
290
- except Exception as e:
291
- logger.error(f"Error generating LLM summary: {e}")
292
- return None
293
 
294
  def generate_summary(bookmark):
295
  """
296
  Generate a comprehensive summary for a bookmark using available content and LLM.
297
  """
298
- logger.info(f"Generating summary for {bookmark.get('url')}")
299
 
300
  try:
 
301
  soup = BeautifulSoup(bookmark.get('html_content', ''), 'html.parser')
302
 
303
- # 1. Extract all available metadata
304
  metadata = get_page_metadata(soup)
 
305
 
306
- # 2. Determine page type and context
307
- page_type = determine_page_type(soup, bookmark['url'])
308
-
309
- # 3. Extract relevant content based on page type
310
- main_content = extract_main_content_by_type(soup, page_type)
311
-
312
- # 4. Generate summary using LLM with contextual awareness
313
  try:
314
- context = {
315
- 'title': metadata['title'] or bookmark.get('title', ''),
316
- 'description': metadata['description'],
317
- 'keywords': metadata['keywords'],
318
- 'page_type': page_type,
319
- 'content': main_content
320
- }
 
 
 
321
 
322
- summary = generate_contextual_summary(context)
323
- if summary:
324
- bookmark['summary'] = summary
325
  return bookmark
326
327
  except Exception as e:
328
- logger.error(f"Error in LLM summary generation: {e}")
329
-
330
- # Fallback mechanism
331
- if metadata['description']:
332
- bookmark['summary'] = metadata['description']
333
- elif main_content:
334
- bookmark['summary'] = ' '.join(main_content.split()[:50]) + '...'
335
- else:
336
- bookmark['summary'] = metadata.get('title', bookmark.get('title', 'No summary available.'))
337
 
338
  except Exception as e:
339
  logger.error(f"Error in generate_summary: {e}")
340
  bookmark['summary'] = bookmark.get('title', 'No summary available.')
341
-
342
- return bookmark
343
 
344
  async def fetch_url_info(session, bookmark):
345
  """
346
- Enhanced URL fetching with better error handling and request configuration.
347
  """
348
  url = bookmark['url']
349
  if url in fetch_cache:
350
  bookmark.update(fetch_cache[url])
351
  return bookmark
352
 
353
- headers = {
354
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
355
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
356
- 'Accept-Language': 'en-US,en;q=0.5',
357
- 'Accept-Encoding': 'gzip, deflate, br',
358
- 'Connection': 'keep-alive',
359
- 'Upgrade-Insecure-Requests': '1',
360
- 'Sec-Fetch-Dest': 'document',
361
- 'Sec-Fetch-Mode': 'navigate',
362
- 'Sec-Fetch-Site': 'none',
363
- 'Sec-Fetch-User': '?1',
364
- 'Cache-Control': 'max-age=0'
365
- }
366
-
367
  try:
368
  logger.info(f"Fetching URL info for: {url}")
369
- timeout = aiohttp.ClientTimeout(total=30)
370
- async with session.get(
371
- url,
372
- timeout=timeout,
373
- headers=headers,
374
- ssl=False,
375
- allow_redirects=True
376
- ) as response:
377
-
378
- status = response.status
379
- bookmark['status_code'] = status
380
  bookmark['etag'] = response.headers.get('ETag', 'N/A')
381
-
382
- # Handle different status codes
383
- if status == 200:
384
- content = await response.text()
385
- bookmark['html_content'] = content
386
- bookmark['dead_link'] = False
387
- bookmark['description'] = '' # Will be set by generate_summary
388
- logger.info(f"Successfully fetched content for {url}")
389
- elif status in [301, 302, 307, 308]:
390
- # Handle redirects manually if needed
391
- bookmark['dead_link'] = False
392
- bookmark['html_content'] = ''
393
- logger.info(f"Redirect detected for {url}")
394
- else:
395
  bookmark['dead_link'] = True
 
396
  bookmark['html_content'] = ''
397
- logger.warning(f"Non-success status {status} for {url}")
398
-
399
- except asyncio.TimeoutError:
400
- logger.warning(f"Timeout while fetching {url}")
401
- bookmark['dead_link'] = False # Don't mark as dead just because of timeout
402
- bookmark['status_code'] = 'Timeout'
 
403
  except Exception as e:
404
- logger.error(f"Error fetching {url}: {str(e)}")
405
- bookmark['dead_link'] = False # Don't mark as dead for other errors
406
- bookmark['status_code'] = str(e)
 
 
 
407
  finally:
408
- # Ensure all required fields are present
409
- bookmark.setdefault('html_content', '')
410
- bookmark.setdefault('description', '')
411
- bookmark.setdefault('etag', 'N/A')
412
-
413
- # Update cache
414
  fetch_cache[url] = {
415
  'etag': bookmark.get('etag'),
416
  'status_code': bookmark.get('status_code'),
@@ -418,80 +290,76 @@ async def fetch_url_info(session, bookmark):
418
  'description': bookmark.get('description'),
419
  'html_content': bookmark.get('html_content', '')
420
  }
421
-
422
  return bookmark
423
 
424
  async def process_bookmarks_async(bookmarks_list):
425
  """
426
- Process all bookmarks asynchronously with improved error handling.
427
  """
428
  logger.info("Processing bookmarks asynchronously")
429
  try:
430
- # Configure connection pool and timeout
431
- tcp_connector = aiohttp.TCPConnector(
432
- limit=5, # Limit concurrent connections
433
- force_close=True, # Force close connections
434
- enable_cleanup_closed=True, # Clean up closed connections
435
- ssl=False # Disable SSL verification
436
- )
437
-
438
- timeout = aiohttp.ClientTimeout(total=30)
439
-
440
- async with aiohttp.ClientSession(
441
- connector=tcp_connector,
442
- timeout=timeout,
443
- raise_for_status=False # Don't raise exceptions for non-200 status
444
- ) as session:
445
  tasks = []
446
  for bookmark in bookmarks_list:
447
  task = asyncio.ensure_future(fetch_url_info(session, bookmark))
448
  tasks.append(task)
449
-
450
- # Process bookmarks in batches to avoid overwhelming servers
451
- batch_size = 5
452
- for i in range(0, len(tasks), batch_size):
453
- batch = tasks[i:i + batch_size]
454
- await asyncio.gather(*batch)
455
- await asyncio.sleep(1) # Small delay between batches
456
-
457
  logger.info("Completed processing bookmarks asynchronously")
458
  except Exception as e:
459
  logger.error(f"Error in asynchronous processing of bookmarks: {e}")
460
  raise
461
 
462
- def parse_bookmarks(file_content):
463
  """
464
- Parse bookmarks from HTML file with enhanced error handling.
465
  """
466
- logger.info("Parsing bookmarks")
467
- try:
468
- soup = BeautifulSoup(file_content, 'html.parser')
469
- extracted_bookmarks = []
470
-
471
- # Find all bookmark links
472
- for link in soup.find_all('a'):
473
- url = link.get('href', '').strip()
474
- title = link.text.strip()
475
-
476
- # Validate URL and title
477
- if url and title and url.startswith(('http://', 'https://')):
478
- # Clean and normalize URL
479
- parsed_url = urllib.parse.urlparse(url)
480
- normalized_url = urllib.parse.urlunparse(parsed_url)
481
-
482
- bookmark = {
483
- 'url': normalized_url,
484
- 'title': title,
485
- 'add_date': link.get('add_date', ''),
486
- 'icon': link.get('icon', '')
487
- }
488
- extracted_bookmarks.append(bookmark)
489
-
490
- logger.info(f"Extracted {len(extracted_bookmarks)} valid bookmarks")
491
- return extracted_bookmarks
492
- except Exception as e:
493
- logger.error(f"Error parsing bookmarks: {e}")
494
- raise
495
 
496
  def vectorize_and_index(bookmarks_list):
497
  """
@@ -499,25 +367,11 @@ def vectorize_and_index(bookmarks_list):
499
  """
500
  logger.info("Vectorizing summaries and building FAISS index")
501
  try:
502
- # Prepare summaries for vectorization
503
- summaries = []
504
- for bookmark in bookmarks_list:
505
- summary = bookmark.get('summary', '').strip()
506
- title = bookmark.get('title', '').strip()
507
- # Combine title and summary for better embedding
508
- text = f"{title} {summary}".strip()
509
- summaries.append(text if text else "No content available")
510
-
511
- # Generate embeddings
512
  embeddings = embedding_model.encode(summaries)
513
-
514
- # Create and configure FAISS index
515
  dimension = embeddings.shape[1]
516
  faiss_idx = faiss.IndexFlatL2(dimension)
517
-
518
- # Add vectors to index
519
  faiss_idx.add(np.array(embeddings))
520
-
521
  logger.info("FAISS index built successfully")
522
  return faiss_idx, embeddings
523
  except Exception as e:
@@ -526,7 +380,7 @@ def vectorize_and_index(bookmarks_list):
526
 
527
  def display_bookmarks():
528
  """
529
- Generate HTML display for bookmarks with enhanced styling.
530
  """
531
  logger.info("Generating HTML display for bookmarks")
532
  cards = ''
@@ -538,350 +392,201 @@ def display_bookmarks():
538
  etag = bookmark.get('etag', 'N/A')
539
  summary = bookmark.get('summary', '')
540
  category = bookmark.get('category', 'Uncategorized')
541
- status_code = bookmark.get('status_code', 'N/A')
542
 
543
- # Enhanced styling based on status
544
  if bookmark.get('dead_link'):
545
- card_style = "border: 2px solid #ff4444; background-color: rgba(255, 68, 68, 0.1);"
546
- text_style = "color: #ff4444;"
547
  else:
548
- card_style = "border: 2px solid #00C851; background-color: rgba(0, 200, 81, 0.1);"
549
  text_style = "color: var(--text-color);"
550
 
551
- # Properly escape any backslashes if present in summary or other fields
552
- # (Not strictly necessary here, but good practice)
553
- summary_escaped = summary.replace('\\', '\\\\')
554
-
555
  card_html = f'''
556
- <div class="card" style="{card_style} padding: 15px; margin: 15px 0; border-radius: 8px; box-shadow: 0 2px 5px rgba(0,0,0,0.1);">
557
  <div class="card-content">
558
- <h3 style="{text_style} margin-bottom: 10px; font-size: 1.2em;">
559
- {index}. {title} {status}
560
- {f'<span style="font-size: 0.8em; color: #666;">({status_code})</span>' if status_code != 'N/A' else ''}
561
- </h3>
562
  <p style="{text_style}"><strong>Category:</strong> {category}</p>
563
  <p style="{text_style}"><strong>URL:</strong> <a href="{url}" target="_blank" style="{text_style}">{url}</a></p>
564
  <p style="{text_style}"><strong>ETag:</strong> {etag}</p>
565
- <p style="{text_style}"><strong>Summary:</strong> {summary_escaped}</p>
566
  </div>
567
  </div>
568
  '''
569
  cards += card_html
570
-
571
- # Add container with max width and padding
572
- display_html = f'''
573
- <div style="max-width: 1200px; margin: 0 auto; padding: 20px;">
574
- {cards}
575
- </div>
576
- '''
577
-
578
  logger.info("HTML display generated")
579
- return display_html
580
 
581
- def assign_category(bookmark):
582
  """
583
- Assign a category to a bookmark based on its title or summary.
584
- This is a simple implementation and can be enhanced with more sophisticated methods.
585
- """
586
- title = bookmark.get('title', '').lower()
587
- summary = bookmark.get('summary', '').lower()
588
-
589
- # Simple keyword-based categorization
590
- if any(keyword in title or keyword in summary for keyword in ['facebook', 'twitter', 'instagram']):
591
- bookmark['category'] = 'Social Media'
592
- elif any(keyword in title or keyword in summary for keyword in ['news', 'media', 'huffpost', 'times']):
593
- bookmark['category'] = 'News and Media'
594
- elif any(keyword in title or keyword in summary for keyword in ['course', 'learning', 'education']):
595
- bookmark['category'] = 'Education and Learning'
596
- elif any(keyword in title or keyword in summary for keyword in ['movie', 'music', 'audio', 'video']):
597
- bookmark['category'] = 'Entertainment'
598
- elif any(keyword in title or keyword in summary for keyword in ['shop', 'e-commerce', 'buy', 'purchase']):
599
- bookmark['category'] = 'Shopping and E-commerce'
600
- elif any(keyword in title or keyword in summary for keyword in ['finance', 'banking', 'investment']):
601
- bookmark['category'] = 'Finance and Banking'
602
- elif any(keyword in title or keyword in summary for keyword in ['tech', 'technology', 'software']):
603
- bookmark['category'] = 'Technology'
604
- elif any(keyword in title or keyword in summary for keyword in ['health', 'fitness', 'wellness']):
605
- bookmark['category'] = 'Health and Fitness'
606
- elif any(keyword in title or keyword in summary for keyword in ['travel', 'tourism', 'flight', 'hotel']):
607
- bookmark['category'] = 'Travel and Tourism'
608
- elif any(keyword in title or keyword in summary for keyword in ['recipe', 'food', 'cooking']):
609
- bookmark['category'] = 'Food and Recipes'
610
- elif any(keyword in title or keyword in summary for keyword in ['sport', 'game', 'fitness']):
611
- bookmark['category'] = 'Sports'
612
- elif any(keyword in title or keyword in summary for keyword in ['art', 'culture', 'museum']):
613
- bookmark['category'] = 'Arts and Culture'
614
- elif any(keyword in title or keyword in summary for keyword in ['gov', 'government', 'politics']):
615
- bookmark['category'] = 'Government and Politics'
616
- elif any(keyword in title or keyword in summary for keyword in ['business', 'economy', 'market']):
617
- bookmark['category'] = 'Business and Economy'
618
- elif any(keyword in title or keyword in summary for keyword in ['science', 'research', 'study']):
619
- bookmark['category'] = 'Science and Research'
620
- elif any(keyword in title or keyword in summary for keyword in ['blog', 'journal']):
621
- bookmark['category'] = 'Personal Blogs and Journals'
622
- elif any(keyword in title or keyword in summary for keyword in ['job', 'career', 'employment']):
623
- bookmark['category'] = 'Job Search and Careers'
624
- elif any(keyword in title or keyword in summary for keyword in ['audio', 'music']):
625
- bookmark['category'] = 'Music and Audio'
626
- elif any(keyword in title or keyword in summary for keyword in ['video', 'movie']):
627
- bookmark['category'] = 'Videos and Movies'
628
- elif any(keyword in title or keyword in summary for keyword in ['reference', 'knowledge', 'wiki']):
629
- bookmark['category'] = 'Reference and Knowledge Bases'
630
- elif bookmark.get('dead_link'):
631
- bookmark['category'] = 'Dead Link'
632
- else:
633
- bookmark['category'] = 'Uncategorized'
634
-
635
- def process_uploaded_file(file, delete_checkbox, edit_checkbox):
636
- """
637
- Process the uploaded bookmarks file with enhanced error handling and user feedback.
638
  """
639
  global bookmarks, faiss_index
640
  logger.info("Processing uploaded file")
641
 
642
  if file is None:
643
- return "⚠️ Please upload a bookmarks HTML file.", '', gr.Dropdown.update(choices=[]), gr.Dropdown.update(choices=[])
644
-
 
645
  try:
646
  file_content = file.decode('utf-8')
647
  except UnicodeDecodeError as e:
648
- logger.error(f"Error decoding file: {e}")
649
- return "⚠️ Error decoding file. Please ensure it's a valid HTML file.", '', gr.Dropdown.update(choices=[]), gr.Dropdown.update(choices=[])
650
-
651
  try:
652
  bookmarks = parse_bookmarks(file_content)
653
  except Exception as e:
654
  logger.error(f"Error parsing bookmarks: {e}")
655
- return "⚠️ Error parsing the bookmarks file.", '', gr.Dropdown.update(choices=[]), gr.Dropdown.update(choices=[])
656
-
657
  if not bookmarks:
658
- return "⚠️ No valid bookmarks found in the file.", '', gr.Dropdown.update(choices=[]), gr.Dropdown.update(choices=[])
659
-
 
 
660
  try:
661
- logger.info("Processing bookmarks...")
662
  asyncio.run(process_bookmarks_async(bookmarks))
663
-
664
- # Process in batches for progress tracking
665
- total = len(bookmarks)
666
- for i, bookmark in enumerate(bookmarks, 1):
667
- generate_summary(bookmark)
668
- assign_category(bookmark)
669
- logger.info(f"Processed bookmark {i}/{total}")
670
 
671
- faiss_index, embeddings = vectorize_and_index(bookmarks)
672
-
673
- message = f"βœ… Successfully processed {len(bookmarks)} bookmarks!"
674
- choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
675
- for i, bookmark in enumerate(bookmarks)]
676
-
677
- bookmark_html = display_bookmarks()
678
- return message, bookmark_html, gr.CheckboxGroup.update(choices=choices), gr.CheckboxGroup.update(choices=choices)
679
 
 
 
680
  except Exception as e:
681
- logger.error(f"Error processing bookmarks: {e}")
682
- return "⚠️ Error processing bookmarks. Please try again.", '', gr.CheckboxGroup.update(choices=[]), gr.CheckboxGroup.update(choices=[])
683
 
684
- def delete_selected_bookmarks(selected_indices, delete_checkbox, edit_checkbox):
685
  """
686
- Delete selected bookmarks with enhanced error handling.
687
  """
688
  global bookmarks, faiss_index
689
-
690
  if not selected_indices:
691
- return "⚠️ No bookmarks selected.", gr.CheckboxGroup.update(choices=[]), gr.CheckboxGroup.update(choices=[]), display_bookmarks()
692
 
693
- try:
694
- indices = [int(s.split('.')[0])-1 for s in selected_indices]
695
- indices = sorted(indices, reverse=True)
696
- deleted_count = 0
697
-
698
- for idx in indices:
699
- if 0 <= idx < len(bookmarks):
700
- logger.info(f"Deleting bookmark: {bookmarks[idx]['title']}")
701
- bookmarks.pop(idx)
702
- deleted_count += 1
703
-
704
- if bookmarks:
705
- faiss_index, embeddings = vectorize_and_index(bookmarks)
706
- else:
707
- faiss_index = None
708
 
709
- message = f"βœ… Successfully deleted {deleted_count} bookmark{'s' if deleted_count != 1 else ''}."
710
- choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
711
- for i, bookmark in enumerate(bookmarks)]
712
-
713
- return message, gr.CheckboxGroup.update(choices=choices), gr.CheckboxGroup.update(choices=choices), display_bookmarks()
714
- except Exception as e:
715
- logger.error(f"Error deleting bookmarks: {e}")
716
- return "⚠️ Error deleting bookmarks.", gr.CheckboxGroup.update(choices=[]), gr.CheckboxGroup.update(choices=[]), display_bookmarks()
717
 
718
- def edit_selected_bookmarks_category(selected_indices, new_category, delete_checkbox, edit_checkbox):
719
  """
720
- Edit category of selected bookmarks with enhanced error handling.
721
  """
722
  if not selected_indices:
723
- return "⚠️ No bookmarks selected.", gr.CheckboxGroup.update(choices=[]), gr.CheckboxGroup.update(choices=[]), display_bookmarks()
724
  if not new_category:
725
- return "⚠️ No new category selected.", gr.CheckboxGroup.update(choices=[]), gr.CheckboxGroup.update(choices=[]), display_bookmarks()
726
 
727
- try:
728
- indices = [int(s.split('.')[0])-1 for s in selected_indices]
729
- updated_count = 0
730
-
731
- for idx in indices:
732
- if 0 <= idx < len(bookmarks):
733
- old_category = bookmarks[idx]['category']
734
- bookmarks[idx]['category'] = new_category
735
- logger.info(f"Updated category for '{bookmarks[idx]['title']}' from '{old_category}' to '{new_category}'")
736
- updated_count += 1
737
-
738
- message = f"βœ… Updated category for {updated_count} bookmark{'s' if updated_count != 1 else ''}."
739
- choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
740
- for i, bookmark in enumerate(bookmarks)]
741
-
742
- return message, gr.CheckboxGroup.update(choices=choices), gr.CheckboxGroup.update(choices=choices), display_bookmarks()
743
- except Exception as e:
744
- logger.error(f"Error updating categories: {e}")
745
- return "⚠️ Error updating categories.", gr.CheckboxGroup.update(choices=[]), gr.CheckboxGroup.update(choices=[]), display_bookmarks()
746
 
747
  def export_bookmarks():
748
  """
749
- Export bookmarks to HTML file with enhanced formatting.
750
  """
751
  if not bookmarks:
 
752
  return "⚠️ No bookmarks to export."
753
 
754
  try:
755
- logger.info("Exporting bookmarks")
756
- soup = BeautifulSoup("<!DOCTYPE NETSCAPE-Bookmark-file-1>", 'html.parser')
757
-
758
- # Add metadata
759
- meta = soup.new_tag('META')
760
- meta['HTTP-EQUIV'] = 'Content-Type'
761
- meta['CONTENT'] = 'text/html; charset=UTF-8'
762
- soup.append(meta)
763
-
764
- # Add title
765
- title = soup.new_tag('TITLE')
766
- title.string = 'Bookmarks'
767
- soup.append(title)
768
-
769
- # Add heading
770
- h1 = soup.new_tag('H1')
771
- h1.string = 'Bookmarks'
772
- soup.append(h1)
773
-
774
- # Create main bookmark list
775
  dl = soup.new_tag('DL')
776
- soup.append(dl)
777
-
778
- # Add bookmarks with categories
779
- current_category = None
780
  for bookmark in bookmarks:
781
- category = bookmark.get('category', 'Uncategorized')
782
-
783
- # Create category folder if needed
784
- if category != current_category:
785
- current_category = category
786
- dt_cat = soup.new_tag('DT')
787
- h3_cat = soup.new_tag('H3')
788
- h3_cat.string = category
789
- dt_cat.append(h3_cat)
790
- dl_cat = soup.new_tag('DL')
791
- dt_cat.append(dl_cat)
792
- dl.append(dt_cat)
793
-
794
- # Add bookmark
795
  dt = soup.new_tag('DT')
796
  a = soup.new_tag('A', href=bookmark['url'])
797
- if 'add_date' in bookmark and bookmark['add_date']:
798
- a['ADD_DATE'] = bookmark['add_date']
799
- if 'icon' in bookmark and bookmark['icon']:
800
- a['ICON'] = bookmark['icon']
801
  a.string = bookmark['title']
802
  dt.append(a)
803
- dl_cat.append(dt)
804
-
805
  html_content = str(soup)
806
  b64 = base64.b64encode(html_content.encode()).decode()
807
  href = f'data:text/html;base64,{b64}'
808
-
809
  logger.info("Bookmarks exported successfully")
810
- return f'''
811
- <div style="text-align: center;">
812
- <a href="{href}"
813
- download="bookmarks.html"
814
- style="display: inline-block;
815
- padding: 10px 20px;
816
- background-color: #4CAF50;
817
- color: white;
818
- text-decoration: none;
819
- border-radius: 5px;
820
- margin: 10px;">
821
- 💾 Download Exported Bookmarks
822
- </a>
823
- </div>
824
- '''
825
  except Exception as e:
826
  logger.error(f"Error exporting bookmarks: {e}")
827
  return "⚠️ Error exporting bookmarks."
828
 
829
  def chatbot_response(user_query):
830
  """
831
- Generate chatbot response with enhanced context understanding.
832
  """
833
  if not GROQ_API_KEY:
 
834
  return "⚠️ API key not set. Please set the GROQ_API_KEY environment variable."
835
 
836
  if not bookmarks:
 
837
  return "⚠️ No bookmarks available. Please upload and process your bookmarks first."
838
 
839
- logger.info(f"Processing query: {user_query}")
840
 
841
  try:
842
- # Get relevant bookmarks using FAISS
843
- query_embedding = embedding_model.encode([user_query]).astype('float32')
844
- k = min(5, len(bookmarks)) # Get top 5 or all if less than 5
845
- D, I = faiss_index.search(query_embedding, k)
846
-
847
- relevant_bookmarks = []
848
- for idx in I[0]:
849
- if idx != -1: # Valid index
850
- bookmark_data = bookmarks[idx]
851
- relevant_bookmarks.append({
852
- 'title': bookmark_data['title'],
853
- 'url': bookmark_data['url'],
854
- 'summary': bookmark_data['summary'],
855
- 'category': bookmark_data['category']
856
- })
857
-
858
- # Prepare context for LLM
859
- bookmark_descriptions = []
860
- for i, bm in enumerate(relevant_bookmarks, 1):
861
- desc = f"{i}. Title: {bm['title']}\n URL: {bm['url']}\n Category: {bm['category']}\n Summary: {bm['summary']}"
862
- bookmark_descriptions.append(desc)
863
-
864
- # Precompute the joined descriptions to avoid backslashes in f-string expressions
865
- joined_bookmark_descriptions = '\\n\\n'.join(bookmark_descriptions)
866
 
867
  prompt = f"""
868
- User Query: {user_query}
869
 
870
- Relevant Bookmarks:
871
- {joined_bookmark_descriptions}
872
 
873
- Please provide a helpful response that:
874
- 1. Identifies the most relevant bookmarks for the query
875
- 2. Explains why each bookmark might be useful
876
- 3. Suggests how the user might use these resources
877
 
878
- Format the response in a clear, readable way with appropriate spacing and structure.
879
  """
880
 
881
  response = openai.ChatCompletion.create(
882
  model='llama3-8b-8192',
883
  messages=[
884
- {"role": "system", "content": "You are a helpful assistant that finds and explains relevant bookmarks."},
885
  {"role": "user", "content": prompt}
886
  ],
887
  max_tokens=500,
@@ -889,7 +594,7 @@ def chatbot_response(user_query):
889
  )
890
 
891
  answer = response['choices'][0]['message']['content'].strip()
892
- logger.info("Generated response successfully")
893
  return answer
894
 
895
  except Exception as e:
@@ -899,69 +604,113 @@ def chatbot_response(user_query):
899
 
900
  def build_app():
901
  """
902
- Build and launch the Gradio app with enhanced UI and functionality.
903
  """
904
  try:
905
  logger.info("Building Gradio app")
906
  with gr.Blocks(css="app.css") as demo:
907
- gr.Markdown("# πŸ“š Bookmark Manager")
908
-
909
- with gr.Row():
910
- with gr.Column():
911
- file_input = gr.File(label="Upload Bookmarks HTML File", file_types=["file"])
912
- process_button = gr.Button("Process Bookmarks")
913
- process_message = gr.Markdown("")
914
-
915
- category_dropdown = gr.Dropdown(choices=CATEGORIES, label="New Category")
916
- edit_button = gr.Button("Edit Selected Bookmarks Category")
917
-
918
- delete_button = gr.Button("Delete Selected Bookmarks")
919
- export_button = gr.Button("Export Bookmarks")
920
-
921
- # Define CheckboxGroups and assign to variables
922
- delete_checkbox = gr.CheckboxGroup(label="Select Bookmarks to Delete", choices=[])
923
- edit_checkbox = gr.CheckboxGroup(label="Select Bookmarks to Edit", choices=[])
924
-
925
- with gr.Column():
926
- bookmarks_display = gr.HTML(label="Bookmarks")
927
-
928
- with gr.Row():
929
- chatbot_input = gr.Textbox(label="Ask about your bookmarks", placeholder="Enter your query here...")
930
- chatbot_output = gr.Textbox(label="Chatbot Response", interactive=False)
931
-
932
- # Processing File
933
  process_button.click(
934
- fn=process_uploaded_file,
935
- inputs=[file_input, delete_checkbox, edit_checkbox],
936
- outputs=[process_message, bookmarks_display, delete_checkbox, edit_checkbox]
937
  )
938
-
939
- # Deleting Bookmarks
940
  delete_button.click(
941
- fn=delete_selected_bookmarks,
942
- inputs=[delete_checkbox, edit_checkbox],
943
- outputs=[process_message, delete_checkbox, edit_checkbox, bookmarks_display]
944
  )
945
-
946
- # Editing Categories
947
- edit_button.click(
948
- fn=edit_selected_bookmarks_category,
949
- inputs=[edit_checkbox, category_dropdown, delete_checkbox, edit_checkbox],
950
- outputs=[process_message, delete_checkbox, edit_checkbox, bookmarks_display]
951
  )
952
-
953
- # Exporting Bookmarks
954
  export_button.click(
955
- fn=export_bookmarks,
956
- inputs=None,
957
- outputs=gr.HTML(label="Export")
958
- )
959
-
960
- # Chatbot
961
- chatbot_input.submit(
962
- fn=chatbot_response,
963
- inputs=chatbot_input,
964
- outputs=chatbot_output
965
  )
966
 
967
  logger.info("Launching Gradio app")
@@ -971,4 +720,4 @@ def build_app():
971
  print(f"Error building the app: {e}")
972
 
973
  if __name__ == "__main__":
974
- build_app()
 
12
  import logging
13
  import os
14
  import sys
 
15
 
16
  # Import OpenAI library
17
  import openai
 
74
  openai.api_key = GROQ_API_KEY
75
  openai.api_base = "https://api.groq.com/openai/v1"
76
 
77
+ def extract_main_content(soup):
78
  """
79
+ Extract the main content from a webpage while filtering out boilerplate content.
80
  """
81
  if not soup:
82
  return ""
83
 
84
+ # Remove script and style elements
85
+ for element in soup(['script', 'style', 'header', 'footer', 'nav', 'ads', 'sidebar']):
86
+ element.decompose()
87
+
88
+ # First try to find content in main content areas
89
+ main_content_tags = soup.find_all(['article', 'main', 'div.content', 'div.post', 'div.entry-content'])
90
+ if main_content_tags:
91
+ content = ' '.join([tag.get_text(strip=True, separator=' ') for tag in main_content_tags])
92
+ else:
93
+ # Try to find content in <p> tags
94
+ p_tags = soup.find_all('p')
95
+ if p_tags:
96
+ content = ' '.join([p.get_text(strip=True, separator=' ') for p in p_tags])
97
  else:
98
  # Fallback to body content
99
+ content = soup.body.get_text(strip=True, separator=' ') if soup.body else soup.get_text(strip=True, separator=' ')
100
 
101
+ # Clean up the text
102
+ content = ' '.join(content.split())
103
+ content = re.sub(r'\s+', ' ', content) # Remove multiple spaces
104
+ content = re.sub(r'[\n\r\t]', ' ', content) # Remove newlines and tabs
105
 
106
+ # Limit content length to avoid token limits (adjust as needed)
107
+ return content[:5000]
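A minimal sketch of how the new extract_main_content behaves (illustrative only, not part of this commit; assumes the function above and app.py's bs4/re imports are in scope). One observation: find_all with a list matches literal tag names, so the 'div.content'-style entries never match and such pages fall through to the <p> and body fallbacks.

# Editorial sketch -- not part of this commit.
from bs4 import BeautifulSoup

sample_html = (
    "<html><body>"
    "<nav>Site menu</nav>"
    "<article><p>SmartMarks organizes bookmarks.</p>"
    "<p>It uses an LLM to summarize pages.</p></article>"
    "<footer>Copyright</footer>"
    "</body></html>"
)
sample_soup = BeautifulSoup(sample_html, "html.parser")
print(extract_main_content(sample_soup))
# Expected (roughly): "SmartMarks organizes bookmarks. It uses an LLM to summarize pages."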
108
 
109
  def get_page_metadata(soup):
110
  """
 
119
  if not soup:
120
  return metadata
121
 
122
+ # Get title
123
  title_tag = soup.find('title')
 
 
 
124
  if title_tag and title_tag.string:
125
  metadata['title'] = title_tag.string.strip()
126
 
127
+ # Get meta description (try multiple variants)
128
+ meta_desc = (
129
+ soup.find('meta', attrs={'name': 'description'}) or
130
+ soup.find('meta', attrs={'property': 'og:description'}) or
131
+ soup.find('meta', attrs={'name': 'twitter:description'})
132
+ )
133
+ if meta_desc:
134
+ metadata['description'] = meta_desc.get('content', '').strip()
135
 
136
  # Get meta keywords
137
+ meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
138
+ if meta_keywords:
139
+ metadata['keywords'] = meta_keywords.get('content', '').strip()
140
 
141
+ # Get OG title if main title is empty
142
+ if not metadata['title']:
143
+ og_title = soup.find('meta', attrs={'property': 'og:title'})
144
+ if og_title:
145
+ metadata['title'] = og_title.get('content', '').strip()
146
 
147
+ return metadata
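A small illustration of the metadata fallbacks (illustrative only, not part of this commit; assumes the elided initialization of `metadata` defaults each field to an empty string): the description is taken from name=description, then og:description, then twitter:description, while og:title fills in only when <title> yields nothing.

# Editorial sketch -- not part of this commit.
from bs4 import BeautifulSoup

head_only = (
    '<head>'
    '<meta property="og:title" content="SmartMarks">'
    '<meta property="og:description" content="AI-assisted bookmark manager">'
    '</head>'
)
print(get_page_metadata(BeautifulSoup(head_only, "html.parser")))
# Expected (roughly): {'title': 'SmartMarks', 'description': 'AI-assisted bookmark manager', 'keywords': ''}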
148
 
149
  def generate_summary(bookmark):
150
  """
151
  Generate a comprehensive summary for a bookmark using available content and LLM.
152
  """
153
+ logger.info(f"Generating summary for bookmark: {bookmark.get('url')}")
154
 
155
  try:
156
+ # Get the HTML soup object from the bookmark if it exists
157
  soup = BeautifulSoup(bookmark.get('html_content', ''), 'html.parser')
158
 
159
+ # Step 1: Extract all available information
160
  metadata = get_page_metadata(soup)
161
+ main_content = extract_main_content(soup)
162
 
163
+ # Step 2: Generate summary using LLM with all available content
164
  try:
165
+ # Prepare comprehensive context for LLM
166
+ available_content = []
167
+ if metadata['title']:
168
+ available_content.append(f"Title: {metadata['title']}")
169
+ if metadata['description']:
170
+ available_content.append(f"Description: {metadata['description']}")
171
+ if metadata['keywords']:
172
+ available_content.append(f"Keywords: {metadata['keywords']}")
173
+ if main_content:
174
+ available_content.append(f"Main Content: {main_content}")
175
 
176
+ if not available_content:
177
+ logger.warning("No content available for summary generation")
178
+ bookmark['summary'] = bookmark.get('title', 'No summary available.')
179
  return bookmark
180
 
181
+ prompt = f"""
182
+ Analyze and summarize this webpage based on the following information:
183
+
184
+ {' | '.join(available_content)}
185
+
186
+ Please provide a concise summary (2-3 sentences) focusing on:
187
+ 1. The main purpose or topic of the page
188
+ 2. Key information or features
189
+ 3. Target audience or use case (if apparent)
190
+
191
+ Be factual and objective.
192
+ """
193
+
194
+ response = openai.ChatCompletion.create(
195
+ model='llama3-8b-8192',
196
+ messages=[
197
+ {"role": "system", "content": "You are a helpful assistant that creates concise webpage summaries."},
198
+ {"role": "user", "content": prompt}
199
+ ],
200
+ max_tokens=150,
201
+ temperature=0.5,
202
+ )
203
+
204
+ summary = response['choices'][0]['message']['content'].strip()
205
+ logger.info("Successfully generated LLM summary")
206
+ bookmark['summary'] = summary
207
+ return bookmark
208
+
209
  except Exception as e:
210
+ logger.error(f"Error generating LLM summary: {e}")
211
+ # Fallback mechanisms in order of preference
212
+ if metadata['description']:
213
+ logger.info("Falling back to meta description")
214
+ bookmark['summary'] = metadata['description']
215
+ elif main_content:
216
+ logger.info("Falling back to truncated main content")
217
+ bookmark['summary'] = ' '.join(main_content.split()[:50]) + '...'
218
+ elif metadata['title']:
219
+ logger.info("Falling back to title")
220
+ bookmark['summary'] = metadata['title']
221
+ else:
222
+ bookmark['summary'] = bookmark.get('title', 'No summary available.')
223
+ return bookmark
224
 
225
  except Exception as e:
226
  logger.error(f"Error in generate_summary: {e}")
227
  bookmark['summary'] = bookmark.get('title', 'No summary available.')
228
+ return bookmark
229
+
230
+ def parse_bookmarks(file_content):
231
+ """
232
+ Parse bookmarks from HTML file.
233
+ """
234
+ logger.info("Parsing bookmarks")
235
+ try:
236
+ soup = BeautifulSoup(file_content, 'html.parser')
237
+ extracted_bookmarks = []
238
+ for link in soup.find_all('a'):
239
+ url = link.get('href')
240
+ title = link.text.strip()
241
+ if url and title:
242
+ extracted_bookmarks.append({'url': url, 'title': title})
243
+ logger.info(f"Extracted {len(extracted_bookmarks)} bookmarks")
244
+ return extracted_bookmarks
245
+ except Exception as e:
246
+ logger.error("Error parsing bookmarks: %s", e)
247
+ raise
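For reference, the simplified parse_bookmarks only needs the <a> tags of a Netscape-format export; an href plus non-empty link text is enough (illustrative sketch, not part of this commit):

# Editorial sketch -- not part of this commit.
netscape_sample = """<!DOCTYPE NETSCAPE-Bookmark-file-1>
<DL><p>
    <DT><A HREF="https://huggingface.co" ADD_DATE="1700000000">Hugging Face</A>
    <DT><A HREF="https://groq.com">Groq</A>
</DL><p>"""
print(parse_bookmarks(netscape_sample))
# Expected: [{'url': 'https://huggingface.co', 'title': 'Hugging Face'},
#            {'url': 'https://groq.com', 'title': 'Groq'}]

Note that the ADD_DATE/ICON attributes kept by the previous parser are dropped here, which is also why the simplified export function below writes only the HREF and title.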
248
 
249
  async def fetch_url_info(session, bookmark):
250
  """
251
+ Fetch information about a URL asynchronously.
252
  """
253
  url = bookmark['url']
254
  if url in fetch_cache:
255
  bookmark.update(fetch_cache[url])
256
  return bookmark
257
 
 
258
  try:
259
  logger.info(f"Fetching URL info for: {url}")
260
+ headers = {
261
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
262
+ }
263
+ async with session.get(url, timeout=10, headers=headers) as response:
264
  bookmark['etag'] = response.headers.get('ETag', 'N/A')
265
+ bookmark['status_code'] = response.status
266
+
267
+ if response.status >= 400:
268
  bookmark['dead_link'] = True
269
+ bookmark['description'] = ''
270
  bookmark['html_content'] = ''
271
+ logger.warning(f"Dead link detected: {url} with status {response.status}")
272
+ else:
273
+ bookmark['dead_link'] = False
274
+ content = await response.text()
275
+ bookmark['html_content'] = content # Store full HTML for summary generation
276
+ bookmark['description'] = '' # Will be set by generate_summary function
277
+ logger.info(f"Fetched information for {url}")
278
  except Exception as e:
279
+ bookmark['dead_link'] = True
280
+ bookmark['etag'] = 'N/A'
281
+ bookmark['status_code'] = 'N/A'
282
+ bookmark['description'] = ''
283
+ bookmark['html_content'] = ''
284
+ logger.error(f"Error fetching URL info for {url}: {e}")
285
  finally:
286
  fetch_cache[url] = {
287
  'etag': bookmark.get('etag'),
288
  'status_code': bookmark.get('status_code'),
 
290
  'description': bookmark.get('description'),
291
  'html_content': bookmark.get('html_content', '')
292
  }
 
293
  return bookmark
294
 
295
  async def process_bookmarks_async(bookmarks_list):
296
  """
297
+ Process all bookmarks asynchronously.
298
  """
299
  logger.info("Processing bookmarks asynchronously")
300
  try:
301
+ connector = aiohttp.TCPConnector(limit=5) # Limit concurrent connections
302
+ timeout = aiohttp.ClientTimeout(total=30) # Set timeout
303
+ async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
304
  tasks = []
305
  for bookmark in bookmarks_list:
306
  task = asyncio.ensure_future(fetch_url_info(session, bookmark))
307
  tasks.append(task)
308
+ await asyncio.gather(*tasks)
309
  logger.info("Completed processing bookmarks asynchronously")
310
  except Exception as e:
311
  logger.error(f"Error in asynchronous processing of bookmarks: {e}")
312
  raise
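Driving the async fetch outside Gradio looks like the asyncio.run call in process_uploaded_file further down (illustrative sketch, not part of this commit; the URLs are only examples and this performs live HTTP requests):

# Editorial sketch -- not part of this commit.
import asyncio

sample_bookmarks = [
    {"url": "https://example.com", "title": "Example"},
    {"url": "https://example.com/definitely-missing", "title": "Probably a 404"},
]
asyncio.run(process_bookmarks_async(sample_bookmarks))
for bm in sample_bookmarks:
    print(bm["url"], bm["status_code"], "dead" if bm["dead_link"] else "ok")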
313
 
314
+ def assign_category(bookmark):
315
  """
316
+ Assign a category to a bookmark based on its content.
317
  """
318
+ if bookmark.get('dead_link'):
319
+ bookmark['category'] = 'Dead Link'
320
+ logger.info(f"Assigned category 'Dead Link' to bookmark: {bookmark.get('url')}")
321
+ return bookmark
322
+
323
+ summary = bookmark.get('summary', '').lower()
324
+ assigned_category = 'Uncategorized'
325
+
326
+ # Keywords associated with each category
327
+ category_keywords = {
328
+ "Social Media": ["social media", "networking", "friends", "connect", "posts", "profile"],
329
+ "News and Media": ["news", "journalism", "media", "headlines", "breaking news"],
330
+ "Education and Learning": ["education", "learning", "courses", "tutorial", "university", "academy", "study"],
331
+ "Entertainment": ["entertainment", "movies", "tv shows", "games", "comics", "fun"],
332
+ "Shopping and E-commerce": ["shopping", "e-commerce", "buy", "sell", "marketplace", "deals", "store"],
333
+ "Finance and Banking": ["finance", "banking", "investment", "money", "economy", "stock", "trading"],
334
+ "Technology": ["technology", "tech", "gadgets", "software", "computers", "innovation"],
335
+ "Health and Fitness": ["health", "fitness", "medical", "wellness", "exercise", "diet"],
336
+ "Travel and Tourism": ["travel", "tourism", "destinations", "hotels", "flights", "vacation"],
337
+ "Food and Recipes": ["food", "recipes", "cooking", "cuisine", "restaurant", "dining"],
338
+ "Sports": ["sports", "scores", "teams", "athletics", "matches", "leagues"],
339
+ "Arts and Culture": ["arts", "culture", "museum", "gallery", "exhibition", "artistic"],
340
+ "Government and Politics": ["government", "politics", "policy", "election", "public service"],
341
+ "Business and Economy": ["business", "corporate", "industry", "economy", "markets"],
342
+ "Science and Research": ["science", "research", "experiment", "laboratory", "study", "scientific"],
343
+ "Personal Blogs and Journals": ["blog", "journal", "personal", "diary", "thoughts", "opinions"],
344
+ "Job Search and Careers": ["jobs", "careers", "recruitment", "resume", "employment", "hiring"],
345
+ "Music and Audio": ["music", "audio", "songs", "albums", "artists", "bands"],
346
+ "Videos and Movies": ["video", "movies", "film", "clips", "trailers", "cinema"],
347
+ "Reference and Knowledge Bases": ["reference", "encyclopedia", "dictionary", "wiki", "knowledge", "information"],
348
+ }
349
+
350
+ for category, keywords in category_keywords.items():
351
+ for keyword in keywords:
352
+ if re.search(r'\b' + re.escape(keyword) + r'\b', summary):
353
+ assigned_category = category
354
+ logger.info(f"Assigned category '{assigned_category}' to bookmark: {bookmark.get('url')}")
355
+ break
356
+ if assigned_category != 'Uncategorized':
357
+ break
358
+
359
+ bookmark['category'] = assigned_category
360
+ if assigned_category == 'Uncategorized':
361
+ logger.info(f"No matching category found for bookmark: {bookmark.get('url')}")
362
+ return bookmark
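The new assign_category matches keywords as whole words against the lower-cased summary, so "technology" matches while "biotech" does not trip the "tech" keyword, and the first matching category in the dict wins (illustrative sketch, not part of this commit):

# Editorial sketch -- not part of this commit.
bm = {"url": "https://example.com", "summary": "A blog about technology and gadgets.", "dead_link": False}
assign_category(bm)
print(bm["category"])    # -> Technology (checked before Personal Blogs and Journals)

bm2 = {"url": "https://example.com", "summary": "Biotech funding news roundup.", "dead_link": False}
assign_category(bm2)
print(bm2["category"])   # -> News and Media ("news" matches as a whole word)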
363
 
364
  def vectorize_and_index(bookmarks_list):
365
  """
 
367
  """
368
  logger.info("Vectorizing summaries and building FAISS index")
369
  try:
370
+ summaries = [bookmark['summary'] for bookmark in bookmarks_list]
371
  embeddings = embedding_model.encode(summaries)
 
 
372
  dimension = embeddings.shape[1]
373
  faiss_idx = faiss.IndexFlatL2(dimension)
 
 
374
  faiss_idx.add(np.array(embeddings))
 
375
  logger.info("FAISS index built successfully")
376
  return faiss_idx, embeddings
377
  except Exception as e:
 
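Querying the index returned by vectorize_and_index is a plain FAISS nearest-neighbour search (illustrative sketch, not part of this commit; assumes the bookmarks already carry summaries and that embedding_model is the module-level SentenceTransformer defined earlier in app.py):

# Editorial sketch -- not part of this commit.
faiss_idx, embeddings = vectorize_and_index(bookmarks)
query_vec = embedding_model.encode(["machine learning tutorials"]).astype("float32")
distances, indices = faiss_idx.search(query_vec, min(3, len(bookmarks)))
for rank, idx in enumerate(indices[0], start=1):
    print(rank, bookmarks[idx]["title"], "-", bookmarks[idx]["category"])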
380
 
381
  def display_bookmarks():
382
  """
383
+ Generate HTML display for bookmarks.
384
  """
385
  logger.info("Generating HTML display for bookmarks")
386
  cards = ''
 
392
  etag = bookmark.get('etag', 'N/A')
393
  summary = bookmark.get('summary', '')
394
  category = bookmark.get('category', 'Uncategorized')
 
395
 
 
396
  if bookmark.get('dead_link'):
397
+ card_style = "border: 2px solid var(--error-color);"
398
+ text_style = "color: var(--error-color);"
399
  else:
400
+ card_style = "border: 2px solid var(--success-color);"
401
  text_style = "color: var(--text-color);"
402
 
 
 
 
 
403
  card_html = f'''
404
+ <div class="card" style="{card_style}; padding: 10px; margin: 10px; border-radius: 5px;">
405
  <div class="card-content">
406
+ <h3 style="{text_style}">{index}. {title} {status}</h3>
 
 
 
407
  <p style="{text_style}"><strong>Category:</strong> {category}</p>
408
  <p style="{text_style}"><strong>URL:</strong> <a href="{url}" target="_blank" style="{text_style}">{url}</a></p>
409
  <p style="{text_style}"><strong>ETag:</strong> {etag}</p>
410
+ <p style="{text_style}"><strong>Summary:</strong> {summary}</p>
411
  </div>
412
  </div>
413
  '''
414
  cards += card_html
415
  logger.info("HTML display generated")
416
+ return cards
417
 
418
+ def process_uploaded_file(file):
419
  """
420
+ Process the uploaded bookmarks file.
421
  """
422
  global bookmarks, faiss_index
423
  logger.info("Processing uploaded file")
424
 
425
  if file is None:
426
+ logger.warning("No file uploaded")
427
+ return "Please upload a bookmarks HTML file.", '', gr.update(choices=[]), display_bookmarks()
428
+
429
  try:
430
  file_content = file.decode('utf-8')
431
  except UnicodeDecodeError as e:
432
+ logger.error(f"Error decoding the file: {e}")
433
+ return "Error decoding the file. Please ensure it's a valid HTML file.", '', gr.update(choices=[]), display_bookmarks()
434
+
435
  try:
436
  bookmarks = parse_bookmarks(file_content)
437
  except Exception as e:
438
  logger.error(f"Error parsing bookmarks: {e}")
439
+ return "Error parsing the bookmarks HTML file.", '', gr.update(choices=[]), display_bookmarks()
440
+
441
  if not bookmarks:
442
+ logger.warning("No bookmarks found in the uploaded file")
443
+ return "No bookmarks found in the uploaded file.", '', gr.update(choices=[]), display_bookmarks()
444
+
445
+ # Asynchronously fetch bookmark info
446
  try:
 
447
  asyncio.run(process_bookmarks_async(bookmarks))
448
+ except Exception as e:
449
+ logger.error(f"Error processing bookmarks asynchronously: {e}")
450
+ return "Error processing bookmarks.", '', gr.update(choices=[]), display_bookmarks()
 
 
 
 
451
 
452
+ # Generate summaries and assign categories
453
+ for bookmark in bookmarks:
454
+ generate_summary(bookmark)
455
+ assign_category(bookmark)
 
 
 
 
456
 
457
+ try:
458
+ faiss_index, embeddings = vectorize_and_index(bookmarks)
459
  except Exception as e:
460
+ logger.error(f"Error building FAISS index: {e}")
461
+ return "Error building search index.", '', gr.update(choices=[]), display_bookmarks()
462
+
463
+ message = f"βœ… Successfully processed {len(bookmarks)} bookmarks."
464
+ logger.info(message)
465
+
466
+ # Generate displays and updates
467
+ bookmark_html = display_bookmarks()
468
+ choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
469
+ for i, bookmark in enumerate(bookmarks)]
470
+
471
+ return message, bookmark_html, gr.update(choices=choices), bookmark_html
472
 
473
+ def delete_selected_bookmarks(selected_indices):
474
  """
475
+ Delete selected bookmarks.
476
  """
477
  global bookmarks, faiss_index
 
478
  if not selected_indices:
479
+ return "⚠️ No bookmarks selected.", gr.update(choices=[]), display_bookmarks()
480
 
481
+ indices = [int(s.split('.')[0])-1 for s in selected_indices]
482
+ indices = sorted(indices, reverse=True)
483
+ for idx in indices:
484
+ if 0 <= idx < len(bookmarks):
485
+ logger.info(f"Deleting bookmark at index {idx + 1}")
486
+ bookmarks.pop(idx)
487
 
488
+ if bookmarks:
489
+ faiss_index, embeddings = vectorize_and_index(bookmarks)
490
+ else:
491
+ faiss_index = None
 
 
 
 
492
 
493
+ message = "πŸ—‘οΈ Selected bookmarks deleted successfully."
494
+ logger.info(message)
495
+
496
+ # Update choices and display
497
+ choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
498
+ for i, bookmark in enumerate(bookmarks)]
499
+
500
+ return message, gr.update(choices=choices), display_bookmarks()
501
+
502
+ def edit_selected_bookmarks_category(selected_indices, new_category):
503
  """
504
+ Edit category of selected bookmarks.
505
  """
506
  if not selected_indices:
507
+ return "⚠️ No bookmarks selected.", gr.update(choices=[]), display_bookmarks()
508
  if not new_category:
509
+ return "⚠️ No new category selected.", gr.update(choices=[]), display_bookmarks()
510
 
511
+ indices = [int(s.split('.')[0])-1 for s in selected_indices]
512
+ for idx in indices:
513
+ if 0 <= idx < len(bookmarks):
514
+ bookmarks[idx]['category'] = new_category
515
+ logger.info(f"Updated category for bookmark {idx + 1} to {new_category}")
516
+
517
+ message = "✏️ Category updated for selected bookmarks."
518
+ logger.info(message)
519
+
520
+ # Update choices and display
521
+ choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
522
+ for i, bookmark in enumerate(bookmarks)]
523
+
524
+ return message, gr.update(choices=choices), display_bookmarks()
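Both manage handlers recover the zero-based list index from the leading number of each CheckboxGroup label (illustrative sketch, not part of this commit):

# Editorial sketch -- not part of this commit.
selected = ["3. Some Title (Category: Technology)", "1. Another Page (Category: News and Media)"]
indices = sorted((int(s.split(".")[0]) - 1 for s in selected), reverse=True)
print(indices)  # -> [2, 0]; deletion walks highest-first so earlier positions stay valid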
525
 
526
  def export_bookmarks():
527
  """
528
+ Export bookmarks to HTML file.
529
  """
530
  if not bookmarks:
531
+ logger.warning("No bookmarks to export")
532
  return "⚠️ No bookmarks to export."
533
 
534
  try:
535
+ logger.info("Exporting bookmarks to HTML")
536
+ soup = BeautifulSoup("<!DOCTYPE NETSCAPE-Bookmark-file-1><Title>Bookmarks</Title><H1>Bookmarks</H1>", 'html.parser')
537
  dl = soup.new_tag('DL')
 
 
 
 
538
  for bookmark in bookmarks:
539
  dt = soup.new_tag('DT')
540
  a = soup.new_tag('A', href=bookmark['url'])
 
 
 
 
541
  a.string = bookmark['title']
542
  dt.append(a)
543
+ dl.append(dt)
544
+ soup.append(dl)
545
  html_content = str(soup)
546
  b64 = base64.b64encode(html_content.encode()).decode()
547
  href = f'data:text/html;base64,{b64}'
 
548
  logger.info("Bookmarks exported successfully")
549
+ return f'<a href="{href}" download="bookmarks.html">πŸ’Ύ Download Exported Bookmarks</a>'
550
  except Exception as e:
551
  logger.error(f"Error exporting bookmarks: {e}")
552
  return "⚠️ Error exporting bookmarks."
553
 
554
  def chatbot_response(user_query):
555
  """
556
+ Generate chatbot response using Groq Cloud API.
557
  """
558
  if not GROQ_API_KEY:
559
+ logger.warning("GROQ_API_KEY not set.")
560
  return "⚠️ API key not set. Please set the GROQ_API_KEY environment variable."
561
 
562
  if not bookmarks:
563
+ logger.warning("No bookmarks available for chatbot")
564
  return "⚠️ No bookmarks available. Please upload and process your bookmarks first."
565
 
566
+ logger.info(f"Chatbot received query: {user_query}")
567
 
568
  try:
569
+ max_bookmarks = 50
570
+ bookmark_data = ""
571
+ for idx, bookmark in enumerate(bookmarks[:max_bookmarks]):
572
+ bookmark_data += f"{idx+1}. Title: {bookmark['title']}\nURL: {bookmark['url']}\nSummary: {bookmark['summary']}\n\n"
 
573
 
574
  prompt = f"""
575
+ You are an assistant that helps users find relevant bookmarks from their collection based on their queries.
576
 
577
+ User Query:
578
+ {user_query}
579
 
580
+ Bookmarks:
581
+ {bookmark_data}
 
 
582
 
583
+ Please identify the most relevant bookmarks that match the user's query. Provide a concise list including the index, title, URL, and a brief summary.
584
  """
585
 
586
  response = openai.ChatCompletion.create(
587
  model='llama3-8b-8192',
588
  messages=[
589
+ {"role": "system", "content": "You help users find relevant bookmarks based on their queries."},
590
  {"role": "user", "content": prompt}
591
  ],
592
  max_tokens=500,
 
594
  )
595
 
596
  answer = response['choices'][0]['message']['content'].strip()
597
+ logger.info("Chatbot response generated using Groq Cloud API")
598
  return answer
599
 
600
  except Exception as e:
 
604
 
605
  def build_app():
606
  """
607
+ Build and launch the Gradio app.
608
  """
609
  try:
610
  logger.info("Building Gradio app")
611
  with gr.Blocks(css="app.css") as demo:
612
+ # General Overview
613
+ gr.Markdown("""
614
+ # 📚 SmartMarks - AI Browser Bookmarks Manager
615
+
616
+ Welcome to **SmartMarks**, your intelligent assistant for managing browser bookmarks. SmartMarks leverages AI to help you organize, search, and interact with your bookmarks seamlessly.
617
+
618
+ ---
619
+
620
+ ## 🚀 **How to Use SmartMarks**
621
+
622
+ SmartMarks is divided into three main sections:
623
+
624
+ 1. **📂 Upload and Process Bookmarks:** Import your existing bookmarks and let SmartMarks analyze and categorize them for you.
625
+ 2. **💬 Chat with Bookmarks:** Interact with your bookmarks using natural language queries to find relevant links effortlessly.
626
+ 3. **🛠️ Manage Bookmarks:** View, edit, delete, and export your bookmarks with ease.
627
+ """)
628
+
629
+ # Upload and Process Bookmarks Tab
630
+ with gr.Tab("Upload and Process Bookmarks"):
631
+ gr.Markdown("""
632
+ ## 📂 **Upload and Process Bookmarks**
633
+
634
+ ### 📝 **Steps:**
635
+ 1. Click on the "Upload Bookmarks HTML File" button
636
+ 2. Select your bookmarks file
637
+ 3. Click "Process Bookmarks" to analyze and organize your bookmarks
638
+ """)
639
+
640
+ upload = gr.File(label="πŸ“ Upload Bookmarks HTML File", type='binary')
641
+ process_button = gr.Button("βš™οΈ Process Bookmarks")
642
+ output_text = gr.Textbox(label="βœ… Output", interactive=False)
643
+ bookmark_display = gr.HTML(label="πŸ“„ Processed Bookmarks")
644
+
645
+ # Chat with Bookmarks Tab
646
+ with gr.Tab("Chat with Bookmarks"):
647
+ gr.Markdown("""
648
+ ## 💬 **Chat with Bookmarks**
649
+
650
+ Ask questions about your bookmarks and get relevant results.
651
+ """)
652
+
653
+ user_input = gr.Textbox(
654
+ label="✍️ Ask about your bookmarks",
655
+ placeholder="e.g., Do I have any bookmarks about AI?"
656
+ )
657
+ chat_button = gr.Button("πŸ“¨ Send")
658
+ chat_output = gr.Textbox(label="πŸ’¬ Response", interactive=False)
659
+
660
+ # Manage Bookmarks Tab
661
+ with gr.Tab("Manage Bookmarks"):
662
+ gr.Markdown("""
663
+ ## 🛠️ **Manage Bookmarks**
664
+ Select bookmarks to delete or edit their categories.
665
+ """)
666
+
667
+ manage_output = gr.Textbox(label="πŸ”„ Status", interactive=False)
668
+ bookmark_selector = gr.CheckboxGroup(
669
+ label="βœ… Select Bookmarks",
670
+ choices=[]
671
+ )
672
+ new_category = gr.Dropdown(
673
+ label="πŸ†• New Category",
674
+ choices=CATEGORIES,
675
+ value="Uncategorized"
676
+ )
677
+ bookmark_display_manage = gr.HTML(label="📄 Bookmarks")
678
+
679
+ with gr.Row():
680
+ delete_button = gr.Button("πŸ—‘οΈ Delete Selected")
681
+ edit_category_button = gr.Button("✏️ Edit Category")
682
+ export_button = gr.Button("πŸ’Ύ Export")
683
+
684
+ download_link = gr.HTML(label="πŸ“₯ Download")
685
+
686
+ # Set up event handlers
687
  process_button.click(
688
+ process_uploaded_file,
689
+ inputs=upload,
690
+ outputs=[output_text, bookmark_display, bookmark_selector, bookmark_display_manage]
691
  )
692
+
693
+ chat_button.click(
694
+ chatbot_response,
695
+ inputs=user_input,
696
+ outputs=chat_output
697
+ )
698
+
699
  delete_button.click(
700
+ delete_selected_bookmarks,
701
+ inputs=bookmark_selector,
702
+ outputs=[manage_output, bookmark_selector, bookmark_display_manage]
703
  )
704
+
705
+ edit_category_button.click(
706
+ edit_selected_bookmarks_category,
707
+ inputs=[bookmark_selector, new_category],
708
+ outputs=[manage_output, bookmark_selector, bookmark_display_manage]
 
709
  )
710
+
 
711
  export_button.click(
712
+ export_bookmarks,
713
+ outputs=download_link
714
  )
715
 
716
  logger.info("Launching Gradio app")
 
720
  print(f"Error building the app: {e}")
721
 
722
  if __name__ == "__main__":
723
+ build_app()