siddhartharya committed on
Commit
f42e018
·
verified ·
1 Parent(s): 6cc7878

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +130 -92
app.py CHANGED
@@ -78,22 +78,33 @@ def extract_main_content(soup):
78
  """
79
  Extract the main content from a webpage while filtering out boilerplate content.
80
  """
 
 
 
81
  # Remove script and style elements
82
  for element in soup(['script', 'style', 'header', 'footer', 'nav', 'ads', 'sidebar']):
83
  element.decompose()
84
 
85
- # Get text from specific content tags first
86
- main_content_tags = soup.find_all(['article', 'main', 'div.content', 'div.post'])
87
  if main_content_tags:
88
  content = ' '.join([tag.get_text(strip=True, separator=' ') for tag in main_content_tags])
89
  else:
90
- # Fallback to body content
91
- content = soup.body.get_text(strip=True, separator=' ') if soup.body else soup.get_text(strip=True, separator=' ')
 
 
 
 
 
92
 
93
  # Clean up the text
94
  content = ' '.join(content.split())
95
- # Limit content length to avoid token limits
96
- return content[:3000]
 
 
 
97
 
98
  def get_page_metadata(soup):
99
  """
@@ -105,14 +116,20 @@ def get_page_metadata(soup):
105
  'keywords': ''
106
  }
107
 
 
 
 
108
  # Get title
109
  title_tag = soup.find('title')
110
- if title_tag:
111
  metadata['title'] = title_tag.string.strip()
112
 
113
- # Get meta description
114
- meta_desc = soup.find('meta', attrs={'name': 'description'}) or \
115
- soup.find('meta', attrs={'property': 'og:description'})
 
 
 
116
  if meta_desc:
117
  metadata['description'] = meta_desc.get('content', '').strip()
118
 
@@ -121,6 +138,12 @@ def get_page_metadata(soup):
121
  if meta_keywords:
122
  metadata['keywords'] = meta_keywords.get('content', '').strip()
123
 
 
 
 
 
 
 
124
  return metadata
125
 
126
  def generate_summary(bookmark):
@@ -133,37 +156,39 @@ def generate_summary(bookmark):
133
  # Get the HTML soup object from the bookmark if it exists
134
  soup = BeautifulSoup(bookmark.get('html_content', ''), 'html.parser')
135
 
136
- # Step 1: Try to get description from metadata
137
  metadata = get_page_metadata(soup)
138
- if metadata['description']:
139
- logger.info("Using meta description for summary")
140
- bookmark['summary'] = metadata['description']
141
- return bookmark
142
 
143
- # Step 2: If no description, extract main content
144
- content = extract_main_content(soup)
145
- if not content:
146
- logger.warning("No content extracted from page")
147
- # Fallback to title if available
148
  if metadata['title']:
149
- bookmark['summary'] = f"Page title: {metadata['title']}"
150
- return bookmark
 
 
 
 
 
151
 
152
- bookmark['summary'] = bookmark.get('title', 'No summary available.')
153
- return bookmark
154
-
155
- # Step 3: Generate summary using LLM
156
- try:
157
- # Prepare context for LLM
158
  prompt = f"""
159
- Webpage Title: {metadata['title']}
160
- Keywords: {metadata['keywords']}
 
161
 
162
- Content:
163
- {content}
 
 
164
 
165
- Please provide a concise summary (2-3 sentences) of this webpage's main content.
166
- Focus on what the page is about and its key information. Be factual and objective.
167
  """
168
 
169
  response = openai.ChatCompletion.create(
@@ -183,13 +208,22 @@ def generate_summary(bookmark):
183
 
184
  except Exception as e:
185
  logger.error(f"Error generating LLM summary: {e}")
186
- # Fallback to extracted content
187
- bookmark['summary'] = ' '.join(content.split()[:50]) + '...'
 
 
 
 
 
 
 
 
 
 
188
  return bookmark
189
 
190
  except Exception as e:
191
  logger.error(f"Error in generate_summary: {e}")
192
- # Final fallback
193
  bookmark['summary'] = bookmark.get('title', 'No summary available.')
194
  return bookmark
195
 
@@ -223,18 +257,22 @@ async def fetch_url_info(session, bookmark):
223
 
224
  try:
225
  logger.info(f"Fetching URL info for: {url}")
226
- async with session.get(url, timeout=10) as response:
 
 
 
227
  bookmark['etag'] = response.headers.get('ETag', 'N/A')
228
  bookmark['status_code'] = response.status
229
 
230
  if response.status >= 400:
231
  bookmark['dead_link'] = True
232
  bookmark['description'] = ''
 
233
  logger.warning(f"Dead link detected: {url} with status {response.status}")
234
  else:
235
  bookmark['dead_link'] = False
236
  content = await response.text()
237
- bookmark['html_content'] = content
238
  bookmark['description'] = '' # Will be set by generate_summary function
239
  logger.info(f"Fetched information for {url}")
240
  except Exception as e:
@@ -260,7 +298,9 @@ async def process_bookmarks_async(bookmarks_list):
260
  """
261
  logger.info("Processing bookmarks asynchronously")
262
  try:
263
- async with aiohttp.ClientSession() as session:
 
 
264
  tasks = []
265
  for bookmark in bookmarks_list:
266
  task = asyncio.ensure_future(fetch_url_info(session, bookmark))
@@ -384,30 +424,30 @@ def process_uploaded_file(file):
384
 
385
  if file is None:
386
  logger.warning("No file uploaded")
387
- return "Please upload a bookmarks HTML file.", ''
388
 
389
  try:
390
  file_content = file.decode('utf-8')
391
  except UnicodeDecodeError as e:
392
  logger.error(f"Error decoding the file: {e}")
393
- return "Error decoding the file. Please ensure it's a valid HTML file.", ''
394
 
395
  try:
396
  bookmarks = parse_bookmarks(file_content)
397
  except Exception as e:
398
  logger.error(f"Error parsing bookmarks: {e}")
399
- return "Error parsing the bookmarks HTML file.", ''
400
 
401
  if not bookmarks:
402
  logger.warning("No bookmarks found in the uploaded file")
403
- return "No bookmarks found in the uploaded file.", ''
404
 
405
  # Asynchronously fetch bookmark info
406
  try:
407
  asyncio.run(process_bookmarks_async(bookmarks))
408
  except Exception as e:
409
  logger.error(f"Error processing bookmarks asynchronously: {e}")
410
- return "Error processing bookmarks.", ''
411
 
412
  # Generate summaries and assign categories
413
  for bookmark in bookmarks:
@@ -418,21 +458,17 @@ def process_uploaded_file(file):
418
  faiss_index, embeddings = vectorize_and_index(bookmarks)
419
  except Exception as e:
420
  logger.error(f"Error building FAISS index: {e}")
421
- return "Error building search index.", ''
422
 
423
  message = f"βœ… Successfully processed {len(bookmarks)} bookmarks."
424
  logger.info(message)
 
 
425
  bookmark_html = display_bookmarks()
426
-
427
- return message, bookmark_html
428
-
429
- def update_bookmark_selector():
430
- """
431
- Update the bookmark selector choices for the Manage Bookmarks tab.
432
- """
433
  choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
434
  for i, bookmark in enumerate(bookmarks)]
435
- return gr.update(choices=choices, value=[])
 
436
 
437
  def delete_selected_bookmarks(selected_indices):
438
  """
@@ -440,7 +476,7 @@ def delete_selected_bookmarks(selected_indices):
440
  """
441
  global bookmarks, faiss_index
442
  if not selected_indices:
443
- return "⚠️ No bookmarks selected.", gr.update(choices=[]), ''
444
 
445
  indices = [int(s.split('.')[0])-1 for s in selected_indices]
446
  indices = sorted(indices, reverse=True)
@@ -457,19 +493,20 @@ def delete_selected_bookmarks(selected_indices):
457
  message = "πŸ—‘οΈ Selected bookmarks deleted successfully."
458
  logger.info(message)
459
 
460
- bookmark_selector_update = update_bookmark_selector()
461
- bookmarks_html = display_bookmarks()
 
462
 
463
- return message, bookmark_selector_update, bookmarks_html
464
 
465
  def edit_selected_bookmarks_category(selected_indices, new_category):
466
  """
467
  Edit category of selected bookmarks.
468
  """
469
  if not selected_indices:
470
- return "⚠️ No bookmarks selected.", gr.update(choices=[]), ''
471
  if not new_category:
472
- return "⚠️ No new category selected.", gr.update(choices=[]), ''
473
 
474
  indices = [int(s.split('.')[0])-1 for s in selected_indices]
475
  for idx in indices:
@@ -480,10 +517,11 @@ def edit_selected_bookmarks_category(selected_indices, new_category):
480
  message = "✏️ Category updated for selected bookmarks."
481
  logger.info(message)
482
 
483
- bookmark_selector_update = update_bookmark_selector()
484
- bookmarks_html = display_bookmarks()
 
485
 
486
- return message, bookmark_selector_update, bookmarks_html
487
 
488
  def export_bookmarks():
489
  """
@@ -604,12 +642,6 @@ def build_app():
604
  output_text = gr.Textbox(label="βœ… Output", interactive=False)
605
  bookmark_display = gr.HTML(label="πŸ“„ Processed Bookmarks")
606
 
607
- process_button.click(
608
- process_uploaded_file,
609
- inputs=upload,
610
- outputs=[output_text, bookmark_display]
611
- )
612
-
613
  # Chat with Bookmarks Tab
614
  with gr.Tab("Chat with Bookmarks"):
615
  gr.Markdown("""
@@ -625,17 +657,10 @@ def build_app():
625
  chat_button = gr.Button("πŸ“¨ Send")
626
  chat_output = gr.Textbox(label="πŸ’¬ Response", interactive=False)
627
 
628
- chat_button.click(
629
- chatbot_response,
630
- inputs=user_input,
631
- outputs=chat_output
632
- )
633
-
634
  # Manage Bookmarks Tab
635
  with gr.Tab("Manage Bookmarks"):
636
  gr.Markdown("""
637
  ## πŸ› οΈ **Manage Bookmarks**
638
-
639
  Select bookmarks to delete or edit their categories.
640
  """)
641
 
@@ -649,31 +674,44 @@ def build_app():
649
  choices=CATEGORIES,
650
  value="Uncategorized"
651
  )
 
652
 
653
  with gr.Row():
654
  delete_button = gr.Button("πŸ—‘οΈ Delete Selected")
655
  edit_category_button = gr.Button("✏️ Edit Category")
656
  export_button = gr.Button("πŸ’Ύ Export")
657
-
658
- bookmark_display_manage = gr.HTML(label="πŸ“„ Bookmarks")
659
  download_link = gr.HTML(label="πŸ“₯ Download")
660
 
661
- delete_button.click(
662
- delete_selected_bookmarks,
663
- inputs=bookmark_selector,
664
- outputs=[manage_output, bookmark_selector, bookmark_display_manage]
665
- )
 
666
 
667
- edit_category_button.click(
668
- edit_selected_bookmarks_category,
669
- inputs=[bookmark_selector, new_category],
670
- outputs=[manage_output, bookmark_selector, bookmark_display_manage]
671
- )
672
 
673
- export_button.click(
674
- export_bookmarks,
675
- outputs=download_link
676
- )
 
 
 
 
 
 
 
 
 
 
 
 
677
 
678
  logger.info("Launching Gradio app")
679
  demo.launch(debug=True)
 
78
  """
79
  Extract the main content from a webpage while filtering out boilerplate content.
80
  """
81
+ if not soup:
82
+ return ""
83
+
84
  # Remove script and style elements
85
  for element in soup(['script', 'style', 'header', 'footer', 'nav', 'ads', 'sidebar']):
86
  element.decompose()
87
 
88
+ # First try to find content in main content areas
89
+ main_content_tags = soup.find_all(['article', 'main', 'div.content', 'div.post', 'div.entry-content'])
90
  if main_content_tags:
91
  content = ' '.join([tag.get_text(strip=True, separator=' ') for tag in main_content_tags])
92
  else:
93
+ # Try to find content in <p> tags
94
+ p_tags = soup.find_all('p')
95
+ if p_tags:
96
+ content = ' '.join([p.get_text(strip=True, separator=' ') for p in p_tags])
97
+ else:
98
+ # Fallback to body content
99
+ content = soup.body.get_text(strip=True, separator=' ') if soup.body else soup.get_text(strip=True, separator=' ')
100
 
101
  # Clean up the text
102
  content = ' '.join(content.split())
103
+ content = re.sub(r'\s+', ' ', content) # Remove multiple spaces
104
+ content = re.sub(r'[\n\r\t]', ' ', content) # Remove newlines and tabs
105
+
106
+ # Limit content length to avoid token limits (adjust as needed)
107
+ return content[:5000]
108
 
109
  def get_page_metadata(soup):
110
  """
 
116
  'keywords': ''
117
  }
118
 
119
+ if not soup:
120
+ return metadata
121
+
122
  # Get title
123
  title_tag = soup.find('title')
124
+ if title_tag and title_tag.string:
125
  metadata['title'] = title_tag.string.strip()
126
 
127
+ # Get meta description (try multiple variants)
128
+ meta_desc = (
129
+ soup.find('meta', attrs={'name': 'description'}) or
130
+ soup.find('meta', attrs={'property': 'og:description'}) or
131
+ soup.find('meta', attrs={'name': 'twitter:description'})
132
+ )
133
  if meta_desc:
134
  metadata['description'] = meta_desc.get('content', '').strip()
135
 
 
138
  if meta_keywords:
139
  metadata['keywords'] = meta_keywords.get('content', '').strip()
140
 
141
+ # Get OG title if main title is empty
142
+ if not metadata['title']:
143
+ og_title = soup.find('meta', attrs={'property': 'og:title'})
144
+ if og_title:
145
+ metadata['title'] = og_title.get('content', '').strip()
146
+
147
  return metadata
148
 
149
  def generate_summary(bookmark):
 
156
  # Get the HTML soup object from the bookmark if it exists
157
  soup = BeautifulSoup(bookmark.get('html_content', ''), 'html.parser')
158
 
159
+ # Step 1: Extract all available information
160
  metadata = get_page_metadata(soup)
161
+ main_content = extract_main_content(soup)
 
 
 
162
 
163
+ # Step 2: Generate summary using LLM with all available content
164
+ try:
165
+ # Prepare comprehensive context for LLM
166
+ available_content = []
 
167
  if metadata['title']:
168
+ available_content.append(f"Title: {metadata['title']}")
169
+ if metadata['description']:
170
+ available_content.append(f"Description: {metadata['description']}")
171
+ if metadata['keywords']:
172
+ available_content.append(f"Keywords: {metadata['keywords']}")
173
+ if main_content:
174
+ available_content.append(f"Main Content: {main_content}")
175
 
176
+ if not available_content:
177
+ logger.warning("No content available for summary generation")
178
+ bookmark['summary'] = bookmark.get('title', 'No summary available.')
179
+ return bookmark
180
+
 
181
  prompt = f"""
182
+ Analyze and summarize this webpage based on the following information:
183
+
184
+ {' | '.join(available_content)}
185
 
186
+ Please provide a concise summary (2-3 sentences) focusing on:
187
+ 1. The main purpose or topic of the page
188
+ 2. Key information or features
189
+ 3. Target audience or use case (if apparent)
190
 
191
+ Be factual and objective.
 
192
  """
193
 
194
  response = openai.ChatCompletion.create(
 
208
 
209
  except Exception as e:
210
  logger.error(f"Error generating LLM summary: {e}")
211
+ # Fallback mechanisms in order of preference
212
+ if metadata['description']:
213
+ logger.info("Falling back to meta description")
214
+ bookmark['summary'] = metadata['description']
215
+ elif main_content:
216
+ logger.info("Falling back to truncated main content")
217
+ bookmark['summary'] = ' '.join(main_content.split()[:50]) + '...'
218
+ elif metadata['title']:
219
+ logger.info("Falling back to title")
220
+ bookmark['summary'] = metadata['title']
221
+ else:
222
+ bookmark['summary'] = bookmark.get('title', 'No summary available.')
223
  return bookmark
224
 
225
  except Exception as e:
226
  logger.error(f"Error in generate_summary: {e}")
 
227
  bookmark['summary'] = bookmark.get('title', 'No summary available.')
228
  return bookmark
229
 
 
257
 
258
  try:
259
  logger.info(f"Fetching URL info for: {url}")
260
+ headers = {
261
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
262
+ }
263
+ async with session.get(url, timeout=10, headers=headers) as response:
264
  bookmark['etag'] = response.headers.get('ETag', 'N/A')
265
  bookmark['status_code'] = response.status
266
 
267
  if response.status >= 400:
268
  bookmark['dead_link'] = True
269
  bookmark['description'] = ''
270
+ bookmark['html_content'] = ''
271
  logger.warning(f"Dead link detected: {url} with status {response.status}")
272
  else:
273
  bookmark['dead_link'] = False
274
  content = await response.text()
275
+ bookmark['html_content'] = content # Store full HTML for summary generation
276
  bookmark['description'] = '' # Will be set by generate_summary function
277
  logger.info(f"Fetched information for {url}")
278
  except Exception as e:
 
298
  """
299
  logger.info("Processing bookmarks asynchronously")
300
  try:
301
+ connector = aiohttp.TCPConnector(limit=5) # Limit concurrent connections
302
+ timeout = aiohttp.ClientTimeout(total=30) # Set timeout
303
+ async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
304
  tasks = []
305
  for bookmark in bookmarks_list:
306
  task = asyncio.ensure_future(fetch_url_info(session, bookmark))
 
424
 
425
  if file is None:
426
  logger.warning("No file uploaded")
427
+ return "Please upload a bookmarks HTML file.", '', gr.update(choices=[]), display_bookmarks()
428
 
429
  try:
430
  file_content = file.decode('utf-8')
431
  except UnicodeDecodeError as e:
432
  logger.error(f"Error decoding the file: {e}")
433
+ return "Error decoding the file. Please ensure it's a valid HTML file.", '', gr.update(choices=[]), display_bookmarks()
434
 
435
  try:
436
  bookmarks = parse_bookmarks(file_content)
437
  except Exception as e:
438
  logger.error(f"Error parsing bookmarks: {e}")
439
+ return "Error parsing the bookmarks HTML file.", '', gr.update(choices=[]), display_bookmarks()
440
 
441
  if not bookmarks:
442
  logger.warning("No bookmarks found in the uploaded file")
443
+ return "No bookmarks found in the uploaded file.", '', gr.update(choices=[]), display_bookmarks()
444
 
445
  # Asynchronously fetch bookmark info
446
  try:
447
  asyncio.run(process_bookmarks_async(bookmarks))
448
  except Exception as e:
449
  logger.error(f"Error processing bookmarks asynchronously: {e}")
450
+ return "Error processing bookmarks.", '', gr.update(choices=[]), display_bookmarks()
451
 
452
  # Generate summaries and assign categories
453
  for bookmark in bookmarks:
 
458
  faiss_index, embeddings = vectorize_and_index(bookmarks)
459
  except Exception as e:
460
  logger.error(f"Error building FAISS index: {e}")
461
+ return "Error building search index.", '', gr.update(choices=[]), display_bookmarks()
462
 
463
  message = f"βœ… Successfully processed {len(bookmarks)} bookmarks."
464
  logger.info(message)
465
+
466
+ # Generate displays and updates
467
  bookmark_html = display_bookmarks()
 
 
 
 
 
 
 
468
  choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
469
  for i, bookmark in enumerate(bookmarks)]
470
+
471
+ return message, bookmark_html, gr.update(choices=choices), bookmark_html
472
 
473
  def delete_selected_bookmarks(selected_indices):
474
  """
 
476
  """
477
  global bookmarks, faiss_index
478
  if not selected_indices:
479
+ return "⚠️ No bookmarks selected.", gr.update(choices=[]), display_bookmarks()
480
 
481
  indices = [int(s.split('.')[0])-1 for s in selected_indices]
482
  indices = sorted(indices, reverse=True)
 
493
  message = "πŸ—‘οΈ Selected bookmarks deleted successfully."
494
  logger.info(message)
495
 
496
+ # Update choices and display
497
+ choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
498
+ for i, bookmark in enumerate(bookmarks)]
499
 
500
+ return message, gr.update(choices=choices), display_bookmarks()
501
 
502
  def edit_selected_bookmarks_category(selected_indices, new_category):
503
  """
504
  Edit category of selected bookmarks.
505
  """
506
  if not selected_indices:
507
+ return "⚠️ No bookmarks selected.", gr.update(choices=[]), display_bookmarks()
508
  if not new_category:
509
+ return "⚠️ No new category selected.", gr.update(choices=[]), display_bookmarks()
510
 
511
  indices = [int(s.split('.')[0])-1 for s in selected_indices]
512
  for idx in indices:
 
517
  message = "✏️ Category updated for selected bookmarks."
518
  logger.info(message)
519
 
520
+ # Update choices and display
521
+ choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
522
+ for i, bookmark in enumerate(bookmarks)]
523
 
524
+ return message, gr.update(choices=choices), display_bookmarks()
525
 
526
  def export_bookmarks():
527
  """
 
642
  output_text = gr.Textbox(label="βœ… Output", interactive=False)
643
  bookmark_display = gr.HTML(label="πŸ“„ Processed Bookmarks")
644
 
 
 
 
 
 
 
645
  # Chat with Bookmarks Tab
646
  with gr.Tab("Chat with Bookmarks"):
647
  gr.Markdown("""
 
657
  chat_button = gr.Button("πŸ“¨ Send")
658
  chat_output = gr.Textbox(label="πŸ’¬ Response", interactive=False)
659
 
 
 
 
 
 
 
660
  # Manage Bookmarks Tab
661
  with gr.Tab("Manage Bookmarks"):
662
  gr.Markdown("""
663
  ## πŸ› οΈ **Manage Bookmarks**
 
664
  Select bookmarks to delete or edit their categories.
665
  """)
666
 
 
674
  choices=CATEGORIES,
675
  value="Uncategorized"
676
  )
677
+ bookmark_display_manage = gr.HTML(label="πŸ“„ Bookmarks")
678
 
679
  with gr.Row():
680
  delete_button = gr.Button("πŸ—‘οΈ Delete Selected")
681
  edit_category_button = gr.Button("✏️ Edit Category")
682
  export_button = gr.Button("πŸ’Ύ Export")
683
+
 
684
  download_link = gr.HTML(label="πŸ“₯ Download")
685
 
686
+ # Set up event handlers
687
+ process_button.click(
688
+ process_uploaded_file,
689
+ inputs=upload,
690
+ outputs=[output_text, bookmark_display, bookmark_selector, bookmark_display_manage]
691
+ )
692
 
693
+ chat_button.click(
694
+ chatbot_response,
695
+ inputs=user_input,
696
+ outputs=chat_output
697
+ )
698
 
699
+ delete_button.click(
700
+ delete_selected_bookmarks,
701
+ inputs=bookmark_selector,
702
+ outputs=[manage_output, bookmark_selector, bookmark_display_manage]
703
+ )
704
+
705
+ edit_category_button.click(
706
+ edit_selected_bookmarks_category,
707
+ inputs=[bookmark_selector, new_category],
708
+ outputs=[manage_output, bookmark_selector, bookmark_display_manage]
709
+ )
710
+
711
+ export_button.click(
712
+ export_bookmarks,
713
+ outputs=download_link
714
+ )
715
 
716
  logger.info("Launching Gradio app")
717
  demo.launch(debug=True)