siddhartharya committed on
Commit
b8183dd
1 Parent(s): 3f6cb23

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +106 -39
app.py CHANGED
@@ -171,27 +171,39 @@ def generate_summary(bookmark):
171
  if main_content:
172
  available_content.append(f"Main Content: {main_content}")
173
 
174
- if not available_content:
175
- logger.warning("No content available for summary generation")
176
- bookmark['summary'] = bookmark.get('title', 'No summary available.')
177
- return bookmark
178
-
179
- # Estimate token count and trim content if necessary
180
- max_total_tokens = 8000 # Adjust based on model's maximum context length
181
- prompt_tokens_estimate = len(' '.join(available_content).split()) + 200 # 200 tokens reserved for response
182
- if prompt_tokens_estimate > max_total_tokens:
183
- # Trim main content
184
- allowable_content_tokens = max_total_tokens - 200 # Reserve 200 tokens for response
185
- main_content_tokens = len(main_content.split())
186
- if main_content_tokens > allowable_content_tokens:
187
- main_content = ' '.join(main_content.split()[:allowable_content_tokens])
188
- logger.info("Trimmed main content to fit within token limits.")
189
-
190
- # Update available content
191
- available_content[-1] = f"Main Content: {main_content}"
192
-
193
- # Construct the prompt
194
- prompt = f"""
 
 
 
 
 
 
 
 
 
 
 
 
195
  Analyze and summarize the following webpage content:
196
 
197
  {' '.join(available_content)}
@@ -221,7 +233,7 @@ Be factual and objective.
221
  return bookmark
222
 
223
  except Exception as e:
224
- logger.error(f"Error generating summary: {e}")
225
  # Fallback mechanisms
226
  if metadata['description']:
227
  logger.info("Falling back to meta description")
@@ -233,7 +245,37 @@ Be factual and objective.
233
  logger.info("Falling back to title")
234
  bookmark['summary'] = metadata['title']
235
  else:
236
- bookmark['summary'] = 'No summary available.'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  return bookmark
238
 
239
  def parse_bookmarks(file_content):
@@ -252,7 +294,7 @@ def parse_bookmarks(file_content):
252
  logger.info(f"Extracted {len(extracted_bookmarks)} bookmarks")
253
  return extracted_bookmarks
254
  except Exception as e:
255
- logger.error("Error parsing bookmarks: %s", e)
256
  raise
257
 
258
  async def fetch_url_info(session, bookmark):
@@ -267,13 +309,38 @@ async def fetch_url_info(session, bookmark):
267
  try:
268
  logger.info(f"Fetching URL info for: {url}")
269
  headers = {
270
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
 
 
 
271
  }
272
- async with session.get(url, timeout=10, headers=headers) as response:
273
  bookmark['etag'] = response.headers.get('ETag', 'N/A')
274
  bookmark['status_code'] = response.status
275
 
276
- if response.status >= 400:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
  bookmark['dead_link'] = True
278
  bookmark['description'] = ''
279
  bookmark['html_content'] = ''
@@ -282,7 +349,7 @@ async def fetch_url_info(session, bookmark):
282
  bookmark['dead_link'] = False
283
  content = await response.text()
284
  bookmark['html_content'] = content # Store full HTML for summary generation
285
- bookmark['description'] = '' # Will be set by generate_summary function
286
  logger.info(f"Fetched information for {url}")
287
  except Exception as e:
288
  bookmark['dead_link'] = True
@@ -290,7 +357,7 @@ async def fetch_url_info(session, bookmark):
290
  bookmark['status_code'] = 'N/A'
291
  bookmark['description'] = ''
292
  bookmark['html_content'] = ''
293
- logger.error(f"Error fetching URL info for {url}: {e}")
294
  finally:
295
  fetch_cache[url] = {
296
  'etag': bookmark.get('etag'),
@@ -317,7 +384,7 @@ async def process_bookmarks_async(bookmarks_list):
317
  await asyncio.gather(*tasks)
318
  logger.info("Completed processing bookmarks asynchronously")
319
  except Exception as e:
320
- logger.error(f"Error in asynchronous processing of bookmarks: {e}")
321
  raise
322
 
323
  def assign_category(bookmark):
@@ -372,7 +439,7 @@ Respond with only the category name.
372
  return bookmark
373
 
374
  except Exception as e:
375
- logger.error(f"Error assigning category: {e}")
376
  bookmark['category'] = 'Uncategorized'
377
  return bookmark
378
 
@@ -392,7 +459,7 @@ def vectorize_and_index(bookmarks_list):
392
  logger.info("FAISS index built successfully with IDs")
393
  return index
394
  except Exception as e:
395
- logger.error(f"Error in vectorizing and indexing: {e}")
396
  raise
397
 
398
  def display_bookmarks():
@@ -453,13 +520,13 @@ def process_uploaded_file(file):
453
  try:
454
  file_content = file.decode('utf-8')
455
  except UnicodeDecodeError as e:
456
- logger.error(f"Error decoding the file: {e}")
457
  return "Error decoding the file. Please ensure it's a valid HTML file.", '', gr.update(choices=[]), display_bookmarks()
458
 
459
  try:
460
  bookmarks = parse_bookmarks(file_content)
461
  except Exception as e:
462
- logger.error(f"Error parsing bookmarks: {e}")
463
  return "Error parsing the bookmarks HTML file.", '', gr.update(choices=[]), display_bookmarks()
464
 
465
  if not bookmarks:
@@ -474,7 +541,7 @@ def process_uploaded_file(file):
474
  try:
475
  asyncio.run(process_bookmarks_async(bookmarks))
476
  except Exception as e:
477
- logger.error(f"Error processing bookmarks asynchronously: {e}")
478
  return "Error processing bookmarks.", '', gr.update(choices=[]), display_bookmarks()
479
 
480
  # Generate summaries and assign categories
@@ -485,7 +552,7 @@ def process_uploaded_file(file):
485
  try:
486
  faiss_index = vectorize_and_index(bookmarks)
487
  except Exception as e:
488
- logger.error(f"Error building FAISS index: {e}")
489
  return "Error building search index.", '', gr.update(choices=[]), display_bookmarks()
490
 
491
  message = f"✅ Successfully processed {len(bookmarks)} bookmarks."
@@ -580,7 +647,7 @@ def export_bookmarks():
580
  logger.info("Bookmarks exported successfully")
581
  return f'<a href="{href}" download="bookmarks.html">💾 Download Exported Bookmarks</a>'
582
  except Exception as e:
583
- logger.error(f"Error exporting bookmarks: {e}")
584
  return "⚠️ Error exporting bookmarks."
585
 
586
  def chatbot_response(user_query):
@@ -643,7 +710,7 @@ Provide a concise and helpful response.
643
 
644
  except Exception as e:
645
  error_message = f"⚠️ Error processing your query: {str(e)}"
646
- logger.error(error_message)
647
  return error_message
648
 
649
  def build_app():
@@ -760,7 +827,7 @@ def build_app():
760
  logger.info("Launching Gradio app")
761
  demo.launch(debug=True)
762
  except Exception as e:
763
- logger.error(f"Error building the app: {e}")
764
  print(f"Error building the app: {e}")
765
 
766
  if __name__ == "__main__":
 
171
  if main_content:
172
  available_content.append(f"Main Content: {main_content}")
173
 
174
+ # If content is insufficient, instruct the LLM to use prior knowledge
175
+ if not available_content or len(' '.join(available_content).split()) < 50:
176
+ prompt = f"""
177
+ You are a knowledgeable assistant.
178
+
179
+ The user provided a URL: {bookmark.get('url')}
180
+
181
+ Please provide a concise summary (2-3 sentences) about this website based on your knowledge.
182
+
183
+ Focus on:
184
+ - The main purpose or topic of the website.
185
+ - Key information or features.
186
+ - Target audience or use case (if apparent).
187
+
188
+ Be factual and objective.
189
+ """
190
+ else:
191
+ # Estimate token count and trim content if necessary
192
+ max_total_tokens = 8000 # Adjust based on model's maximum context length
193
+ prompt_tokens_estimate = len(' '.join(available_content).split()) + 200 # 200 tokens reserved for response
194
+ if prompt_tokens_estimate > max_total_tokens:
195
+ # Trim main content
196
+ allowable_content_tokens = max_total_tokens - 200 # Reserve 200 tokens for response
197
+ main_content_tokens = len(main_content.split())
198
+ if main_content_tokens > allowable_content_tokens:
199
+ main_content = ' '.join(main_content.split()[:allowable_content_tokens])
200
+ logger.info("Trimmed main content to fit within token limits.")
201
+
202
+ # Update available content
203
+ available_content[-1] = f"Main Content: {main_content}"
204
+
205
+ # Construct the prompt
206
+ prompt = f"""
207
  Analyze and summarize the following webpage content:
208
 
209
  {' '.join(available_content)}
 
233
  return bookmark
234
 
235
  except Exception as e:
236
+ logger.error(f"Error generating summary: {e}", exc_info=True)
237
  # Fallback mechanisms
238
  if metadata['description']:
239
  logger.info("Falling back to meta description")
 
245
  logger.info("Falling back to title")
246
  bookmark['summary'] = metadata['title']
247
  else:
248
+ # If all else fails, prompt the LLM to use prior knowledge
249
+ prompt = f"""
250
+ You are a knowledgeable assistant.
251
+
252
+ The user provided a URL: {bookmark.get('url')}
253
+
254
+ Please provide a concise summary (2-3 sentences) about this website based on your knowledge.
255
+
256
+ Focus on:
257
+ - The main purpose or topic of the website.
258
+ - Key information or features.
259
+ - Target audience or use case (if apparent).
260
+
261
+ Be factual and objective.
262
+ """
263
+ try:
264
+ response = openai.ChatCompletion.create(
265
+ model='llama3-8b-8192',
266
+ messages=[
267
+ {"role": "system", "content": "You are a helpful assistant that creates concise webpage summaries."},
268
+ {"role": "user", "content": prompt}
269
+ ],
270
+ max_tokens=200,
271
+ temperature=0.5,
272
+ )
273
+ summary = response['choices'][0]['message']['content'].strip()
274
+ logger.info("Successfully generated LLM summary using prior knowledge")
275
+ bookmark['summary'] = summary
276
+ except Exception as e:
277
+ logger.error(f"Error generating summary using prior knowledge: {e}", exc_info=True)
278
+ bookmark['summary'] = 'No summary available.'
279
  return bookmark
280
 
281
  def parse_bookmarks(file_content):
 
294
  logger.info(f"Extracted {len(extracted_bookmarks)} bookmarks")
295
  return extracted_bookmarks
296
  except Exception as e:
297
+ logger.error("Error parsing bookmarks: %s", e, exc_info=True)
298
  raise
299
 
300
  async def fetch_url_info(session, bookmark):
 
309
  try:
310
  logger.info(f"Fetching URL info for: {url}")
311
  headers = {
312
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
313
+ 'AppleWebKit/537.36 (KHTML, like Gecko) '
314
+ 'Chrome/91.0.4472.124 Safari/537.36',
315
+ 'Accept-Language': 'en-US,en;q=0.9',
316
  }
317
+ async with session.get(url, timeout=10, headers=headers, allow_redirects=True) as response:
318
  bookmark['etag'] = response.headers.get('ETag', 'N/A')
319
  bookmark['status_code'] = response.status
320
 
321
+ if response.status >= 500:
322
+ # Server error, consider as dead link
323
+ bookmark['dead_link'] = True
324
+ bookmark['description'] = ''
325
+ bookmark['html_content'] = ''
326
+ logger.warning(f"Dead link detected: {url} with status {response.status}")
327
+ elif response.status == 403:
328
+ # Forbidden, but may be accessible with proper headers
329
+ logger.info(f"Received 403 for {url}, retrying with different headers")
330
+ # Try with different headers or methods if necessary
331
+ # For now, we'll proceed to read the content
332
+ content = await response.text()
333
+ bookmark['dead_link'] = False
334
+ bookmark['html_content'] = content
335
+ bookmark['description'] = ''
336
+ elif response.status == 400:
337
+ # Bad request, may be due to missing parameters
338
+ bookmark['dead_link'] = False
339
+ content = await response.text()
340
+ bookmark['html_content'] = content
341
+ bookmark['description'] = ''
342
+ elif response.status >= 400:
343
+ # Other client errors
344
  bookmark['dead_link'] = True
345
  bookmark['description'] = ''
346
  bookmark['html_content'] = ''
 
349
  bookmark['dead_link'] = False
350
  content = await response.text()
351
  bookmark['html_content'] = content # Store full HTML for summary generation
352
+ bookmark['description'] = ''
353
  logger.info(f"Fetched information for {url}")
354
  except Exception as e:
355
  bookmark['dead_link'] = True
 
357
  bookmark['status_code'] = 'N/A'
358
  bookmark['description'] = ''
359
  bookmark['html_content'] = ''
360
+ logger.error(f"Error fetching URL info for {url}: {e}", exc_info=True)
361
  finally:
362
  fetch_cache[url] = {
363
  'etag': bookmark.get('etag'),
 
384
  await asyncio.gather(*tasks)
385
  logger.info("Completed processing bookmarks asynchronously")
386
  except Exception as e:
387
+ logger.error(f"Error in asynchronous processing of bookmarks: {e}", exc_info=True)
388
  raise
389
 
390
  def assign_category(bookmark):
 
439
  return bookmark
440
 
441
  except Exception as e:
442
+ logger.error(f"Error assigning category: {e}", exc_info=True)
443
  bookmark['category'] = 'Uncategorized'
444
  return bookmark
445
 
 
459
  logger.info("FAISS index built successfully with IDs")
460
  return index
461
  except Exception as e:
462
+ logger.error(f"Error in vectorizing and indexing: {e}", exc_info=True)
463
  raise
464
 
465
  def display_bookmarks():
 
520
  try:
521
  file_content = file.decode('utf-8')
522
  except UnicodeDecodeError as e:
523
+ logger.error(f"Error decoding the file: {e}", exc_info=True)
524
  return "Error decoding the file. Please ensure it's a valid HTML file.", '', gr.update(choices=[]), display_bookmarks()
525
 
526
  try:
527
  bookmarks = parse_bookmarks(file_content)
528
  except Exception as e:
529
+ logger.error(f"Error parsing bookmarks: {e}", exc_info=True)
530
  return "Error parsing the bookmarks HTML file.", '', gr.update(choices=[]), display_bookmarks()
531
 
532
  if not bookmarks:
 
541
  try:
542
  asyncio.run(process_bookmarks_async(bookmarks))
543
  except Exception as e:
544
+ logger.error(f"Error processing bookmarks asynchronously: {e}", exc_info=True)
545
  return "Error processing bookmarks.", '', gr.update(choices=[]), display_bookmarks()
546
 
547
  # Generate summaries and assign categories
 
552
  try:
553
  faiss_index = vectorize_and_index(bookmarks)
554
  except Exception as e:
555
+ logger.error(f"Error building FAISS index: {e}", exc_info=True)
556
  return "Error building search index.", '', gr.update(choices=[]), display_bookmarks()
557
 
558
  message = f"✅ Successfully processed {len(bookmarks)} bookmarks."
 
647
  logger.info("Bookmarks exported successfully")
648
  return f'<a href="{href}" download="bookmarks.html">💾 Download Exported Bookmarks</a>'
649
  except Exception as e:
650
+ logger.error(f"Error exporting bookmarks: {e}", exc_info=True)
651
  return "⚠️ Error exporting bookmarks."
652
 
653
  def chatbot_response(user_query):
 
710
 
711
  except Exception as e:
712
  error_message = f"⚠️ Error processing your query: {str(e)}"
713
+ logger.error(error_message, exc_info=True)
714
  return error_message
715
 
716
  def build_app():
 
827
  logger.info("Launching Gradio app")
828
  demo.launch(debug=True)
829
  except Exception as e:
830
+ logger.error(f"Error building the app: {e}", exc_info=True)
831
  print(f"Error building the app: {e}")
832
 
833
  if __name__ == "__main__":