siddhartharya committed on
Commit
fb6f5e6
1 Parent(s): 85352fd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -66
app.py CHANGED
@@ -81,29 +81,26 @@ def extract_main_content(soup):
81
  if not soup:
82
  return ""
83
 
84
- # Remove script and style elements
85
- for element in soup(['script', 'style', 'header', 'footer', 'nav', 'ads', 'sidebar']):
86
  element.decompose()
87
 
88
- # First try to find content in main content areas
89
- main_content_tags = soup.find_all(['article', 'main', 'div.content', 'div.post', 'div.entry-content'])
90
- if main_content_tags:
91
- content = ' '.join([tag.get_text(strip=True, separator=' ') for tag in main_content_tags])
92
  else:
93
- # Try to find content in <p> tags
94
- p_tags = soup.find_all('p')
95
- if p_tags:
96
- content = ' '.join([p.get_text(strip=True, separator=' ') for p in p_tags])
97
- else:
98
- # Fallback to body content
99
- content = soup.body.get_text(strip=True, separator=' ') if soup.body else soup.get_text(strip=True, separator=' ')
100
 
101
  # Clean up the text
102
- content = ' '.join(content.split())
103
  content = re.sub(r'\s+', ' ', content) # Remove multiple spaces
104
- content = re.sub(r'[\n\r\t]', ' ', content) # Remove newlines and tabs
105
 
106
- # Return the content
 
 
 
 
107
  return content
108
 
109
  def get_page_metadata(soup):
@@ -124,7 +121,7 @@ def get_page_metadata(soup):
124
  if title_tag and title_tag.string:
125
  metadata['title'] = title_tag.string.strip()
126
 
127
- # Get meta description (try multiple variants)
128
  meta_desc = (
129
  soup.find('meta', attrs={'name': 'description'}) or
130
  soup.find('meta', attrs={'property': 'og:description'}) or
@@ -148,7 +145,7 @@ def get_page_metadata(soup):
148
 
149
  def generate_summary(bookmark):
150
  """
151
- Generate a comprehensive summary for a bookmark using available content and LLM via the Groq Cloud API.
152
  """
153
  logger.info(f"Generating summary for bookmark: {bookmark.get('url')}")
154
 
@@ -163,39 +160,64 @@ def generate_summary(bookmark):
163
  main_content = extract_main_content(soup)
164
 
165
  # Prepare content for the prompt
166
- available_content = []
167
  if metadata['title']:
168
- available_content.append(f"Title: {metadata['title']}")
169
  if metadata['description']:
170
- available_content.append(f"Description: {metadata['description']}")
171
  if metadata['keywords']:
172
- available_content.append(f"Keywords: {metadata['keywords']}")
173
  if main_content:
174
- available_content.append(f"Main Content: {main_content}")
175
 
176
- content_text = ' '.join(available_content)
177
 
178
- # Construct the prompt
179
- prompt = f"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  You are a helpful assistant that creates concise webpage summaries.
181
 
182
  Analyze the following webpage content:
183
 
184
  {content_text}
185
 
 
 
186
  Provide a concise summary (2-3 sentences) focusing on:
187
  - The main purpose or topic of the page.
188
  - Key information or features.
189
  - Target audience or use case (if apparent).
190
 
191
- If the content is insufficient, use your prior knowledge about the website.
192
-
193
  Be factual and objective.
194
  """
195
 
196
  # Call the LLM via Groq Cloud API
197
  response = openai.ChatCompletion.create(
198
- model='llama3-8b-8192', # Reverted back to the previous model
199
  messages=[
200
  {"role": "user", "content": prompt}
201
  ],
@@ -212,40 +234,7 @@ Be factual and objective.
212
 
213
  except Exception as e:
214
  logger.error(f"Error generating summary: {e}", exc_info=True)
215
- # Fallback to prior knowledge
216
- try:
217
- prompt = f"""
218
- You are a knowledgeable assistant.
219
-
220
- The user provided a URL: {bookmark.get('url')}
221
-
222
- Provide a concise summary (2-3 sentences) about this website based on your knowledge.
223
-
224
- Focus on:
225
- - The main purpose or topic of the website.
226
- - Key information or features.
227
- - Target audience or use case (if apparent).
228
-
229
- Be factual and objective.
230
- """
231
-
232
- response = openai.ChatCompletion.create(
233
- model='llama3-8b-8192', # Reverted back to the previous model
234
- messages=[
235
- {"role": "user", "content": prompt}
236
- ],
237
- max_tokens=200,
238
- temperature=0.5,
239
- )
240
-
241
- summary = response['choices'][0]['message']['content'].strip()
242
- if not summary:
243
- raise ValueError("Empty summary received from the model.")
244
- logger.info("Successfully generated LLM summary using prior knowledge")
245
- bookmark['summary'] = summary
246
- except Exception as inner_e:
247
- logger.error(f"Error generating summary using prior knowledge: {inner_e}", exc_info=True)
248
- bookmark['summary'] = 'No summary available.'
249
  return bookmark
250
 
251
  def parse_bookmarks(file_content):
@@ -284,12 +273,14 @@ async def fetch_url_info(session, bookmark):
284
  'Chrome/91.0.4472.124 Safari/537.36',
285
  'Accept-Language': 'en-US,en;q=0.9',
286
  }
287
- async with session.get(url, timeout=20, headers=headers, ssl=False) as response:
288
  bookmark['etag'] = response.headers.get('ETag', 'N/A')
289
  bookmark['status_code'] = response.status
290
 
291
  content = await response.text()
 
292
 
 
293
  if response.status >= 500:
294
  # Server error, consider as dead link
295
  bookmark['dead_link'] = True
@@ -355,6 +346,8 @@ def assign_category(bookmark):
355
  # Prepare the prompt
356
  categories_str = ', '.join([f'"{cat}"' for cat in CATEGORIES if cat != 'Dead Link'])
357
  prompt = f"""
 
 
358
  Based on the following summary, assign the most appropriate category from the list below.
359
 
360
  Summary:
@@ -368,7 +361,7 @@ Respond with only the category name.
368
 
369
  try:
370
  response = openai.ChatCompletion.create(
371
- model='llama3-8b-8192', # Reverted back to the previous model
372
  messages=[
373
  {"role": "user", "content": prompt}
374
  ],
@@ -645,7 +638,7 @@ Provide a concise and helpful response.
645
  """
646
 
647
  response = openai.ChatCompletion.create(
648
- model='llama3-8b-8192', # Reverted back to the previous model
649
  messages=[
650
  {"role": "user", "content": prompt}
651
  ],
 
81
  if not soup:
82
  return ""
83
 
84
+ # Remove unwanted elements
85
+ for element in soup(['script', 'style', 'header', 'footer', 'nav', 'aside', 'form', 'noscript']):
86
  element.decompose()
87
 
88
+ # Extract text from <p> tags
89
+ p_tags = soup.find_all('p')
90
+ if p_tags:
91
+ content = ' '.join([p.get_text(strip=True, separator=' ') for p in p_tags])
92
  else:
93
+ # Fallback to body content
94
+ content = soup.get_text(separator=' ', strip=True)
 
 
 
 
 
95
 
96
  # Clean up the text
 
97
  content = re.sub(r'\s+', ' ', content) # Remove multiple spaces
 
98
 
99
+ # Truncate content to a reasonable length (e.g., 1500 words)
100
+ words = content.split()
101
+ if len(words) > 1500:
102
+ content = ' '.join(words[:1500])
103
+
104
  return content
105
 
106
  def get_page_metadata(soup):
 
121
  if title_tag and title_tag.string:
122
  metadata['title'] = title_tag.string.strip()
123
 
124
+ # Get meta description
125
  meta_desc = (
126
  soup.find('meta', attrs={'name': 'description'}) or
127
  soup.find('meta', attrs={'property': 'og:description'}) or
 
145
 
146
  def generate_summary(bookmark):
147
  """
148
+ Generate a concise summary for a bookmark using available content and LLM via the Groq Cloud API.
149
  """
150
  logger.info(f"Generating summary for bookmark: {bookmark.get('url')}")
151
 
 
160
  main_content = extract_main_content(soup)
161
 
162
  # Prepare content for the prompt
163
+ content_parts = []
164
  if metadata['title']:
165
+ content_parts.append(f"Title: {metadata['title']}")
166
  if metadata['description']:
167
+ content_parts.append(f"Description: {metadata['description']}")
168
  if metadata['keywords']:
169
+ content_parts.append(f"Keywords: {metadata['keywords']}")
170
  if main_content:
171
+ content_parts.append(f"Main Content: {main_content}")
172
 
173
+ content_text = '\n'.join(content_parts)
174
 
175
+ # Detect insufficient or erroneous content
176
+ error_keywords = ['Access Denied', 'Error', 'Security Check', 'Cloudflare', 'captcha', 'unusual traffic', 'Page Not Found', '404 Not Found', 'Forbidden']
177
+ if not content_text or len(content_text.split()) < 50 or any(keyword.lower() in content_text.lower() for keyword in error_keywords):
178
+ use_prior_knowledge = True
179
+ logger.info(f"Content for {bookmark.get('url')} is insufficient or contains error messages. Instructing LLM to use prior knowledge.")
180
+ else:
181
+ use_prior_knowledge = False
182
+
183
+ if use_prior_knowledge:
184
+ # Construct prompt to use prior knowledge
185
+ prompt = f"""
186
+ You are a knowledgeable assistant.
187
+
188
+ The user provided a URL: {bookmark.get('url')}
189
+
190
+ Please provide a concise summary (2-3 sentences) about this website based on your knowledge.
191
+
192
+ Focus on:
193
+ - The main purpose or topic of the website.
194
+ - Key information or features.
195
+ - Target audience or use case (if apparent).
196
+
197
+ Be factual and objective.
198
+ """
199
+ else:
200
+ # Construct the prompt with the extracted content
201
+ prompt = f"""
202
  You are a helpful assistant that creates concise webpage summaries.
203
 
204
  Analyze the following webpage content:
205
 
206
  {content_text}
207
 
208
+ If the content is insufficient or seems to be an error page, please use your own knowledge to provide an accurate summary.
209
+
210
  Provide a concise summary (2-3 sentences) focusing on:
211
  - The main purpose or topic of the page.
212
  - Key information or features.
213
  - Target audience or use case (if apparent).
214
 
 
 
215
  Be factual and objective.
216
  """
217
 
218
  # Call the LLM via Groq Cloud API
219
  response = openai.ChatCompletion.create(
220
+ model='llama-3.1-70b-versatile',
221
  messages=[
222
  {"role": "user", "content": prompt}
223
  ],
 
234
 
235
  except Exception as e:
236
  logger.error(f"Error generating summary: {e}", exc_info=True)
237
+ bookmark['summary'] = 'No summary available.'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  return bookmark
239
 
240
  def parse_bookmarks(file_content):
 
273
  'Chrome/91.0.4472.124 Safari/537.36',
274
  'Accept-Language': 'en-US,en;q=0.9',
275
  }
276
+ async with session.get(url, timeout=20, headers=headers, ssl=False, allow_redirects=True) as response:
277
  bookmark['etag'] = response.headers.get('ETag', 'N/A')
278
  bookmark['status_code'] = response.status
279
 
280
  content = await response.text()
281
+ logger.info(f"Fetched content length for {url}: {len(content)} characters")
282
 
283
+ # Handle status codes
284
  if response.status >= 500:
285
  # Server error, consider as dead link
286
  bookmark['dead_link'] = True
 
346
  # Prepare the prompt
347
  categories_str = ', '.join([f'"{cat}"' for cat in CATEGORIES if cat != 'Dead Link'])
348
  prompt = f"""
349
+ You are a helpful assistant that categorizes webpages.
350
+
351
  Based on the following summary, assign the most appropriate category from the list below.
352
 
353
  Summary:
 
361
 
362
  try:
363
  response = openai.ChatCompletion.create(
364
+ model='llama-3.1-70b-versatile',
365
  messages=[
366
  {"role": "user", "content": prompt}
367
  ],
 
638
  """
639
 
640
  response = openai.ChatCompletion.create(
641
+ model='llama-3.1-70b-versatile',
642
  messages=[
643
  {"role": "user", "content": prompt}
644
  ],