Spaces:

siddhartharya
/

Bookmark-Manager

Running

App Files Community

siddhartharya committed on Nov 25, 2024

Commit

3b9dc5a

verified ·

1 Parent(s): 47ee377

Update app.py

Browse files

Files changed (1) hide show

app.py +93 -74

app.py CHANGED Viewed

@@ -148,44 +148,48 @@ def generate_summary_and_assign_category(bookmark):
     """
     logger.info(f"Generating summary and assigning category for bookmark: {bookmark.get('url')}")
-    try:
-        html_content = bookmark.get('html_content', '')
-        # Get the HTML soup object from the bookmark
-        soup = BeautifulSoup(html_content, 'html.parser')
-        # Extract metadata and main content
-        metadata = get_page_metadata(soup)
-        main_content = extract_main_content(soup)
-        # Prepare content for the prompt
-        content_parts = []
-        if metadata['title']:
-            content_parts.append(f"Title: {metadata['title']}")
-        if metadata['description']:
-            content_parts.append(f"Description: {metadata['description']}")
-        if metadata['keywords']:
-            content_parts.append(f"Keywords: {metadata['keywords']}")
-        if main_content:
-            content_parts.append(f"Main Content: {main_content}")
-        content_text = '\n'.join(content_parts)
-        # Detect insufficient or erroneous content
-        error_keywords = ['Access Denied', 'Security Check', 'Cloudflare', 'captcha', 'unusual traffic']
-        if not content_text or len(content_text.split()) < 50:
-            use_prior_knowledge = True
-            logger.info(f"Content for {bookmark.get('url')} is insufficient. Instructing LLM to use prior knowledge.")
-        elif any(keyword.lower() in content_text.lower() for keyword in error_keywords):
-            use_prior_knowledge = True
-            logger.info(f"Content for {bookmark.get('url')} contains error messages. Instructing LLM to use prior knowledge.")
-        else:
-            use_prior_knowledge = False
-        # Prepare the prompt
-        if use_prior_knowledge:
-            # Construct prompt to use prior knowledge
-            prompt = f"""
 You are a knowledgeable assistant with up-to-date information as of 2023.
 The user provided a URL: {bookmark.get('url')}
@@ -201,9 +205,9 @@ Provide your response in the following format:
 Summary: [Your summary here]
 Category: [One of the categories]
 """
-        else:
-            # Construct the prompt with the extracted content
-            prompt = f"""
 You are a helpful assistant that creates concise webpage summaries and assigns categories.
 Analyze the following webpage content:
@@ -212,7 +216,7 @@ Analyze the following webpage content:
 Please provide:
 1. A concise summary in **no more than two sentences** focusing on the main purpose or topic of the page and key information or features.
-2. Assign the most appropriate category from the list below for this webpage.
 Categories:
 {', '.join([f'"{cat}"' for cat in CATEGORIES])}
@@ -222,44 +226,59 @@ Summary: [Your summary here]
 Category: [One of the categories]
 """
-        # Call the LLM via Groq Cloud API
-        response = openai.ChatCompletion.create(
-            model='llama-3.1-70b-versatile',
-            messages=[
-                {"role": "user", "content": prompt}
-            ],
-            max_tokens=200,
-            temperature=0.5,
-        )
-        content = response['choices'][0]['message']['content'].strip()
-        if not content:
-            raise ValueError("Empty response received from the model.")
-        # Parse the response
-        summary_match = re.search(r"Summary:\s*(.*)", content)
-        category_match = re.search(r"Category:\s*(.*)", content)
-        if summary_match:
-            bookmark['summary'] = summary_match.group(1).strip()
-        else:
-            bookmark['summary'] = 'No summary available.'
-        if category_match:
-            category = category_match.group(1).strip().strip('"')
-            if category in CATEGORIES:
-                bookmark['category'] = category
             else:
                 bookmark['category'] = 'Uncategorized'
-        else:
-            bookmark['category'] = 'Uncategorized'
-        logger.info("Successfully generated summary and assigned category")
-        time.sleep(1)  # Reduced sleep time
-    except Exception as e:
-        logger.error(f"Error generating summary and assigning category: {e}", exc_info=True)
-        bookmark['summary'] = 'No summary available.'
-        bookmark['category'] = 'Uncategorized'
 def parse_bookmarks(file_content):
     """

     """
     logger.info(f"Generating summary and assigning category for bookmark: {bookmark.get('url')}")
+    max_retries = 3
+    retry_count = 0
+    while retry_count < max_retries:
+        try:
+            html_content = bookmark.get('html_content', '')
+            # Get the HTML soup object from the bookmark
+            soup = BeautifulSoup(html_content, 'html.parser')
+            # Extract metadata and main content
+            metadata = get_page_metadata(soup)
+            main_content = extract_main_content(soup)
+            # Prepare content for the prompt
+            content_parts = []
+            if metadata['title']:
+                content_parts.append(f"Title: {metadata['title']}")
+            if metadata['description']:
+                content_parts.append(f"Description: {metadata['description']}")
+            if metadata['keywords']:
+                content_parts.append(f"Keywords: {metadata['keywords']}")
+            if main_content:
+                content_parts.append(f"Main Content: {main_content}")
+            content_text = '\n'.join(content_parts)
+            # Detect insufficient or erroneous content
+            error_keywords = ['Access Denied', 'Security Check', 'Cloudflare', 'captcha', 'unusual traffic']
+            if not content_text or len(content_text.split()) < 50:
+                use_prior_knowledge = True
+                logger.info(f"Content for {bookmark.get('url')} is insufficient. Instructing LLM to use prior knowledge.")
+            elif any(keyword.lower() in content_text.lower() for keyword in error_keywords):
+                use_prior_knowledge = True
+                logger.info(f"Content for {bookmark.get('url')} contains error messages. Instructing LLM to use prior knowledge.")
+            else:
+                use_prior_knowledge = False
+            # Prepare the prompt
+            if use_prior_knowledge:
+                # Construct prompt to use prior knowledge
+                prompt = f"""
 You are a knowledgeable assistant with up-to-date information as of 2023.
 The user provided a URL: {bookmark.get('url')}
 Summary: [Your summary here]
 Category: [One of the categories]
 """
+            else:
+                # Construct the prompt with the extracted content
+                prompt = f"""
 You are a helpful assistant that creates concise webpage summaries and assigns categories.
 Analyze the following webpage content:
 Please provide:
 1. A concise summary in **no more than two sentences** focusing on the main purpose or topic of the page and key information or features.
+2. Assign the most appropriate category from the list below for this webpage. **Ensure the category directly reflects the content of the summary.**
 Categories:
 {', '.join([f'"{cat}"' for cat in CATEGORIES])}
 Category: [One of the categories]
 """
+            # Call the LLM via Groq Cloud API
+            response = openai.ChatCompletion.create(
+                model='llama-3.1-70b-versatile',
+                messages=[
+                    {"role": "user", "content": prompt}
+                ],
+                max_tokens=200,
+                temperature=0.5,
+            )
+            content = response['choices'][0]['message']['content'].strip()
+            if not content:
+                raise ValueError("Empty response received from the model.")
+            # Parse the response
+            summary_match = re.search(r"Summary:\s*(.*)", content)
+            category_match = re.search(r"Category:\s*(.*)", content)
+            if summary_match:
+                bookmark['summary'] = summary_match.group(1).strip()
+            else:
+                bookmark['summary'] = 'No summary available.'
+            if category_match:
+                category = category_match.group(1).strip().strip('"')
+                if category in CATEGORIES:
+                    bookmark['category'] = category
+                else:
+                    bookmark['category'] = 'Uncategorized'
             else:
                 bookmark['category'] = 'Uncategorized'
+            # Simple keyword-based validation (Optional)
+            summary_lower = bookmark['summary'].lower()
+            url_lower = bookmark['url'].lower()
+            if 'social media' in summary_lower or 'twitter' in summary_lower or 'x.com' in url_lower:
+                bookmark['category'] = 'Social Media'
+            elif 'wikipedia' in url_lower:
+                bookmark['category'] = 'Reference and Knowledge Bases'
+            logger.info("Successfully generated summary and assigned category")
+            time.sleep(1)  # Reduced sleep time
+            break  # Exit the retry loop upon success
+        except openai.error.RateLimitError as e:
+            retry_count += 1
+            wait_time = int(e.headers.get("Retry-After", 5))
+            logger.warning(f"Rate limit reached. Waiting for {wait_time} seconds before retrying...")
+            time.sleep(wait_time)
+        except Exception as e:
+            logger.error(f"Error generating summary and assigning category: {e}", exc_info=True)
+            bookmark['summary'] = 'No summary available.'
+            bookmark['category'] = 'Uncategorized'
+            break  # Exit the retry loop on other exceptions
 def parse_bookmarks(file_content):
     """