Spaces:

siddhartharya
/

Bookmark-Manager

Running

App Files Community

siddhartharya committed on Nov 26, 2024

Commit

7391b3d

verified ·

1 Parent(s): f4e6753

Update app.py

Browse files

Files changed (1) hide show

app.py +183 -241

app.py CHANGED Viewed

@@ -8,13 +8,13 @@ import numpy as np
 import requests
 import time
 import re
 import logging
 import os
 import sys
-import threading
-from queue import Queue, Empty
-import json
 from concurrent.futures import ThreadPoolExecutor
 # Import OpenAI library
 import openai
@@ -74,91 +74,18 @@ CATEGORIES = [
     "Uncategorized",
 ]
-# Set up OpenAI API key and base URL
-OPENAI_API_KEY = os.getenv('GROQ_API_KEY')  # Ensure this environment variable is set correctly
-if not OPENAI_API_KEY:
     logger.error("GROQ_API_KEY environment variable not set.")
-openai.api_key = OPENAI_API_KEY
-openai.api_base = "https://api.groq.com/openai/v1"  # Ensure this is the correct base URL for your API
-# Rate Limiter Configuration
-RPM_LIMIT = 60       # Requests per minute (adjust based on your API's limit)
-TPM_LIMIT = 60000    # Tokens per minute (adjust based on your API's limit)
-BATCH_SIZE = 5       # Number of bookmarks per batch
-# Implementing a Token Bucket Rate Limiter
-class TokenBucket:
-    def __init__(self, rate, capacity):
-        self.rate = rate  # tokens per second
-        self.capacity = capacity
-        self.tokens = capacity
-        self.timestamp = time.time()
-        self.lock = threading.Lock()
-    def consume(self, tokens=1):
-        with self.lock:
-            now = time.time()
-            elapsed = now - self.timestamp
-            refill = elapsed * self.rate
-            self.tokens = min(self.capacity, self.tokens + refill)
-            self.timestamp = now
-            if self.tokens >= tokens:
-                self.tokens -= tokens
-                return True
-            else:
-                return False
-    def wait_for_token(self, tokens=1):
-        while not self.consume(tokens):
-            time.sleep(0.05)
-# Initialize rate limiters
-rpm_rate = RPM_LIMIT / 60  # tokens per second
-tpm_rate = TPM_LIMIT / 60  # tokens per second
-rpm_bucket = TokenBucket(rate=rpm_rate, capacity=RPM_LIMIT)
-tpm_bucket = TokenBucket(rate=tpm_rate, capacity=TPM_LIMIT)
-# Queue for LLM tasks
-llm_queue = Queue()
-def categorize_based_on_summary(summary, url):
-    """
-    Assign category based on keywords in the summary or URL.
-    """
-    summary_lower = summary.lower()
-    url_lower = url.lower()
-    if 'social media' in summary_lower or 'twitter' in summary_lower or 'x.com' in url_lower:
-        return 'Social Media'
-    elif 'wikipedia' in url_lower:
-        return 'Reference and Knowledge Bases'
-    elif 'cloud computing' in summary_lower or 'aws' in summary_lower:
-        return 'Technology'
-    elif 'news' in summary_lower or 'media' in summary_lower:
-        return 'News and Media'
-    elif 'education' in summary_lower or 'learning' in summary_lower:
-        return 'Education and Learning'
-    # Add more conditions as needed
-    else:
-        return 'Uncategorized'
-def validate_category(bookmark):
-    """
-    Further validate and adjust the category if needed.
-    """
-    # Example: Specific cases based on URL
-    url_lower = bookmark['url'].lower()
-    if 'facebook' in url_lower or 'x.com' in url_lower:
-        return 'Social Media'
-    elif 'wikipedia' in url_lower:
-        return 'Reference and Knowledge Bases'
-    elif 'aws.amazon.com' in url_lower:
-        return 'Technology'
-    # Add more specific cases as needed
-    else:
-        return bookmark['category']
 def extract_main_content(soup):
     """
@@ -229,140 +156,149 @@ def get_page_metadata(soup):
     return metadata
-def llm_worker():
     """
-    Worker thread to process LLM tasks from the queue while respecting rate limits.
     """
-    logger.info("LLM worker started.")
-    while True:
-        batch = []
         try:
-            # Collect bookmarks up to BATCH_SIZE
-            while len(batch) < BATCH_SIZE:
-                bookmark = llm_queue.get(timeout=1)
-                if bookmark is None:
-                    # Shutdown signal
-                    logger.info("LLM worker shutting down.")
-                    return
-                if not bookmark.get('dead_link') and not bookmark.get('slow_link'):
-                    batch.append(bookmark)
-                else:
-                    # Skip processing for dead or slow links
-                    bookmark['summary'] = 'No summary available.'
-                    bookmark['category'] = 'Uncategorized'
-                    llm_queue.task_done()
-        except Empty:
-            pass  # No more bookmarks at the moment
-        if batch:
-            try:
-                # Rate Limiting
-                rpm_bucket.wait_for_token()
-                # Estimate tokens: prompt + max_tokens
-                # Here, we assume max_tokens=150 per bookmark
-                total_tokens = 150 * len(batch)
-                tpm_bucket.wait_for_token(tokens=total_tokens)
-                # Prepare prompt
-                prompt = "You are an assistant that creates concise webpage summaries and assigns categories.\n\n"
-                prompt += "Provide summaries and categories for the following bookmarks:\n\n"
-                for idx, bookmark in enumerate(batch, 1):
-                    prompt += f"Bookmark {idx}:\nURL: {bookmark['url']}\nTitle: {bookmark['title']}\n\n"
-                # Corrected f-string without backslashes
-                prompt += f"Categories:\n{', '.join([f'\"{cat}\"' for cat in CATEGORIES])}\n\n"
-                prompt += "Format your response as a JSON object where each key is the bookmark URL and the value is another JSON object containing 'summary' and 'category'.\n\n"
-                prompt += "Example:\n"
-                prompt += "{\n"
-                prompt += "  \"https://example.com\": {\n"
-                prompt += "    \"summary\": \"This is an example summary.\",\n"
-                prompt += "    \"category\": \"Technology\"\n"
-                prompt += "  }\n"
-                prompt += "}\n\n"
-                prompt += "Now, provide the summaries and categories for the bookmarks listed above."
-                response = openai.ChatCompletion.create(
-                    model='llama-3.1-70b-versatile',  # Ensure this model is correct and available
-                    messages=[
-                        {"role": "user", "content": prompt}
-                    ],
-                    max_tokens=150 * len(batch),
-                    temperature=0.5,
-                )
-                content = response['choices'][0]['message']['content'].strip()
-                if not content:
-                    raise ValueError("Empty response received from the model.")
-                # Parse JSON response
-                try:
-                    json_response = json.loads(content)
-                    for bookmark in batch:
-                        url = bookmark['url']
-                        if url in json_response:
-                            summary = json_response[url].get('summary', '').strip()
-                            category = json_response[url].get('category', '').strip()
-                            if not summary:
-                                summary = 'No summary available.'
-                            bookmark['summary'] = summary
-                            if category in CATEGORIES:
-                                bookmark['category'] = category
-                            else:
-                                # Fallback to keyword-based categorization
-                                bookmark['category'] = categorize_based_on_summary(summary, url)
-                        else:
-                            logger.warning(f"No data returned for {url}. Using fallback methods.")
-                            bookmark['summary'] = 'No summary available.'
-                            bookmark['category'] = 'Uncategorized'
-                        # Additional keyword-based validation
-                        bookmark['category'] = validate_category(bookmark)
-                        logger.info(f"Processed bookmark: {url}")
-                except json.JSONDecodeError:
-                    logger.error("Failed to parse JSON response from LLM. Using fallback methods.")
-                    for bookmark in batch:
-                        bookmark['summary'] = 'No summary available.'
-                        bookmark['category'] = categorize_based_on_summary(bookmark.get('summary', ''), bookmark['url'])
-                        bookmark['category'] = validate_category(bookmark)
-                except Exception as e:
-                    logger.error(f"Error processing LLM response: {e}", exc_info=True)
-                    for bookmark in batch:
-                        bookmark['summary'] = 'No summary available.'
-                        bookmark['category'] = 'Uncategorized'
-            except openai.error.RateLimitError as e:
-                logger.warning(f"LLM Rate limit reached. Retrying after 60 seconds.")
-                # Re-enqueue the entire batch for retry
-                for bookmark in batch:
-                    llm_queue.put(bookmark)
-                time.sleep(60)  # Wait before retrying
-                continue  # Skip the rest and retry
-            except Exception as e:
-                logger.error(f"Error during LLM processing: {e}", exc_info=True)
-                for bookmark in batch:
-                    bookmark['summary'] = 'No summary available.'
-                    bookmark['category'] = 'Uncategorized'
-            finally:
-                # Mark all bookmarks in the batch as done
-                for _ in batch:
-                    llm_queue.task_done()
-def generate_summary_and_assign_category(bookmark):
-    """
-    Enqueue bookmarks for LLM processing.
-    """
-    logger.info(f"Enqueuing bookmark for LLM processing: {bookmark.get('url')}")
-    llm_queue.put(bookmark)
 def parse_bookmarks(file_content):
     """
@@ -411,17 +347,15 @@ def fetch_url_info(bookmark):
         if response.status_code >= 500:
             bookmark['dead_link'] = True
-            bookmark['html_content'] = ''
             bookmark['description'] = ''
             logger.warning(f"Dead link detected: {url} with status {response.status_code}")
         else:
             bookmark['dead_link'] = False
             bookmark['html_content'] = content
-            # Extract description from metadata
-            soup = BeautifulSoup(content, 'html.parser')
-            metadata = get_page_metadata(soup)
-            bookmark['description'] = metadata.get('description', '')
             logger.info(f"Fetched information for {url}")
     except requests.exceptions.Timeout:
         bookmark['dead_link'] = False
         bookmark['etag'] = 'N/A'
@@ -555,14 +489,10 @@ def process_uploaded_file(file, state_bookmarks):
     with ThreadPoolExecutor(max_workers=10) as executor:
         executor.map(fetch_url_info, bookmarks)
-    # Enqueue bookmarks for LLM processing
-    logger.info("Enqueuing bookmarks for LLM processing")
-    for bookmark in bookmarks:
-        generate_summary_and_assign_category(bookmark)
-    # Wait until all LLM tasks are completed
-    llm_queue.join()
-    logger.info("All LLM tasks have been processed")
     try:
         faiss_index = vectorize_and_index(bookmarks)
@@ -689,11 +619,15 @@ def chatbot_response(user_query, chat_history):
     try:
         chat_history.append({"role": "user", "content": user_query})
-        # Rate Limiting
-        rpm_bucket.wait_for_token()
-        # Estimate tokens: prompt + max_tokens
-        # Here, we assume max_tokens=300 per chatbot response
-        tpm_bucket.wait_for_token(tokens=300)
         query_vector = embedding_model.encode([user_query]).astype('float32')
         k = 5
@@ -701,8 +635,7 @@ def chatbot_response(user_query, chat_history):
         ids = ids.flatten()
         id_to_bookmark = {bookmark['id']: bookmark for bookmark in bookmarks}
-        # Filter out bookmarks without summaries
-        matching_bookmarks = [id_to_bookmark.get(id) for id in ids if id in id_to_bookmark and id_to_bookmark.get(id).get('summary')]
         if not matching_bookmarks:
             answer = "No relevant bookmarks found for your query."
@@ -722,17 +655,30 @@ Bookmarks:
 Provide a concise and helpful response.
 """
         response = openai.ChatCompletion.create(
-            model='llama-3.1-70b-versatile',  # Ensure this model is correct and available
             messages=[
                 {"role": "user", "content": prompt}
             ],
-            max_tokens=300,
             temperature=0.7,
         )
         answer = response['choices'][0]['message']['content'].strip()
         logger.info("Chatbot response generated")
         chat_history.append({"role": "assistant", "content": answer})
         return chat_history
@@ -863,7 +809,7 @@ Navigate through the tabs to explore each feature in detail.
 """)
                 manage_output = gr.Textbox(label="🔄 Status", interactive=False)
                 # CheckboxGroup for selecting bookmarks
                 bookmark_selector = gr.CheckboxGroup(
                     label="✅ Select Bookmarks",
@@ -924,12 +870,8 @@ Navigate through the tabs to explore each feature in detail.
         logger.info("Launching Gradio app")
         demo.launch(debug=True)
     except Exception as e:
-        logger.error(f"Error building Gradio app: {e}", exc_info=True)
-        print(f"Error building Gradio app: {e}")
 if __name__ == "__main__":
-    # Start the LLM worker thread before launching the app
-    llm_thread = threading.Thread(target=llm_worker, daemon=True)
-    llm_thread.start()
     build_app()

 import requests
 import time
 import re
+import base64
 import logging
 import os
 import sys
+import concurrent.futures
 from concurrent.futures import ThreadPoolExecutor
+import threading
 # Import OpenAI library
 import openai
     "Uncategorized",
 ]
+# Set up Groq Cloud API key and base URL
+GROQ_API_KEY = os.getenv('GROQ_API_KEY')
+if not GROQ_API_KEY:
     logger.error("GROQ_API_KEY environment variable not set.")
+openai.api_key = GROQ_API_KEY
+openai.api_base = "https://api.groq.com/openai/v1"
+# Initialize global variables for rate limiting
+api_lock = threading.Lock()
+last_api_call_time = 0
 def extract_main_content(soup):
     """
     return metadata
+def generate_summary_and_assign_category(bookmark):
     """
+    Generate a concise summary and assign a category using a single LLM call.
     """
+    logger.info(f"Generating summary and assigning category for bookmark: {bookmark.get('url')}")
+    max_retries = 3
+    retry_count = 0
+    while retry_count < max_retries:
         try:
+            # Rate Limiting Logic
+            with api_lock:
+                global last_api_call_time
+                current_time = time.time()
+                elapsed = current_time - last_api_call_time
+                if elapsed < 2:
+                    sleep_duration = 2 - elapsed
+                    logger.info(f"Sleeping for {sleep_duration:.2f} seconds to respect rate limits.")
+                    time.sleep(sleep_duration)
+                last_api_call_time = time.time()
+            html_content = bookmark.get('html_content', '')
+            soup = BeautifulSoup(html_content, 'html.parser')
+            metadata = get_page_metadata(soup)
+            main_content = extract_main_content(soup)
+            # Prepare content for the prompt
+            content_parts = []
+            if metadata['title']:
+                content_parts.append(f"Title: {metadata['title']}")
+            if metadata['description']:
+                content_parts.append(f"Description: {metadata['description']}")
+            if metadata['keywords']:
+                content_parts.append(f"Keywords: {metadata['keywords']}")
+            if main_content:
+                content_parts.append(f"Main Content: {main_content}")
+            content_text = '\n'.join(content_parts)
+            # Detect insufficient or erroneous content
+            error_keywords = ['Access Denied', 'Security Check', 'Cloudflare', 'captcha', 'unusual traffic']
+            if not content_text or len(content_text.split()) < 50:
+                use_prior_knowledge = True
+                logger.info(f"Content for {bookmark.get('url')} is insufficient. Instructing LLM to use prior knowledge.")
+            elif any(keyword.lower() in content_text.lower() for keyword in error_keywords):
+                use_prior_knowledge = True
+                logger.info(f"Content for {bookmark.get('url')} contains error messages. Instructing LLM to use prior knowledge.")
+            else:
+                use_prior_knowledge = False
+            if use_prior_knowledge:
+                prompt = f"""
+You are a knowledgeable assistant with up-to-date information as of 2023.
+URL: {bookmark.get('url')}
+Provide:
+1. A concise summary (max two sentences) about this website.
+2. Assign the most appropriate category from the list below.
+Categories:
+{', '.join([f'"{cat}"' for cat in CATEGORIES])}
+Format:
+Summary: [Your summary]
+Category: [One category]
+"""
+            else:
+                prompt = f"""
+You are an assistant that creates concise webpage summaries and assigns categories.
+Content:
+{content_text}
+Provide:
+1. A concise summary (max two sentences) focusing on the main topic.
+2. Assign the most appropriate category from the list below.
+Categories:
+{', '.join([f'"{cat}"' for cat in CATEGORIES])}
+Format:
+Summary: [Your summary]
+Category: [One category]
+"""
+            def estimate_tokens(text):
+                return len(text) / 4
+            prompt_tokens = estimate_tokens(prompt)
+            max_tokens = 150
+            total_tokens = prompt_tokens + max_tokens
+            tokens_per_minute = 40000
+            tokens_per_second = tokens_per_minute / 60
+            required_delay = total_tokens / tokens_per_second
+            sleep_time = max(required_delay, 2)
+            response = openai.ChatCompletion.create(
+                model='llama-3.1-70b-versatile',
+                messages=[
+                    {"role": "user", "content": prompt}
+                ],
+                max_tokens=int(max_tokens),
+                temperature=0.5,
+            )
+            content = response['choices'][0]['message']['content'].strip()
+            if not content:
+                raise ValueError("Empty response received from the model.")
+            summary_match = re.search(r"Summary:\s*(.*)", content)
+            category_match = re.search(r"Category:\s*(.*)", content)
+            if summary_match:
+                bookmark['summary'] = summary_match.group(1).strip()
+            else:
+                bookmark['summary'] = 'No summary available.'
+            if category_match:
+                category = category_match.group(1).strip().strip('"')
+                if category in CATEGORIES:
+                    bookmark['category'] = category
+                else:
+                    bookmark['category'] = 'Uncategorized'
+            else:
+                bookmark['category'] = 'Uncategorized'
+            # Simple keyword-based validation
+            summary_lower = bookmark['summary'].lower()
+            url_lower = bookmark['url'].lower()
+            if 'social media' in summary_lower or 'twitter' in summary_lower or 'x.com' in url_lower:
+                bookmark['category'] = 'Social Media'
+            elif 'wikipedia' in url_lower:
+                bookmark['category'] = 'Reference and Knowledge Bases'
+            logger.info("Successfully generated summary and assigned category")
+            time.sleep(sleep_time)
+            break
+        except openai.error.RateLimitError as e:
+            retry_count += 1
+            wait_time = int(e.headers.get("Retry-After", 5))
+            logger.warning(f"Rate limit reached. Waiting for {wait_time} seconds before retrying... (Attempt {retry_count}/{max_retries})")
+            time.sleep(wait_time)
+        except Exception as e:
+            logger.error(f"Error generating summary and assigning category: {e}", exc_info=True)
+            bookmark['summary'] = 'No summary available.'
+            bookmark['category'] = 'Uncategorized'
+            break
 def parse_bookmarks(file_content):
     """
         if response.status_code >= 500:
             bookmark['dead_link'] = True
             bookmark['description'] = ''
+            bookmark['html_content'] = ''
             logger.warning(f"Dead link detected: {url} with status {response.status_code}")
         else:
             bookmark['dead_link'] = False
             bookmark['html_content'] = content
+            bookmark['description'] = ''
             logger.info(f"Fetched information for {url}")
     except requests.exceptions.Timeout:
         bookmark['dead_link'] = False
         bookmark['etag'] = 'N/A'
     with ThreadPoolExecutor(max_workers=10) as executor:
         executor.map(fetch_url_info, bookmarks)
+    # Process bookmarks concurrently with LLM calls
+    logger.info("Processing bookmarks with LLM concurrently")
+    with ThreadPoolExecutor(max_workers=1) as executor:
+        executor.map(generate_summary_and_assign_category, bookmarks)
     try:
         faiss_index = vectorize_and_index(bookmarks)
     try:
         chat_history.append({"role": "user", "content": user_query})
+        with api_lock:
+            global last_api_call_time
+            current_time = time.time()
+            elapsed = current_time - last_api_call_time
+            if elapsed < 2:
+                sleep_duration = 2 - elapsed
+                logger.info(f"Sleeping for {sleep_duration:.2f} seconds to respect rate limits.")
+                time.sleep(sleep_duration)
+            last_api_call_time = time.time()
         query_vector = embedding_model.encode([user_query]).astype('float32')
         k = 5
         ids = ids.flatten()
         id_to_bookmark = {bookmark['id']: bookmark for bookmark in bookmarks}
+        matching_bookmarks = [id_to_bookmark.get(id) for id in ids if id in id_to_bookmark]
         if not matching_bookmarks:
             answer = "No relevant bookmarks found for your query."
 Provide a concise and helpful response.
 """
+        def estimate_tokens(text):
+            return len(text) / 4
+        prompt_tokens = estimate_tokens(prompt)
+        max_tokens = 300
+        total_tokens = prompt_tokens + max_tokens
+        tokens_per_minute = 40000
+        tokens_per_second = tokens_per_minute / 60
+        required_delay = total_tokens / tokens_per_second
+        sleep_time = max(required_delay, 2)
         response = openai.ChatCompletion.create(
+            model='llama-3.1-70b-versatile',
             messages=[
                 {"role": "user", "content": prompt}
             ],
+            max_tokens=int(max_tokens),
             temperature=0.7,
         )
         answer = response['choices'][0]['message']['content'].strip()
         logger.info("Chatbot response generated")
+        time.sleep(sleep_time)
         chat_history.append({"role": "assistant", "content": answer})
         return chat_history
 """)
                 manage_output = gr.Textbox(label="🔄 Status", interactive=False)
                 # CheckboxGroup for selecting bookmarks
                 bookmark_selector = gr.CheckboxGroup(
                     label="✅ Select Bookmarks",
         logger.info("Launching Gradio app")
         demo.launch(debug=True)
     except Exception as e:
+        logger.error(f"Error building the app: {e}", exc_info=True)
+        print(f"Error building the app: {e}")
 if __name__ == "__main__":
     build_app()