Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -148,44 +148,48 @@ def generate_summary_and_assign_category(bookmark):
|
|
| 148 |
"""
|
| 149 |
logger.info(f"Generating summary and assigning category for bookmark: {bookmark.get('url')}")
|
| 150 |
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
content_parts
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
You are a knowledgeable assistant with up-to-date information as of 2023.
|
| 190 |
|
| 191 |
The user provided a URL: {bookmark.get('url')}
|
|
@@ -201,9 +205,9 @@ Provide your response in the following format:
|
|
| 201 |
Summary: [Your summary here]
|
| 202 |
Category: [One of the categories]
|
| 203 |
"""
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
You are a helpful assistant that creates concise webpage summaries and assigns categories.
|
| 208 |
|
| 209 |
Analyze the following webpage content:
|
|
@@ -212,7 +216,7 @@ Analyze the following webpage content:
|
|
| 212 |
|
| 213 |
Please provide:
|
| 214 |
1. A concise summary in **no more than two sentences** focusing on the main purpose or topic of the page and key information or features.
|
| 215 |
-
2. Assign the most appropriate category from the list below for this webpage.
|
| 216 |
|
| 217 |
Categories:
|
| 218 |
{', '.join([f'"{cat}"' for cat in CATEGORIES])}
|
|
@@ -222,44 +226,59 @@ Summary: [Your summary here]
|
|
| 222 |
Category: [One of the categories]
|
| 223 |
"""
|
| 224 |
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
# Parse the response
|
| 239 |
-
summary_match = re.search(r"Summary:\s*(.*)", content)
|
| 240 |
-
category_match = re.search(r"Category:\s*(.*)", content)
|
| 241 |
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
bookmark['summary'] = 'No summary available.'
|
| 246 |
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
bookmark['
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
else:
|
| 252 |
bookmark['category'] = 'Uncategorized'
|
| 253 |
-
else:
|
| 254 |
-
bookmark['category'] = 'Uncategorized'
|
| 255 |
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
|
| 264 |
def parse_bookmarks(file_content):
|
| 265 |
"""
|
|
|
|
| 148 |
"""
|
| 149 |
logger.info(f"Generating summary and assigning category for bookmark: {bookmark.get('url')}")
|
| 150 |
|
| 151 |
+
max_retries = 3
|
| 152 |
+
retry_count = 0
|
| 153 |
+
|
| 154 |
+
while retry_count < max_retries:
|
| 155 |
+
try:
|
| 156 |
+
html_content = bookmark.get('html_content', '')
|
| 157 |
+
|
| 158 |
+
# Get the HTML soup object from the bookmark
|
| 159 |
+
soup = BeautifulSoup(html_content, 'html.parser')
|
| 160 |
+
|
| 161 |
+
# Extract metadata and main content
|
| 162 |
+
metadata = get_page_metadata(soup)
|
| 163 |
+
main_content = extract_main_content(soup)
|
| 164 |
+
|
| 165 |
+
# Prepare content for the prompt
|
| 166 |
+
content_parts = []
|
| 167 |
+
if metadata['title']:
|
| 168 |
+
content_parts.append(f"Title: {metadata['title']}")
|
| 169 |
+
if metadata['description']:
|
| 170 |
+
content_parts.append(f"Description: {metadata['description']}")
|
| 171 |
+
if metadata['keywords']:
|
| 172 |
+
content_parts.append(f"Keywords: {metadata['keywords']}")
|
| 173 |
+
if main_content:
|
| 174 |
+
content_parts.append(f"Main Content: {main_content}")
|
| 175 |
+
|
| 176 |
+
content_text = '\n'.join(content_parts)
|
| 177 |
+
|
| 178 |
+
# Detect insufficient or erroneous content
|
| 179 |
+
error_keywords = ['Access Denied', 'Security Check', 'Cloudflare', 'captcha', 'unusual traffic']
|
| 180 |
+
if not content_text or len(content_text.split()) < 50:
|
| 181 |
+
use_prior_knowledge = True
|
| 182 |
+
logger.info(f"Content for {bookmark.get('url')} is insufficient. Instructing LLM to use prior knowledge.")
|
| 183 |
+
elif any(keyword.lower() in content_text.lower() for keyword in error_keywords):
|
| 184 |
+
use_prior_knowledge = True
|
| 185 |
+
logger.info(f"Content for {bookmark.get('url')} contains error messages. Instructing LLM to use prior knowledge.")
|
| 186 |
+
else:
|
| 187 |
+
use_prior_knowledge = False
|
| 188 |
|
| 189 |
+
# Prepare the prompt
|
| 190 |
+
if use_prior_knowledge:
|
| 191 |
+
# Construct prompt to use prior knowledge
|
| 192 |
+
prompt = f"""
|
| 193 |
You are a knowledgeable assistant with up-to-date information as of 2023.
|
| 194 |
|
| 195 |
The user provided a URL: {bookmark.get('url')}
|
|
|
|
| 205 |
Summary: [Your summary here]
|
| 206 |
Category: [One of the categories]
|
| 207 |
"""
|
| 208 |
+
else:
|
| 209 |
+
# Construct the prompt with the extracted content
|
| 210 |
+
prompt = f"""
|
| 211 |
You are a helpful assistant that creates concise webpage summaries and assigns categories.
|
| 212 |
|
| 213 |
Analyze the following webpage content:
|
|
|
|
| 216 |
|
| 217 |
Please provide:
|
| 218 |
1. A concise summary in **no more than two sentences** focusing on the main purpose or topic of the page and key information or features.
|
| 219 |
+
2. Assign the most appropriate category from the list below for this webpage. **Ensure the category directly reflects the content of the summary.**
|
| 220 |
|
| 221 |
Categories:
|
| 222 |
{', '.join([f'"{cat}"' for cat in CATEGORIES])}
|
|
|
|
| 226 |
Category: [One of the categories]
|
| 227 |
"""
|
| 228 |
|
| 229 |
+
# Call the LLM via Groq Cloud API
|
| 230 |
+
response = openai.ChatCompletion.create(
|
| 231 |
+
model='llama-3.1-70b-versatile',
|
| 232 |
+
messages=[
|
| 233 |
+
{"role": "user", "content": prompt}
|
| 234 |
+
],
|
| 235 |
+
max_tokens=200,
|
| 236 |
+
temperature=0.5,
|
| 237 |
+
)
|
| 238 |
+
content = response['choices'][0]['message']['content'].strip()
|
| 239 |
+
if not content:
|
| 240 |
+
raise ValueError("Empty response received from the model.")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
|
| 242 |
+
# Parse the response
|
| 243 |
+
summary_match = re.search(r"Summary:\s*(.*)", content)
|
| 244 |
+
category_match = re.search(r"Category:\s*(.*)", content)
|
|
|
|
| 245 |
|
| 246 |
+
if summary_match:
|
| 247 |
+
bookmark['summary'] = summary_match.group(1).strip()
|
| 248 |
+
else:
|
| 249 |
+
bookmark['summary'] = 'No summary available.'
|
| 250 |
+
|
| 251 |
+
if category_match:
|
| 252 |
+
category = category_match.group(1).strip().strip('"')
|
| 253 |
+
if category in CATEGORIES:
|
| 254 |
+
bookmark['category'] = category
|
| 255 |
+
else:
|
| 256 |
+
bookmark['category'] = 'Uncategorized'
|
| 257 |
else:
|
| 258 |
bookmark['category'] = 'Uncategorized'
|
|
|
|
|
|
|
| 259 |
|
| 260 |
+
# Simple keyword-based validation (Optional)
|
| 261 |
+
summary_lower = bookmark['summary'].lower()
|
| 262 |
+
url_lower = bookmark['url'].lower()
|
| 263 |
+
if 'social media' in summary_lower or 'twitter' in summary_lower or 'x.com' in url_lower:
|
| 264 |
+
bookmark['category'] = 'Social Media'
|
| 265 |
+
elif 'wikipedia' in url_lower:
|
| 266 |
+
bookmark['category'] = 'Reference and Knowledge Bases'
|
| 267 |
+
|
| 268 |
+
logger.info("Successfully generated summary and assigned category")
|
| 269 |
+
time.sleep(1) # Reduced sleep time
|
| 270 |
+
break # Exit the retry loop upon success
|
| 271 |
+
|
| 272 |
+
except openai.error.RateLimitError as e:
|
| 273 |
+
retry_count += 1
|
| 274 |
+
wait_time = int(e.headers.get("Retry-After", 5))
|
| 275 |
+
logger.warning(f"Rate limit reached. Waiting for {wait_time} seconds before retrying...")
|
| 276 |
+
time.sleep(wait_time)
|
| 277 |
+
except Exception as e:
|
| 278 |
+
logger.error(f"Error generating summary and assigning category: {e}", exc_info=True)
|
| 279 |
+
bookmark['summary'] = 'No summary available.'
|
| 280 |
+
bookmark['category'] = 'Uncategorized'
|
| 281 |
+
break # Exit the retry loop on other exceptions
|
| 282 |
|
| 283 |
def parse_bookmarks(file_content):
|
| 284 |
"""
|