Spaces:
Running
Running
siddhartharya
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -148,44 +148,48 @@ def generate_summary_and_assign_category(bookmark):
|
|
148 |
"""
|
149 |
logger.info(f"Generating summary and assigning category for bookmark: {bookmark.get('url')}")
|
150 |
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
content_parts
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
|
|
|
|
|
|
|
|
184 |
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
You are a knowledgeable assistant with up-to-date information as of 2023.
|
190 |
|
191 |
The user provided a URL: {bookmark.get('url')}
|
@@ -201,9 +205,9 @@ Provide your response in the following format:
|
|
201 |
Summary: [Your summary here]
|
202 |
Category: [One of the categories]
|
203 |
"""
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
You are a helpful assistant that creates concise webpage summaries and assigns categories.
|
208 |
|
209 |
Analyze the following webpage content:
|
@@ -212,7 +216,7 @@ Analyze the following webpage content:
|
|
212 |
|
213 |
Please provide:
|
214 |
1. A concise summary in **no more than two sentences** focusing on the main purpose or topic of the page and key information or features.
|
215 |
-
2. Assign the most appropriate category from the list below for this webpage.
|
216 |
|
217 |
Categories:
|
218 |
{', '.join([f'"{cat}"' for cat in CATEGORIES])}
|
@@ -222,44 +226,59 @@ Summary: [Your summary here]
|
|
222 |
Category: [One of the categories]
|
223 |
"""
|
224 |
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
# Parse the response
|
239 |
-
summary_match = re.search(r"Summary:\s*(.*)", content)
|
240 |
-
category_match = re.search(r"Category:\s*(.*)", content)
|
241 |
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
bookmark['summary'] = 'No summary available.'
|
246 |
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
bookmark['
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
251 |
else:
|
252 |
bookmark['category'] = 'Uncategorized'
|
253 |
-
else:
|
254 |
-
bookmark['category'] = 'Uncategorized'
|
255 |
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
263 |
|
264 |
def parse_bookmarks(file_content):
|
265 |
"""
|
|
|
148 |
"""
|
149 |
logger.info(f"Generating summary and assigning category for bookmark: {bookmark.get('url')}")
|
150 |
|
151 |
+
max_retries = 3
|
152 |
+
retry_count = 0
|
153 |
+
|
154 |
+
while retry_count < max_retries:
|
155 |
+
try:
|
156 |
+
html_content = bookmark.get('html_content', '')
|
157 |
+
|
158 |
+
# Get the HTML soup object from the bookmark
|
159 |
+
soup = BeautifulSoup(html_content, 'html.parser')
|
160 |
+
|
161 |
+
# Extract metadata and main content
|
162 |
+
metadata = get_page_metadata(soup)
|
163 |
+
main_content = extract_main_content(soup)
|
164 |
+
|
165 |
+
# Prepare content for the prompt
|
166 |
+
content_parts = []
|
167 |
+
if metadata['title']:
|
168 |
+
content_parts.append(f"Title: {metadata['title']}")
|
169 |
+
if metadata['description']:
|
170 |
+
content_parts.append(f"Description: {metadata['description']}")
|
171 |
+
if metadata['keywords']:
|
172 |
+
content_parts.append(f"Keywords: {metadata['keywords']}")
|
173 |
+
if main_content:
|
174 |
+
content_parts.append(f"Main Content: {main_content}")
|
175 |
+
|
176 |
+
content_text = '\n'.join(content_parts)
|
177 |
+
|
178 |
+
# Detect insufficient or erroneous content
|
179 |
+
error_keywords = ['Access Denied', 'Security Check', 'Cloudflare', 'captcha', 'unusual traffic']
|
180 |
+
if not content_text or len(content_text.split()) < 50:
|
181 |
+
use_prior_knowledge = True
|
182 |
+
logger.info(f"Content for {bookmark.get('url')} is insufficient. Instructing LLM to use prior knowledge.")
|
183 |
+
elif any(keyword.lower() in content_text.lower() for keyword in error_keywords):
|
184 |
+
use_prior_knowledge = True
|
185 |
+
logger.info(f"Content for {bookmark.get('url')} contains error messages. Instructing LLM to use prior knowledge.")
|
186 |
+
else:
|
187 |
+
use_prior_knowledge = False
|
188 |
|
189 |
+
# Prepare the prompt
|
190 |
+
if use_prior_knowledge:
|
191 |
+
# Construct prompt to use prior knowledge
|
192 |
+
prompt = f"""
|
193 |
You are a knowledgeable assistant with up-to-date information as of 2023.
|
194 |
|
195 |
The user provided a URL: {bookmark.get('url')}
|
|
|
205 |
Summary: [Your summary here]
|
206 |
Category: [One of the categories]
|
207 |
"""
|
208 |
+
else:
|
209 |
+
# Construct the prompt with the extracted content
|
210 |
+
prompt = f"""
|
211 |
You are a helpful assistant that creates concise webpage summaries and assigns categories.
|
212 |
|
213 |
Analyze the following webpage content:
|
|
|
216 |
|
217 |
Please provide:
|
218 |
1. A concise summary in **no more than two sentences** focusing on the main purpose or topic of the page and key information or features.
|
219 |
+
2. Assign the most appropriate category from the list below for this webpage. **Ensure the category directly reflects the content of the summary.**
|
220 |
|
221 |
Categories:
|
222 |
{', '.join([f'"{cat}"' for cat in CATEGORIES])}
|
|
|
226 |
Category: [One of the categories]
|
227 |
"""
|
228 |
|
229 |
+
# Call the LLM via Groq Cloud API
|
230 |
+
response = openai.ChatCompletion.create(
|
231 |
+
model='llama-3.1-70b-versatile',
|
232 |
+
messages=[
|
233 |
+
{"role": "user", "content": prompt}
|
234 |
+
],
|
235 |
+
max_tokens=200,
|
236 |
+
temperature=0.5,
|
237 |
+
)
|
238 |
+
content = response['choices'][0]['message']['content'].strip()
|
239 |
+
if not content:
|
240 |
+
raise ValueError("Empty response received from the model.")
|
|
|
|
|
|
|
|
|
241 |
|
242 |
+
# Parse the response
|
243 |
+
summary_match = re.search(r"Summary:\s*(.*)", content)
|
244 |
+
category_match = re.search(r"Category:\s*(.*)", content)
|
|
|
245 |
|
246 |
+
if summary_match:
|
247 |
+
bookmark['summary'] = summary_match.group(1).strip()
|
248 |
+
else:
|
249 |
+
bookmark['summary'] = 'No summary available.'
|
250 |
+
|
251 |
+
if category_match:
|
252 |
+
category = category_match.group(1).strip().strip('"')
|
253 |
+
if category in CATEGORIES:
|
254 |
+
bookmark['category'] = category
|
255 |
+
else:
|
256 |
+
bookmark['category'] = 'Uncategorized'
|
257 |
else:
|
258 |
bookmark['category'] = 'Uncategorized'
|
|
|
|
|
259 |
|
260 |
+
# Simple keyword-based validation (Optional)
|
261 |
+
summary_lower = bookmark['summary'].lower()
|
262 |
+
url_lower = bookmark['url'].lower()
|
263 |
+
if 'social media' in summary_lower or 'twitter' in summary_lower or 'x.com' in url_lower:
|
264 |
+
bookmark['category'] = 'Social Media'
|
265 |
+
elif 'wikipedia' in url_lower:
|
266 |
+
bookmark['category'] = 'Reference and Knowledge Bases'
|
267 |
+
|
268 |
+
logger.info("Successfully generated summary and assigned category")
|
269 |
+
time.sleep(1) # Reduced sleep time
|
270 |
+
break # Exit the retry loop upon success
|
271 |
+
|
272 |
+
except openai.error.RateLimitError as e:
|
273 |
+
retry_count += 1
|
274 |
+
wait_time = int(e.headers.get("Retry-After", 5))
|
275 |
+
logger.warning(f"Rate limit reached. Waiting for {wait_time} seconds before retrying...")
|
276 |
+
time.sleep(wait_time)
|
277 |
+
except Exception as e:
|
278 |
+
logger.error(f"Error generating summary and assigning category: {e}", exc_info=True)
|
279 |
+
bookmark['summary'] = 'No summary available.'
|
280 |
+
bookmark['category'] = 'Uncategorized'
|
281 |
+
break # Exit the retry loop on other exceptions
|
282 |
|
283 |
def parse_bookmarks(file_content):
|
284 |
"""
|