siddhartharya committed · verified
Commit 3b9dc5a · 1 Parent(s): 47ee377

Update app.py

Files changed (1)
  1. app.py +93 -74
app.py CHANGED
@@ -148,44 +148,48 @@ def generate_summary_and_assign_category(bookmark):
     """
     logger.info(f"Generating summary and assigning category for bookmark: {bookmark.get('url')}")
 
-    try:
-        html_content = bookmark.get('html_content', '')
-
-        # Get the HTML soup object from the bookmark
-        soup = BeautifulSoup(html_content, 'html.parser')
-
-        # Extract metadata and main content
-        metadata = get_page_metadata(soup)
-        main_content = extract_main_content(soup)
-
-        # Prepare content for the prompt
-        content_parts = []
-        if metadata['title']:
-            content_parts.append(f"Title: {metadata['title']}")
-        if metadata['description']:
-            content_parts.append(f"Description: {metadata['description']}")
-        if metadata['keywords']:
-            content_parts.append(f"Keywords: {metadata['keywords']}")
-        if main_content:
-            content_parts.append(f"Main Content: {main_content}")
-
-        content_text = '\n'.join(content_parts)
-
-        # Detect insufficient or erroneous content
-        error_keywords = ['Access Denied', 'Security Check', 'Cloudflare', 'captcha', 'unusual traffic']
-        if not content_text or len(content_text.split()) < 50:
-            use_prior_knowledge = True
-            logger.info(f"Content for {bookmark.get('url')} is insufficient. Instructing LLM to use prior knowledge.")
-        elif any(keyword.lower() in content_text.lower() for keyword in error_keywords):
-            use_prior_knowledge = True
-            logger.info(f"Content for {bookmark.get('url')} contains error messages. Instructing LLM to use prior knowledge.")
-        else:
-            use_prior_knowledge = False
+    max_retries = 3
+    retry_count = 0
+
+    while retry_count < max_retries:
+        try:
+            html_content = bookmark.get('html_content', '')
+
+            # Get the HTML soup object from the bookmark
+            soup = BeautifulSoup(html_content, 'html.parser')
+
+            # Extract metadata and main content
+            metadata = get_page_metadata(soup)
+            main_content = extract_main_content(soup)
+
+            # Prepare content for the prompt
+            content_parts = []
+            if metadata['title']:
+                content_parts.append(f"Title: {metadata['title']}")
+            if metadata['description']:
+                content_parts.append(f"Description: {metadata['description']}")
+            if metadata['keywords']:
+                content_parts.append(f"Keywords: {metadata['keywords']}")
+            if main_content:
+                content_parts.append(f"Main Content: {main_content}")
+
+            content_text = '\n'.join(content_parts)
+
+            # Detect insufficient or erroneous content
+            error_keywords = ['Access Denied', 'Security Check', 'Cloudflare', 'captcha', 'unusual traffic']
+            if not content_text or len(content_text.split()) < 50:
+                use_prior_knowledge = True
+                logger.info(f"Content for {bookmark.get('url')} is insufficient. Instructing LLM to use prior knowledge.")
+            elif any(keyword.lower() in content_text.lower() for keyword in error_keywords):
+                use_prior_knowledge = True
+                logger.info(f"Content for {bookmark.get('url')} contains error messages. Instructing LLM to use prior knowledge.")
+            else:
+                use_prior_knowledge = False
 
-        # Prepare the prompt
-        if use_prior_knowledge:
-            # Construct prompt to use prior knowledge
-            prompt = f"""
+            # Prepare the prompt
+            if use_prior_knowledge:
+                # Construct prompt to use prior knowledge
+                prompt = f"""
 You are a knowledgeable assistant with up-to-date information as of 2023.
 
 The user provided a URL: {bookmark.get('url')}
@@ -201,9 +205,9 @@ Provide your response in the following format:
 Summary: [Your summary here]
 Category: [One of the categories]
 """
-        else:
-            # Construct the prompt with the extracted content
-            prompt = f"""
+            else:
+                # Construct the prompt with the extracted content
+                prompt = f"""
 You are a helpful assistant that creates concise webpage summaries and assigns categories.
 
 Analyze the following webpage content:
@@ -212,7 +216,7 @@ Analyze the following webpage content:
 
 Please provide:
 1. A concise summary in **no more than two sentences** focusing on the main purpose or topic of the page and key information or features.
-2. Assign the most appropriate category from the list below for this webpage.
+2. Assign the most appropriate category from the list below for this webpage. **Ensure the category directly reflects the content of the summary.**
 
 Categories:
 {', '.join([f'"{cat}"' for cat in CATEGORIES])}
@@ -222,44 +226,59 @@ Summary: [Your summary here]
 Category: [One of the categories]
 """
 
-        # Call the LLM via Groq Cloud API
-        response = openai.ChatCompletion.create(
-            model='llama-3.1-70b-versatile',
-            messages=[
-                {"role": "user", "content": prompt}
-            ],
-            max_tokens=200,
-            temperature=0.5,
-        )
-        content = response['choices'][0]['message']['content'].strip()
-        if not content:
-            raise ValueError("Empty response received from the model.")
-
-        # Parse the response
-        summary_match = re.search(r"Summary:\s*(.*)", content)
-        category_match = re.search(r"Category:\s*(.*)", content)
+            # Call the LLM via Groq Cloud API
+            response = openai.ChatCompletion.create(
+                model='llama-3.1-70b-versatile',
+                messages=[
+                    {"role": "user", "content": prompt}
+                ],
+                max_tokens=200,
+                temperature=0.5,
+            )
+            content = response['choices'][0]['message']['content'].strip()
+            if not content:
+                raise ValueError("Empty response received from the model.")
 
-        if summary_match:
-            bookmark['summary'] = summary_match.group(1).strip()
-        else:
-            bookmark['summary'] = 'No summary available.'
+            # Parse the response
+            summary_match = re.search(r"Summary:\s*(.*)", content)
+            category_match = re.search(r"Category:\s*(.*)", content)
 
-        if category_match:
-            category = category_match.group(1).strip().strip('"')
-            if category in CATEGORIES:
-                bookmark['category'] = category
+            if summary_match:
+                bookmark['summary'] = summary_match.group(1).strip()
+            else:
+                bookmark['summary'] = 'No summary available.'
+
+            if category_match:
+                category = category_match.group(1).strip().strip('"')
+                if category in CATEGORIES:
+                    bookmark['category'] = category
+                else:
+                    bookmark['category'] = 'Uncategorized'
             else:
                 bookmark['category'] = 'Uncategorized'
-        else:
-            bookmark['category'] = 'Uncategorized'
 
-        logger.info("Successfully generated summary and assigned category")
-        time.sleep(1) # Reduced sleep time
-
-    except Exception as e:
-        logger.error(f"Error generating summary and assigning category: {e}", exc_info=True)
-        bookmark['summary'] = 'No summary available.'
-        bookmark['category'] = 'Uncategorized'
+            # Simple keyword-based validation (Optional)
+            summary_lower = bookmark['summary'].lower()
+            url_lower = bookmark['url'].lower()
+            if 'social media' in summary_lower or 'twitter' in summary_lower or 'x.com' in url_lower:
+                bookmark['category'] = 'Social Media'
+            elif 'wikipedia' in url_lower:
+                bookmark['category'] = 'Reference and Knowledge Bases'
+
+            logger.info("Successfully generated summary and assigned category")
+            time.sleep(1) # Reduced sleep time
+            break # Exit the retry loop upon success
+
+        except openai.error.RateLimitError as e:
+            retry_count += 1
+            wait_time = int(e.headers.get("Retry-After", 5))
+            logger.warning(f"Rate limit reached. Waiting for {wait_time} seconds before retrying...")
+            time.sleep(wait_time)
+        except Exception as e:
+            logger.error(f"Error generating summary and assigning category: {e}", exc_info=True)
+            bookmark['summary'] = 'No summary available.'
+            bookmark['category'] = 'Uncategorized'
+            break # Exit the retry loop on other exceptions
 
 def parse_bookmarks(file_content):
     """