siddhartharya committed on
Commit
fb6f5e6
1 Parent(s): 85352fd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -66
app.py CHANGED
@@ -81,29 +81,26 @@ def extract_main_content(soup):
81
  if not soup:
82
  return ""
83
 
84
- # Remove script and style elements
85
- for element in soup(['script', 'style', 'header', 'footer', 'nav', 'ads', 'sidebar']):
86
  element.decompose()
87
 
88
- # First try to find content in main content areas
89
- main_content_tags = soup.find_all(['article', 'main', 'div.content', 'div.post', 'div.entry-content'])
90
- if main_content_tags:
91
- content = ' '.join([tag.get_text(strip=True, separator=' ') for tag in main_content_tags])
92
  else:
93
- # Try to find content in <p> tags
94
- p_tags = soup.find_all('p')
95
- if p_tags:
96
- content = ' '.join([p.get_text(strip=True, separator=' ') for p in p_tags])
97
- else:
98
- # Fallback to body content
99
- content = soup.body.get_text(strip=True, separator=' ') if soup.body else soup.get_text(strip=True, separator=' ')
100
 
101
  # Clean up the text
102
- content = ' '.join(content.split())
103
  content = re.sub(r'\s+', ' ', content) # Remove multiple spaces
104
- content = re.sub(r'[\n\r\t]', ' ', content) # Remove newlines and tabs
105
 
106
- # Return the content
 
 
 
 
107
  return content
108
 
109
  def get_page_metadata(soup):
@@ -124,7 +121,7 @@ def get_page_metadata(soup):
124
  if title_tag and title_tag.string:
125
  metadata['title'] = title_tag.string.strip()
126
 
127
- # Get meta description (try multiple variants)
128
  meta_desc = (
129
  soup.find('meta', attrs={'name': 'description'}) or
130
  soup.find('meta', attrs={'property': 'og:description'}) or
@@ -148,7 +145,7 @@ def get_page_metadata(soup):
148
 
149
  def generate_summary(bookmark):
150
  """
151
- Generate a comprehensive summary for a bookmark using available content and LLM via the Groq Cloud API.
152
  """
153
  logger.info(f"Generating summary for bookmark: {bookmark.get('url')}")
154
 
@@ -163,39 +160,64 @@ def generate_summary(bookmark):
163
  main_content = extract_main_content(soup)
164
 
165
  # Prepare content for the prompt
166
- available_content = []
167
  if metadata['title']:
168
- available_content.append(f"Title: {metadata['title']}")
169
  if metadata['description']:
170
- available_content.append(f"Description: {metadata['description']}")
171
  if metadata['keywords']:
172
- available_content.append(f"Keywords: {metadata['keywords']}")
173
  if main_content:
174
- available_content.append(f"Main Content: {main_content}")
175
 
176
- content_text = ' '.join(available_content)
177
 
178
- # Construct the prompt
179
- prompt = f"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  You are a helpful assistant that creates concise webpage summaries.
181
 
182
  Analyze the following webpage content:
183
 
184
  {content_text}
185
 
 
 
186
  Provide a concise summary (2-3 sentences) focusing on:
187
  - The main purpose or topic of the page.
188
  - Key information or features.
189
  - Target audience or use case (if apparent).
190
 
191
- If the content is insufficient, use your prior knowledge about the website.
192
-
193
  Be factual and objective.
194
  """
195
 
196
  # Call the LLM via Groq Cloud API
197
  response = openai.ChatCompletion.create(
198
- model='llama3-8b-8192', # Reverted back to the previous model
199
  messages=[
200
  {"role": "user", "content": prompt}
201
  ],
@@ -212,40 +234,7 @@ Be factual and objective.
212
 
213
  except Exception as e:
214
  logger.error(f"Error generating summary: {e}", exc_info=True)
215
- # Fallback to prior knowledge
216
- try:
217
- prompt = f"""
218
- You are a knowledgeable assistant.
219
-
220
- The user provided a URL: {bookmark.get('url')}
221
-
222
- Provide a concise summary (2-3 sentences) about this website based on your knowledge.
223
-
224
- Focus on:
225
- - The main purpose or topic of the website.
226
- - Key information or features.
227
- - Target audience or use case (if apparent).
228
-
229
- Be factual and objective.
230
- """
231
-
232
- response = openai.ChatCompletion.create(
233
- model='llama3-8b-8192', # Reverted back to the previous model
234
- messages=[
235
- {"role": "user", "content": prompt}
236
- ],
237
- max_tokens=200,
238
- temperature=0.5,
239
- )
240
-
241
- summary = response['choices'][0]['message']['content'].strip()
242
- if not summary:
243
- raise ValueError("Empty summary received from the model.")
244
- logger.info("Successfully generated LLM summary using prior knowledge")
245
- bookmark['summary'] = summary
246
- except Exception as inner_e:
247
- logger.error(f"Error generating summary using prior knowledge: {inner_e}", exc_info=True)
248
- bookmark['summary'] = 'No summary available.'
249
  return bookmark
250
 
251
  def parse_bookmarks(file_content):
@@ -284,12 +273,14 @@ async def fetch_url_info(session, bookmark):
284
  'Chrome/91.0.4472.124 Safari/537.36',
285
  'Accept-Language': 'en-US,en;q=0.9',
286
  }
287
- async with session.get(url, timeout=20, headers=headers, ssl=False) as response:
288
  bookmark['etag'] = response.headers.get('ETag', 'N/A')
289
  bookmark['status_code'] = response.status
290
 
291
  content = await response.text()
 
292
 
 
293
  if response.status >= 500:
294
  # Server error, consider as dead link
295
  bookmark['dead_link'] = True
@@ -355,6 +346,8 @@ def assign_category(bookmark):
355
  # Prepare the prompt
356
  categories_str = ', '.join([f'"{cat}"' for cat in CATEGORIES if cat != 'Dead Link'])
357
  prompt = f"""
 
 
358
  Based on the following summary, assign the most appropriate category from the list below.
359
 
360
  Summary:
@@ -368,7 +361,7 @@ Respond with only the category name.
368
 
369
  try:
370
  response = openai.ChatCompletion.create(
371
- model='llama3-8b-8192', # Reverted back to the previous model
372
  messages=[
373
  {"role": "user", "content": prompt}
374
  ],
@@ -645,7 +638,7 @@ Provide a concise and helpful response.
645
  """
646
 
647
  response = openai.ChatCompletion.create(
648
- model='llama3-8b-8192', # Reverted back to the previous model
649
  messages=[
650
  {"role": "user", "content": prompt}
651
  ],
 
81
  if not soup:
82
  return ""
83
 
84
+ # Remove unwanted elements
85
+ for element in soup(['script', 'style', 'header', 'footer', 'nav', 'aside', 'form', 'noscript']):
86
  element.decompose()
87
 
88
+ # Extract text from <p> tags
89
+ p_tags = soup.find_all('p')
90
+ if p_tags:
91
+ content = ' '.join([p.get_text(strip=True, separator=' ') for p in p_tags])
92
  else:
93
+ # Fallback to body content
94
+ content = soup.get_text(separator=' ', strip=True)
 
 
 
 
 
95
 
96
  # Clean up the text
 
97
  content = re.sub(r'\s+', ' ', content) # Remove multiple spaces
 
98
 
99
+ # Truncate content to a reasonable length (e.g., 1500 words)
100
+ words = content.split()
101
+ if len(words) > 1500:
102
+ content = ' '.join(words[:1500])
103
+
104
  return content
105
 
106
  def get_page_metadata(soup):
 
121
  if title_tag and title_tag.string:
122
  metadata['title'] = title_tag.string.strip()
123
 
124
+ # Get meta description
125
  meta_desc = (
126
  soup.find('meta', attrs={'name': 'description'}) or
127
  soup.find('meta', attrs={'property': 'og:description'}) or
 
145
 
146
  def generate_summary(bookmark):
147
  """
148
+ Generate a concise summary for a bookmark using available content and LLM via the Groq Cloud API.
149
  """
150
  logger.info(f"Generating summary for bookmark: {bookmark.get('url')}")
151
 
 
160
  main_content = extract_main_content(soup)
161
 
162
  # Prepare content for the prompt
163
+ content_parts = []
164
  if metadata['title']:
165
+ content_parts.append(f"Title: {metadata['title']}")
166
  if metadata['description']:
167
+ content_parts.append(f"Description: {metadata['description']}")
168
  if metadata['keywords']:
169
+ content_parts.append(f"Keywords: {metadata['keywords']}")
170
  if main_content:
171
+ content_parts.append(f"Main Content: {main_content}")
172
 
173
+ content_text = '\n'.join(content_parts)
174
 
175
+ # Detect insufficient or erroneous content
176
+ error_keywords = ['Access Denied', 'Error', 'Security Check', 'Cloudflare', 'captcha', 'unusual traffic', 'Page Not Found', '404 Not Found', 'Forbidden']
177
+ if not content_text or len(content_text.split()) < 50 or any(keyword.lower() in content_text.lower() for keyword in error_keywords):
178
+ use_prior_knowledge = True
179
+ logger.info(f"Content for {bookmark.get('url')} is insufficient or contains error messages. Instructing LLM to use prior knowledge.")
180
+ else:
181
+ use_prior_knowledge = False
182
+
183
+ if use_prior_knowledge:
184
+ # Construct prompt to use prior knowledge
185
+ prompt = f"""
186
+ You are a knowledgeable assistant.
187
+
188
+ The user provided a URL: {bookmark.get('url')}
189
+
190
+ Please provide a concise summary (2-3 sentences) about this website based on your knowledge.
191
+
192
+ Focus on:
193
+ - The main purpose or topic of the website.
194
+ - Key information or features.
195
+ - Target audience or use case (if apparent).
196
+
197
+ Be factual and objective.
198
+ """
199
+ else:
200
+ # Construct the prompt with the extracted content
201
+ prompt = f"""
202
  You are a helpful assistant that creates concise webpage summaries.
203
 
204
  Analyze the following webpage content:
205
 
206
  {content_text}
207
 
208
+ If the content is insufficient or seems to be an error page, please use your own knowledge to provide an accurate summary.
209
+
210
  Provide a concise summary (2-3 sentences) focusing on:
211
  - The main purpose or topic of the page.
212
  - Key information or features.
213
  - Target audience or use case (if apparent).
214
 
 
 
215
  Be factual and objective.
216
  """
217
 
218
  # Call the LLM via Groq Cloud API
219
  response = openai.ChatCompletion.create(
220
+ model='llama-3.1-70b-versatile',
221
  messages=[
222
  {"role": "user", "content": prompt}
223
  ],
 
234
 
235
  except Exception as e:
236
  logger.error(f"Error generating summary: {e}", exc_info=True)
237
+ bookmark['summary'] = 'No summary available.'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  return bookmark
239
 
240
  def parse_bookmarks(file_content):
 
273
  'Chrome/91.0.4472.124 Safari/537.36',
274
  'Accept-Language': 'en-US,en;q=0.9',
275
  }
276
+ async with session.get(url, timeout=20, headers=headers, ssl=False, allow_redirects=True) as response:
277
  bookmark['etag'] = response.headers.get('ETag', 'N/A')
278
  bookmark['status_code'] = response.status
279
 
280
  content = await response.text()
281
+ logger.info(f"Fetched content length for {url}: {len(content)} characters")
282
 
283
+ # Handle status codes
284
  if response.status >= 500:
285
  # Server error, consider as dead link
286
  bookmark['dead_link'] = True
 
346
  # Prepare the prompt
347
  categories_str = ', '.join([f'"{cat}"' for cat in CATEGORIES if cat != 'Dead Link'])
348
  prompt = f"""
349
+ You are a helpful assistant that categorizes webpages.
350
+
351
  Based on the following summary, assign the most appropriate category from the list below.
352
 
353
  Summary:
 
361
 
362
  try:
363
  response = openai.ChatCompletion.create(
364
+ model='llama-3.1-70b-versatile',
365
  messages=[
366
  {"role": "user", "content": prompt}
367
  ],
 
638
  """
639
 
640
  response = openai.ChatCompletion.create(
641
+ model='llama-3.1-70b-versatile',
642
  messages=[
643
  {"role": "user", "content": prompt}
644
  ],