Spaces:
Sleeping
Sleeping
siddhartharya
committed on
Commit
•
fb6f5e6
1
Parent(s):
85352fd
Update app.py
Browse files
app.py
CHANGED
@@ -81,29 +81,26 @@ def extract_main_content(soup):
|
|
81 |
if not soup:
|
82 |
return ""
|
83 |
|
84 |
-
# Remove
|
85 |
-
for element in soup(['script', 'style', 'header', 'footer', 'nav', '
|
86 |
element.decompose()
|
87 |
|
88 |
-
#
|
89 |
-
|
90 |
-
if
|
91 |
-
content = ' '.join([
|
92 |
else:
|
93 |
-
#
|
94 |
-
|
95 |
-
if p_tags:
|
96 |
-
content = ' '.join([p.get_text(strip=True, separator=' ') for p in p_tags])
|
97 |
-
else:
|
98 |
-
# Fallback to body content
|
99 |
-
content = soup.body.get_text(strip=True, separator=' ') if soup.body else soup.get_text(strip=True, separator=' ')
|
100 |
|
101 |
# Clean up the text
|
102 |
-
content = ' '.join(content.split())
|
103 |
content = re.sub(r'\s+', ' ', content) # Remove multiple spaces
|
104 |
-
content = re.sub(r'[\n\r\t]', ' ', content) # Remove newlines and tabs
|
105 |
|
106 |
-
#
|
|
|
|
|
|
|
|
|
107 |
return content
|
108 |
|
109 |
def get_page_metadata(soup):
|
@@ -124,7 +121,7 @@ def get_page_metadata(soup):
|
|
124 |
if title_tag and title_tag.string:
|
125 |
metadata['title'] = title_tag.string.strip()
|
126 |
|
127 |
-
# Get meta description
|
128 |
meta_desc = (
|
129 |
soup.find('meta', attrs={'name': 'description'}) or
|
130 |
soup.find('meta', attrs={'property': 'og:description'}) or
|
@@ -148,7 +145,7 @@ def get_page_metadata(soup):
|
|
148 |
|
149 |
def generate_summary(bookmark):
|
150 |
"""
|
151 |
-
Generate a
|
152 |
"""
|
153 |
logger.info(f"Generating summary for bookmark: {bookmark.get('url')}")
|
154 |
|
@@ -163,39 +160,64 @@ def generate_summary(bookmark):
|
|
163 |
main_content = extract_main_content(soup)
|
164 |
|
165 |
# Prepare content for the prompt
|
166 |
-
|
167 |
if metadata['title']:
|
168 |
-
|
169 |
if metadata['description']:
|
170 |
-
|
171 |
if metadata['keywords']:
|
172 |
-
|
173 |
if main_content:
|
174 |
-
|
175 |
|
176 |
-
content_text = '
|
177 |
|
178 |
-
#
|
179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
180 |
You are a helpful assistant that creates concise webpage summaries.
|
181 |
|
182 |
Analyze the following webpage content:
|
183 |
|
184 |
{content_text}
|
185 |
|
|
|
|
|
186 |
Provide a concise summary (2-3 sentences) focusing on:
|
187 |
- The main purpose or topic of the page.
|
188 |
- Key information or features.
|
189 |
- Target audience or use case (if apparent).
|
190 |
|
191 |
-
If the content is insufficient, use your prior knowledge about the website.
|
192 |
-
|
193 |
Be factual and objective.
|
194 |
"""
|
195 |
|
196 |
# Call the LLM via Groq Cloud API
|
197 |
response = openai.ChatCompletion.create(
|
198 |
-
model='
|
199 |
messages=[
|
200 |
{"role": "user", "content": prompt}
|
201 |
],
|
@@ -212,40 +234,7 @@ Be factual and objective.
|
|
212 |
|
213 |
except Exception as e:
|
214 |
logger.error(f"Error generating summary: {e}", exc_info=True)
|
215 |
-
|
216 |
-
try:
|
217 |
-
prompt = f"""
|
218 |
-
You are a knowledgeable assistant.
|
219 |
-
|
220 |
-
The user provided a URL: {bookmark.get('url')}
|
221 |
-
|
222 |
-
Provide a concise summary (2-3 sentences) about this website based on your knowledge.
|
223 |
-
|
224 |
-
Focus on:
|
225 |
-
- The main purpose or topic of the website.
|
226 |
-
- Key information or features.
|
227 |
-
- Target audience or use case (if apparent).
|
228 |
-
|
229 |
-
Be factual and objective.
|
230 |
-
"""
|
231 |
-
|
232 |
-
response = openai.ChatCompletion.create(
|
233 |
-
model='llama3-8b-8192', # Reverted back to the previous model
|
234 |
-
messages=[
|
235 |
-
{"role": "user", "content": prompt}
|
236 |
-
],
|
237 |
-
max_tokens=200,
|
238 |
-
temperature=0.5,
|
239 |
-
)
|
240 |
-
|
241 |
-
summary = response['choices'][0]['message']['content'].strip()
|
242 |
-
if not summary:
|
243 |
-
raise ValueError("Empty summary received from the model.")
|
244 |
-
logger.info("Successfully generated LLM summary using prior knowledge")
|
245 |
-
bookmark['summary'] = summary
|
246 |
-
except Exception as inner_e:
|
247 |
-
logger.error(f"Error generating summary using prior knowledge: {inner_e}", exc_info=True)
|
248 |
-
bookmark['summary'] = 'No summary available.'
|
249 |
return bookmark
|
250 |
|
251 |
def parse_bookmarks(file_content):
|
@@ -284,12 +273,14 @@ async def fetch_url_info(session, bookmark):
|
|
284 |
'Chrome/91.0.4472.124 Safari/537.36',
|
285 |
'Accept-Language': 'en-US,en;q=0.9',
|
286 |
}
|
287 |
-
async with session.get(url, timeout=20, headers=headers, ssl=False) as response:
|
288 |
bookmark['etag'] = response.headers.get('ETag', 'N/A')
|
289 |
bookmark['status_code'] = response.status
|
290 |
|
291 |
content = await response.text()
|
|
|
292 |
|
|
|
293 |
if response.status >= 500:
|
294 |
# Server error, consider as dead link
|
295 |
bookmark['dead_link'] = True
|
@@ -355,6 +346,8 @@ def assign_category(bookmark):
|
|
355 |
# Prepare the prompt
|
356 |
categories_str = ', '.join([f'"{cat}"' for cat in CATEGORIES if cat != 'Dead Link'])
|
357 |
prompt = f"""
|
|
|
|
|
358 |
Based on the following summary, assign the most appropriate category from the list below.
|
359 |
|
360 |
Summary:
|
@@ -368,7 +361,7 @@ Respond with only the category name.
|
|
368 |
|
369 |
try:
|
370 |
response = openai.ChatCompletion.create(
|
371 |
-
model='
|
372 |
messages=[
|
373 |
{"role": "user", "content": prompt}
|
374 |
],
|
@@ -645,7 +638,7 @@ Provide a concise and helpful response.
|
|
645 |
"""
|
646 |
|
647 |
response = openai.ChatCompletion.create(
|
648 |
-
model='
|
649 |
messages=[
|
650 |
{"role": "user", "content": prompt}
|
651 |
],
|
|
|
81 |
if not soup:
|
82 |
return ""
|
83 |
|
84 |
+
# Remove unwanted elements
|
85 |
+
for element in soup(['script', 'style', 'header', 'footer', 'nav', 'aside', 'form', 'noscript']):
|
86 |
element.decompose()
|
87 |
|
88 |
+
# Extract text from <p> tags
|
89 |
+
p_tags = soup.find_all('p')
|
90 |
+
if p_tags:
|
91 |
+
content = ' '.join([p.get_text(strip=True, separator=' ') for p in p_tags])
|
92 |
else:
|
93 |
+
# Fallback to body content
|
94 |
+
content = soup.get_text(separator=' ', strip=True)
|
|
|
|
|
|
|
|
|
|
|
95 |
|
96 |
# Clean up the text
|
|
|
97 |
content = re.sub(r'\s+', ' ', content) # Remove multiple spaces
|
|
|
98 |
|
99 |
+
# Truncate content to a reasonable length (e.g., 1500 words)
|
100 |
+
words = content.split()
|
101 |
+
if len(words) > 1500:
|
102 |
+
content = ' '.join(words[:1500])
|
103 |
+
|
104 |
return content
|
105 |
|
106 |
def get_page_metadata(soup):
|
|
|
121 |
if title_tag and title_tag.string:
|
122 |
metadata['title'] = title_tag.string.strip()
|
123 |
|
124 |
+
# Get meta description
|
125 |
meta_desc = (
|
126 |
soup.find('meta', attrs={'name': 'description'}) or
|
127 |
soup.find('meta', attrs={'property': 'og:description'}) or
|
|
|
145 |
|
146 |
def generate_summary(bookmark):
|
147 |
"""
|
148 |
+
Generate a concise summary for a bookmark using available content and LLM via the Groq Cloud API.
|
149 |
"""
|
150 |
logger.info(f"Generating summary for bookmark: {bookmark.get('url')}")
|
151 |
|
|
|
160 |
main_content = extract_main_content(soup)
|
161 |
|
162 |
# Prepare content for the prompt
|
163 |
+
content_parts = []
|
164 |
if metadata['title']:
|
165 |
+
content_parts.append(f"Title: {metadata['title']}")
|
166 |
if metadata['description']:
|
167 |
+
content_parts.append(f"Description: {metadata['description']}")
|
168 |
if metadata['keywords']:
|
169 |
+
content_parts.append(f"Keywords: {metadata['keywords']}")
|
170 |
if main_content:
|
171 |
+
content_parts.append(f"Main Content: {main_content}")
|
172 |
|
173 |
+
content_text = '\n'.join(content_parts)
|
174 |
|
175 |
+
# Detect insufficient or erroneous content
|
176 |
+
error_keywords = ['Access Denied', 'Error', 'Security Check', 'Cloudflare', 'captcha', 'unusual traffic', 'Page Not Found', '404 Not Found', 'Forbidden']
|
177 |
+
if not content_text or len(content_text.split()) < 50 or any(keyword.lower() in content_text.lower() for keyword in error_keywords):
|
178 |
+
use_prior_knowledge = True
|
179 |
+
logger.info(f"Content for {bookmark.get('url')} is insufficient or contains error messages. Instructing LLM to use prior knowledge.")
|
180 |
+
else:
|
181 |
+
use_prior_knowledge = False
|
182 |
+
|
183 |
+
if use_prior_knowledge:
|
184 |
+
# Construct prompt to use prior knowledge
|
185 |
+
prompt = f"""
|
186 |
+
You are a knowledgeable assistant.
|
187 |
+
|
188 |
+
The user provided a URL: {bookmark.get('url')}
|
189 |
+
|
190 |
+
Please provide a concise summary (2-3 sentences) about this website based on your knowledge.
|
191 |
+
|
192 |
+
Focus on:
|
193 |
+
- The main purpose or topic of the website.
|
194 |
+
- Key information or features.
|
195 |
+
- Target audience or use case (if apparent).
|
196 |
+
|
197 |
+
Be factual and objective.
|
198 |
+
"""
|
199 |
+
else:
|
200 |
+
# Construct the prompt with the extracted content
|
201 |
+
prompt = f"""
|
202 |
You are a helpful assistant that creates concise webpage summaries.
|
203 |
|
204 |
Analyze the following webpage content:
|
205 |
|
206 |
{content_text}
|
207 |
|
208 |
+
If the content is insufficient or seems to be an error page, please use your own knowledge to provide an accurate summary.
|
209 |
+
|
210 |
Provide a concise summary (2-3 sentences) focusing on:
|
211 |
- The main purpose or topic of the page.
|
212 |
- Key information or features.
|
213 |
- Target audience or use case (if apparent).
|
214 |
|
|
|
|
|
215 |
Be factual and objective.
|
216 |
"""
|
217 |
|
218 |
# Call the LLM via Groq Cloud API
|
219 |
response = openai.ChatCompletion.create(
|
220 |
+
model='llama-3.1-70b-versatile',
|
221 |
messages=[
|
222 |
{"role": "user", "content": prompt}
|
223 |
],
|
|
|
234 |
|
235 |
except Exception as e:
|
236 |
logger.error(f"Error generating summary: {e}", exc_info=True)
|
237 |
+
bookmark['summary'] = 'No summary available.'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
238 |
return bookmark
|
239 |
|
240 |
def parse_bookmarks(file_content):
|
|
|
273 |
'Chrome/91.0.4472.124 Safari/537.36',
|
274 |
'Accept-Language': 'en-US,en;q=0.9',
|
275 |
}
|
276 |
+
async with session.get(url, timeout=20, headers=headers, ssl=False, allow_redirects=True) as response:
|
277 |
bookmark['etag'] = response.headers.get('ETag', 'N/A')
|
278 |
bookmark['status_code'] = response.status
|
279 |
|
280 |
content = await response.text()
|
281 |
+
logger.info(f"Fetched content length for {url}: {len(content)} characters")
|
282 |
|
283 |
+
# Handle status codes
|
284 |
if response.status >= 500:
|
285 |
# Server error, consider as dead link
|
286 |
bookmark['dead_link'] = True
|
|
|
346 |
# Prepare the prompt
|
347 |
categories_str = ', '.join([f'"{cat}"' for cat in CATEGORIES if cat != 'Dead Link'])
|
348 |
prompt = f"""
|
349 |
+
You are a helpful assistant that categorizes webpages.
|
350 |
+
|
351 |
Based on the following summary, assign the most appropriate category from the list below.
|
352 |
|
353 |
Summary:
|
|
|
361 |
|
362 |
try:
|
363 |
response = openai.ChatCompletion.create(
|
364 |
+
model='llama-3.1-70b-versatile',
|
365 |
messages=[
|
366 |
{"role": "user", "content": prompt}
|
367 |
],
|
|
|
638 |
"""
|
639 |
|
640 |
response = openai.ChatCompletion.create(
|
641 |
+
model='llama-3.1-70b-versatile',
|
642 |
messages=[
|
643 |
{"role": "user", "content": prompt}
|
644 |
],
|