Update app.py
app.py CHANGED
@@ -84,16 +84,14 @@ def extract_text_from_webpage(html):
     return text
 
 # Function to perform a Google search and retrieve results
-def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None):
-    """Performs a Google search and returns the results."""
+def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None, instructions=""):
     print(f"Searching for term: {term}")
     escaped_term = urllib.parse.quote_plus(term)
     start = 0
     all_results = []
-    max_chars_per_page = 8000  # Limit the number of characters from each webpage to stay under the token limit
 
     with requests.Session() as session:
-        while …
+        while len(all_results) < num_results:
             print(f"Fetching search results starting from: {start}")
             try:
                 # Choose a random user agent
@@ -129,6 +127,8 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
                 keywords = term.split()  # Use the search term as keywords for filtering
 
                 for result in result_block:
+                    if len(all_results) >= num_results:
+                        break
                     link = result.find("a", href=True)
                     if link:
                         link = link["href"]
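Note on the two hunks above: the loop condition now counts collected results instead of tracking only the page offset, and the early break keeps a full result page from overshooting num_results. A minimal, self-contained sketch of the same pattern follows (fetch_page, page_size, and total are made-up stand-ins, not names from app.py); unlike the patch, the sketch also breaks on an empty page, since without such a guard the while loop can spin forever once the engine stops returning hits:

    # A runnable sketch of the fetch-until-enough pattern. fetch_page is a
    # hypothetical stub standing in for one paginated search request.
    def fetch_page(start, page_size=10, total=23):
        end = min(start + page_size, total)
        return [f"result-{i}" for i in range(start, end)]

    def collect_results(num_results):
        all_results = []
        start = 0
        while len(all_results) < num_results:
            page = fetch_page(start)
            if not page:  # guard against an empty page, else the loop never ends
                break
            for hit in page:
                if len(all_results) >= num_results:
                    break
                all_results.append(hit)
            start += len(page)
        return all_results

    print(collect_results(15))  # collects 15 hits across two simulated pages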
@@ -138,22 +138,49 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
                             webpage.raise_for_status()
                             visible_text = extract_text_from_webpage(webpage.text)
 
-                            # …
-
+                            # Summarize the webpage content
+                            summary = summarize_webpage(link, visible_text, term, instructions)
 
-
-                            preprocessed_text = preprocessed_text[:max_chars_per_page] + "..."
-                            all_results.append({"link": link, "text": preprocessed_text})
+                            all_results.append({"link": link, "text": summary})
                         except requests.exceptions.RequestException as e:
                             print(f"Error fetching or processing {link}: {e}")
                             all_results.append({"link": link, "text": None})
                     else:
                         print("No link found in result.")
                         all_results.append({"link": None, "text": None})
+
             start += len(result_block)
+
     print(f"Total results fetched: {len(all_results)}")
     return all_results
 
+def summarize_webpage(url, content, query, instructions, max_chars=1000):
+    # Preprocess the content
+    preprocessed_text = preprocess_web_content(content, query.split())
+
+    # Format a prompt for this specific webpage
+    webpage_prompt = f"""
+    Instructions: {instructions}
+    Query: {query}
+    URL: {url}
+
+    Webpage content:
+    {preprocessed_text}
+
+    Summarize the above content in relation to the query. Focus on relevant information and include any specific data or facts mentioned. Keep the summary concise, ideally under 200 words.
+
+    Summary:
+    """
+
+    # Generate summary using the AI model
+    summary = generate_text(webpage_prompt, temperature=0.3, repetition_penalty=1.2, top_p=0.9)
+
+    # Truncate if necessary
+    if summary and len(summary) > max_chars:
+        summary = summary[:max_chars] + "..."
+
+    return summary
+
 def preprocess_text(text):
     # Remove HTML tags
     text = BeautifulSoup(text, "html.parser").get_text()
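One nit on the new summarize_webpage: if the triple-quoted webpage_prompt is indented to match the function body (the diff view strips leading whitespace, so that is an assumption), the indentation becomes part of every prompt line the model sees. A small sketch of one way to avoid that, dedenting the template once and then filling it in; WEBPAGE_TEMPLATE and its field names are illustrative, not from app.py:

    from textwrap import dedent

    # Dedent the template before formatting so source indentation
    # never leaks into the prompt text.
    WEBPAGE_TEMPLATE = dedent("""\
        Instructions: {instructions}
        Query: {query}
        URL: {url}

        Webpage content:
        {text}

        Summarize the above content in relation to the query.

        Summary:
        """)

    prompt = WEBPAGE_TEMPLATE.format(
        instructions="Answer factually.",
        query="example query",
        url="https://example.com/article",
        text="Example page body.",
    )
    print(prompt)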
@@ -244,19 +271,19 @@ def format_prompt(query, search_results, instructions):
     formatted_results = ""
     for result in search_results:
         link = result["link"]
-        …
-        if link:
-            formatted_results += f"URL: {link}\…
+        summary = result["text"]
+        if link and summary:
+            formatted_results += f"URL: {link}\nSummary: {summary}\n{'-' * 80}\n"
         else:
-            formatted_results += "No …
+            formatted_results += "No relevant information found.\n" + '-' * 80 + '\n'
 
     prompt = f"""Instructions: {instructions}
     User Query: {query}
 
-    Web Search Results:
+    Summarized Web Search Results:
     {formatted_results}
 
-    …
+    Based on the above summarized information from multiple sources, provide a comprehensive and factual response to the user's query. Include specific dates, numbers, and sources where available. If information is conflicting or unclear, mention this in your response. Do not make assumptions or provide information that is not supported by the summaries.
 
     Assistant:"""
     return prompt
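For a quick sanity check of the new formatting branch, here is what formatted_results comes out as for one successful fetch and one failed one (sample_results is made-up data in the same shape google_search now returns):

    # Mirrors the branching in format_prompt above, on two sample entries.
    sample_results = [
        {"link": "https://example.com/a", "text": "Summary of page A."},
        {"link": None, "text": None},
    ]

    formatted_results = ""
    for result in sample_results:
        link, summary = result["link"], result["text"]
        if link and summary:
            formatted_results += f"URL: {link}\nSummary: {summary}\n{'-' * 80}\n"
        else:
            formatted_results += "No relevant information found.\n" + '-' * 80 + '\n'

    print(formatted_results)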
@@ -385,7 +412,7 @@ def save_text_to_pdf(text, output_path):
 def scrape_and_display(query, num_results, instructions, web_search=True, temperature=0.7, repetition_penalty=1.0, top_p=0.9):
     print(f"Scraping and displaying results for query: {query} with num_results: {num_results}")
     if web_search:
-        search_results = google_search(query, num_results)
+        search_results = google_search(query, num_results, instructions=instructions)
         formatted_prompt = format_prompt(query, search_results, instructions)
         generated_summary = generate_text(formatted_prompt, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
     else:
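End to end, the instructions string now flows from scrape_and_display through google_search into each per-page summary. A hypothetical invocation, assuming the rest of app.py is loaded and that scrape_and_display returns the generated summary (its return statement falls outside this diff):

    # Illustrative call; the query and instructions are made up.
    summary = scrape_and_display(
        "current state of fusion energy research",
        num_results=5,
        instructions="Cite sources and dates where possible.",
        web_search=True,
        temperature=0.7,
    )
    print(summary)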