Update app.py
app.py CHANGED
@@ -84,16 +84,14 @@ def extract_text_from_webpage(html):
     return text
 
 # Function to perform a Google search and retrieve results
-def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None):
-    """Performs a Google search and returns the results."""
+def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None, instructions=""):
     print(f"Searching for term: {term}")
     escaped_term = urllib.parse.quote_plus(term)
     start = 0
     all_results = []
-    max_chars_per_page = 8000  # Limit the number of characters from each webpage to stay under the token limit
 
     with requests.Session() as session:
-        while …
+        while len(all_results) < num_results:
             print(f"Fetching search results starting from: {start}")
             try:
                 # Choose a random user agent
@@ -129,6 +127,8 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
                 keywords = term.split()  # Use the search term as keywords for filtering
 
                 for result in result_block:
+                    if len(all_results) >= num_results:
+                        break
                     link = result.find("a", href=True)
                     if link:
                         link = link["href"]
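Note on the two hunks above: the loop condition now counts collected results instead of tracking only the page offset, and the early break keeps a full result page from overshooting num_results. A minimal, self-contained sketch of the same pattern follows (fetch_page, page_size, and total are made-up stand-ins, not names from app.py); unlike the patch, the sketch also breaks on an empty page, since without such a guard the while loop can spin forever once the engine stops returning hits:

    # A runnable sketch of the fetch-until-enough pattern. fetch_page is a
    # hypothetical stub standing in for one paginated search request.
    def fetch_page(start, page_size=10, total=23):
        end = min(start + page_size, total)
        return [f"result-{i}" for i in range(start, end)]

    def collect_results(num_results):
        all_results = []
        start = 0
        while len(all_results) < num_results:
            page = fetch_page(start)
            if not page:  # guard against an empty page, else the loop never ends
                break
            for hit in page:
                if len(all_results) >= num_results:
                    break
                all_results.append(hit)
            start += len(page)
        return all_results

    print(collect_results(15))  # collects 15 hits across two simulated pages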
@@ -138,22 +138,49 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
                             webpage.raise_for_status()
                             visible_text = extract_text_from_webpage(webpage.text)
 
-                            # …
-
+                            # Summarize the webpage content
+                            summary = summarize_webpage(link, visible_text, term, instructions)
 
-
-                            preprocessed_text = preprocessed_text[:max_chars_per_page] + "..."
-                            all_results.append({"link": link, "text": preprocessed_text})
+                            all_results.append({"link": link, "text": summary})
                         except requests.exceptions.RequestException as e:
                             print(f"Error fetching or processing {link}: {e}")
                             all_results.append({"link": link, "text": None})
                     else:
                         print("No link found in result.")
                         all_results.append({"link": None, "text": None})
+
             start += len(result_block)
+
     print(f"Total results fetched: {len(all_results)}")
     return all_results
 
+def summarize_webpage(url, content, query, instructions, max_chars=1000):
+    # Preprocess the content
+    preprocessed_text = preprocess_web_content(content, query.split())
+
+    # Format a prompt for this specific webpage
+    webpage_prompt = f"""
+    Instructions: {instructions}
+    Query: {query}
+    URL: {url}
+
+    Webpage content:
+    {preprocessed_text}
+
+    Summarize the above content in relation to the query. Focus on relevant information and include any specific data or facts mentioned. Keep the summary concise, ideally under 200 words.
+
+    Summary:
+    """
+
+    # Generate summary using the AI model
+    summary = generate_text(webpage_prompt, temperature=0.3, repetition_penalty=1.2, top_p=0.9)
+
+    # Truncate if necessary
+    if summary and len(summary) > max_chars:
+        summary = summary[:max_chars] + "..."
+
+    return summary
+
 def preprocess_text(text):
     # Remove HTML tags
     text = BeautifulSoup(text, "html.parser").get_text()
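One nit on the new summarize_webpage: if the triple-quoted webpage_prompt is indented to match the function body (the diff view strips leading whitespace, so that is an assumption), the indentation becomes part of every prompt line the model sees. A small sketch of one way to avoid that, dedenting the template once and then filling it in; WEBPAGE_TEMPLATE and its field names are illustrative, not from app.py:

    from textwrap import dedent

    # Dedent the template before formatting so source indentation
    # never leaks into the prompt text.
    WEBPAGE_TEMPLATE = dedent("""\
        Instructions: {instructions}
        Query: {query}
        URL: {url}

        Webpage content:
        {text}

        Summarize the above content in relation to the query.

        Summary:
        """)

    prompt = WEBPAGE_TEMPLATE.format(
        instructions="Answer factually.",
        query="example query",
        url="https://example.com/article",
        text="Example page body.",
    )
    print(prompt)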
@@ -244,19 +271,19 @@ def format_prompt(query, search_results, instructions):
     formatted_results = ""
     for result in search_results:
         link = result["link"]
-        …
-        if link:
-            formatted_results += f"URL: {link}\…
+        summary = result["text"]
+        if link and summary:
+            formatted_results += f"URL: {link}\nSummary: {summary}\n{'-' * 80}\n"
         else:
-            formatted_results += "No …
+            formatted_results += "No relevant information found.\n" + '-' * 80 + '\n'
 
     prompt = f"""Instructions: {instructions}
     User Query: {query}
 
-    Web Search Results:
+    Summarized Web Search Results:
     {formatted_results}
 
-    …
+    Based on the above summarized information from multiple sources, provide a comprehensive and factual response to the user's query. Include specific dates, numbers, and sources where available. If information is conflicting or unclear, mention this in your response. Do not make assumptions or provide information that is not supported by the summaries.
 
     Assistant:"""
     return prompt
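For a quick sanity check of the new formatting branch, here is what formatted_results comes out as for one successful fetch and one failed one (sample_results is made-up data in the same shape google_search now returns):

    # Mirrors the branching in format_prompt above, on two sample entries.
    sample_results = [
        {"link": "https://example.com/a", "text": "Summary of page A."},
        {"link": None, "text": None},
    ]

    formatted_results = ""
    for result in sample_results:
        link, summary = result["link"], result["text"]
        if link and summary:
            formatted_results += f"URL: {link}\nSummary: {summary}\n{'-' * 80}\n"
        else:
            formatted_results += "No relevant information found.\n" + '-' * 80 + '\n'

    print(formatted_results)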
@@ -385,7 +412,7 @@ def save_text_to_pdf(text, output_path):
 def scrape_and_display(query, num_results, instructions, web_search=True, temperature=0.7, repetition_penalty=1.0, top_p=0.9):
     print(f"Scraping and displaying results for query: {query} with num_results: {num_results}")
     if web_search:
-        search_results = google_search(query, num_results)
+        search_results = google_search(query, num_results, instructions=instructions)
         formatted_prompt = format_prompt(query, search_results, instructions)
         generated_summary = generate_text(formatted_prompt, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
     else:
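End to end, the instructions string now flows from scrape_and_display through google_search into each per-page summary. A hypothetical invocation, assuming the rest of app.py is loaded and that scrape_and_display returns the generated summary (its return statement falls outside this diff):

    # Illustrative call; the query and instructions are made up.
    summary = scrape_and_display(
        "current state of fusion energy research",
        num_results=5,
        instructions="Cite sources and dates where possible.",
        web_search=True,
        temperature=0.7,
    )
    print(summary)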