Shreyas094
committed on
Commit
•
b864c9d
1
Parent(s):
5239e89
Update app.py
Browse files
app.py
CHANGED
@@ -475,9 +475,9 @@ Your response should be detailed, informative, accurate, and directly relevant t
|
|
475 |
return "Error: Unable to generate a summary. Please try again."
|
476 |
|
477 |
def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_chars=3000, time_range="", language="all", category="",
|
478 |
-
engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5):
|
479 |
|
480 |
-
|
481 |
# Step 1: Rephrase the Query
|
482 |
rephrased_query = rephrase_query(chat_history, query, temperature=llm_temperature)
|
483 |
logger.info(f"Rephrased Query: {rephrased_query}")
|
@@ -489,74 +489,76 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
|
|
489 |
# Step 2: Perform search
|
490 |
if use_duckduckgo:
|
491 |
search_results = duckduckgo_search(rephrased_query, num_results, time_range, language, safesearch)
|
|
|
492 |
else:
|
493 |
-
|
494 |
-
|
495 |
-
|
496 |
-
|
497 |
-
|
498 |
-
|
499 |
-
|
500 |
-
|
501 |
-
|
502 |
-
|
|
|
|
|
|
|
503 |
|
504 |
-
|
505 |
-
|
506 |
-
|
507 |
-
|
508 |
-
|
509 |
-
|
510 |
-
|
511 |
-
|
512 |
-
|
513 |
-
|
514 |
-
|
515 |
-
|
516 |
-
|
517 |
-
|
518 |
-
|
519 |
-
|
520 |
-
|
521 |
-
|
522 |
-
'Sec-Fetch-Mode': 'cors',
|
523 |
-
'Sec-Fetch-Site': 'same-origin',
|
524 |
-
}
|
525 |
|
526 |
scraped_content = []
|
527 |
page = 1
|
528 |
while len(scraped_content) < num_results:
|
529 |
-
|
530 |
-
|
|
|
531 |
|
532 |
-
|
533 |
-
|
534 |
-
|
535 |
|
536 |
-
|
537 |
-
|
538 |
-
|
539 |
-
|
540 |
-
|
541 |
-
|
542 |
-
|
543 |
-
|
544 |
-
|
545 |
-
|
546 |
|
547 |
-
|
548 |
-
|
549 |
|
550 |
-
|
551 |
-
|
552 |
-
|
553 |
-
|
554 |
|
555 |
for result in results:
|
556 |
if len(scraped_content) >= num_results:
|
557 |
break
|
558 |
|
559 |
-
url = result.get('url', '')
|
560 |
title = result.get('title', 'No title')
|
561 |
|
562 |
if not is_valid_url(url):
|
@@ -584,7 +586,10 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
|
|
584 |
except Exception as e:
|
585 |
logger.error(f"Unexpected error while scraping {url}: {e}")
|
586 |
|
587 |
-
|
|
|
|
|
|
|
588 |
|
589 |
if not scraped_content:
|
590 |
logger.warning("No content scraped from search results.")
|
|
|
475 |
return "Error: Unable to generate a summary. Please try again."
|
476 |
|
477 |
def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_chars=3000, time_range="", language="all", category="",
|
478 |
+
engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5, use_duckduckgo=False):
|
479 |
|
480 |
+
try:
|
481 |
# Step 1: Rephrase the Query
|
482 |
rephrased_query = rephrase_query(chat_history, query, temperature=llm_temperature)
|
483 |
logger.info(f"Rephrased Query: {rephrased_query}")
|
|
|
489 |
# Step 2: Perform search
|
490 |
if use_duckduckgo:
|
491 |
search_results = duckduckgo_search(rephrased_query, num_results, time_range, language, safesearch)
|
492 |
+
results = search_results # Assign DuckDuckGo results directly
|
493 |
else:
|
494 |
+
# Search query parameters
|
495 |
+
params = {
|
496 |
+
'q': rephrased_query,
|
497 |
+
'format': 'json',
|
498 |
+
'time_range': time_range,
|
499 |
+
'language': language,
|
500 |
+
'category': category,
|
501 |
+
'engines': ','.join(engines),
|
502 |
+
'safesearch': safesearch
|
503 |
+
}
|
504 |
+
|
505 |
+
# Remove empty parameters
|
506 |
+
params = {k: v for k, v in params.items() if v != ""}
|
507 |
|
508 |
+
# If no engines are specified, set default engines
|
509 |
+
if 'engines' not in params:
|
510 |
+
params['engines'] = 'google' # Default to 'google' or any preferred engine
|
511 |
+
logger.info("No engines specified. Defaulting to 'google'.")
|
512 |
+
|
513 |
+
# Headers for SearXNG request
|
514 |
+
headers = {
|
515 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
516 |
+
'Accept': 'application/json, text/javascript, */*; q=0.01',
|
517 |
+
'Accept-Language': 'en-US,en;q=0.5',
|
518 |
+
'Origin': 'https://shreyas094-searxng-local.hf.space',
|
519 |
+
'Referer': 'https://shreyas094-searxng-local.hf.space/',
|
520 |
+
'DNT': '1',
|
521 |
+
'Connection': 'keep-alive',
|
522 |
+
'Sec-Fetch-Dest': 'empty',
|
523 |
+
'Sec-Fetch-Mode': 'cors',
|
524 |
+
'Sec-Fetch-Site': 'same-origin',
|
525 |
+
}
|
|
|
|
|
|
|
526 |
|
527 |
scraped_content = []
|
528 |
page = 1
|
529 |
while len(scraped_content) < num_results:
|
530 |
+
if not use_duckduckgo:
|
531 |
+
# Update params with current page
|
532 |
+
params['pageno'] = page
|
533 |
|
534 |
+
# Send request to SearXNG
|
535 |
+
logger.info(f"Sending request to SearXNG for query: {rephrased_query} (Page {page})")
|
536 |
+
session = requests_retry_session()
|
537 |
|
538 |
+
try:
|
539 |
+
if method.upper() == "GET":
|
540 |
+
response = session.get(SEARXNG_URL, params=params, headers=headers, timeout=10, verify=certifi.where())
|
541 |
+
else: # POST
|
542 |
+
response = session.post(SEARXNG_URL, data=params, headers=headers, timeout=10, verify=certifi.where())
|
543 |
+
|
544 |
+
response.raise_for_status()
|
545 |
+
except requests.exceptions.RequestException as e:
|
546 |
+
logger.error(f"Error during SearXNG request: {e}")
|
547 |
+
return f"An error occurred during the search request: {e}"
|
548 |
|
549 |
+
search_results = response.json()
|
550 |
+
logger.debug(f"SearXNG Response: {search_results}")
|
551 |
|
552 |
+
results = search_results.get('results', [])
|
553 |
+
if not results:
|
554 |
+
logger.warning(f"No more results returned from SearXNG on page {page}.")
|
555 |
+
break
|
556 |
|
557 |
for result in results:
|
558 |
if len(scraped_content) >= num_results:
|
559 |
break
|
560 |
|
561 |
+
url = result.get('url', '') if not use_duckduckgo else result.get('href', '')
|
562 |
title = result.get('title', 'No title')
|
563 |
|
564 |
if not is_valid_url(url):
|
|
|
586 |
except Exception as e:
|
587 |
logger.error(f"Unexpected error while scraping {url}: {e}")
|
588 |
|
589 |
+
if use_duckduckgo:
|
590 |
+
break # DuckDuckGo search doesn't support pagination in this implementation
|
591 |
+
else:
|
592 |
+
page += 1
|
593 |
|
594 |
if not scraped_content:
|
595 |
logger.warning("No content scraped from search results.")
|