Shreyas094 committed
Commit 4706059
Parent(s): 9988100

Update app.py

Files changed (1):
  1. app.py +97 -83
app.py CHANGED
@@ -299,6 +299,11 @@ Please provide a comprehensive summary based on the above instructions:
         logger.error(f"Error in LLM summarization: {e}")
         return "Error: Unable to generate a summary. Please try again."
 
+import requests
+from trafilatura import extract
+from trafilatura.settings import use_config
+from urllib.request import urlopen, Request
+
 def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura", max_chars=3000, time_range="", language="all", category="",
                       engines=[], safesearch=2, method="GET", llm_temperature=0.2):
     try:
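For orientation, a call into the updated `search_and_scrape` entry point might look like the sketch below. The argument values are illustrative only and are not taken from this commit; the Gradio wiring elsewhere in app.py is assumed to supply `query` and `chat_history`.

```python
# Illustrative invocation of search_and_scrape using the defaults exposed by the
# signature above. The query string and empty chat history are made-up examples.
summary = search_and_scrape(
    query="recent central bank interest rate decisions",
    chat_history=[],
    num_results=5,
    scraper="trafilatura",
    max_chars=3000,
    method="GET",
    llm_temperature=0.2,
)
print(summary)
```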
@@ -314,7 +319,6 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
         params = {
             'q': rephrased_query,
             'format': 'json',
-            'num_results': num_results,
             'time_range': time_range,
             'language': language,
             'category': category,
@@ -344,101 +348,111 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
             'Sec-Fetch-Site': 'same-origin',
         }
 
-        # Send request to SearXNG
-        logger.info(f"Sending request to SearXNG for query: {rephrased_query}")
-        session = requests_retry_session()
-
-        try:
-            if method.upper() == "GET":
-                response = session.get(SEARXNG_URL, params=params, headers=headers, timeout=10, verify=certifi.where())
-            else:  # POST
-                response = session.post(SEARXNG_URL, data=params, headers=headers, timeout=10, verify=certifi.where())
-
-            response.raise_for_status()
-        except requests.exceptions.RequestException as e:
-            logger.error(f"Error during SearXNG request: {e}")
-            return f"An error occurred during the search request: {e}"
-
-        search_results = response.json()
-        logger.debug(f"SearXNG Response: {search_results}")
-
-        num_received = len(search_results.get('results', []))
-        logger.info(f"Received {num_received} results from SearXNG")
-
-        if num_received == 0:
-            logger.warning("No results returned from SearXNG.")
-            return "No results found for the given query."
-
-        scraped_content = []
-
-        for result in search_results.get('results', [])[:num_results]:
-            url = result.get('url', '')
-            title = result.get('title', 'No title')
-
-            if not is_valid_url(url):
-                logger.warning(f"Invalid URL: {url}")
-                continue
-
-            try:
-                logger.info(f"Scraping content from: {url}")
-
-                # Implement a retry mechanism with different user agents
-                user_agents = [
-                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-                    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
-                    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-                ]
-
-                content = ""
-                for ua in user_agents:
-                    try:
-                        if scraper == "bs4":
-                            session.headers.update({'User-Agent': ua})
-                            content = scrape_with_bs4(url, session)
-                        else:  # trafilatura
-                            # Use urllib to handle custom headers for trafilatura
-                            req = Request(url, headers={'User-Agent': ua})
-                            with urlopen(req) as response:
-                                downloaded = response.read()
-
-                            # Configure trafilatura to use a specific user agent
-                            config = use_config()
-                            config.set("DEFAULT", "USER_AGENT", ua)
-
-                            content = extract(downloaded, config=config)
-
-                        if content:
-                            break
-                    except requests.exceptions.HTTPError as e:
-                        if e.response.status_code == 403:
-                            logger.warning(f"403 Forbidden error with User-Agent: {ua}. Trying next...")
-                            continue
-                        else:
-                            raise
-                    except Exception as e:
-                        logger.error(f"Error scraping {url} with User-Agent {ua}: {str(e)}")
-                        continue
-
-                if not content:
-                    logger.warning(f"Failed to scrape content from {url} after trying multiple User-Agents")
-                    continue
-
-                # Limit content to max_chars
-                scraped_content.append({
-                    "title": title,
-                    "url": url,
-                    "content": content[:max_chars],
-                    "scraper": scraper
-                })
-            except requests.exceptions.RequestException as e:
-                logger.error(f"Error scraping {url}: {e}")
-            except Exception as e:
-                logger.error(f"Unexpected error while scraping {url}: {e}")
+        scraped_content = []
+        page = 1
+        while len(scraped_content) < num_results:
+            # Update params with current page
+            params['pageno'] = page
+
+            # Send request to SearXNG
+            logger.info(f"Sending request to SearXNG for query: {rephrased_query} (Page {page})")
+            session = requests_retry_session()
+
+            try:
+                if method.upper() == "GET":
+                    response = session.get(SEARXNG_URL, params=params, headers=headers, timeout=10, verify=certifi.where())
+                else:  # POST
+                    response = session.post(SEARXNG_URL, data=params, headers=headers, timeout=10, verify=certifi.where())
+
+                response.raise_for_status()
+            except requests.exceptions.RequestException as e:
+                logger.error(f"Error during SearXNG request: {e}")
+                return f"An error occurred during the search request: {e}"
+
+            search_results = response.json()
+            logger.debug(f"SearXNG Response: {search_results}")
+
+            results = search_results.get('results', [])
+            if not results:
+                logger.warning(f"No more results returned from SearXNG on page {page}.")
+                break
+
+            for result in results:
+                if len(scraped_content) >= num_results:
+                    break
+
+                url = result.get('url', '')
+                title = result.get('title', 'No title')
+
+                if not is_valid_url(url):
+                    logger.warning(f"Invalid URL: {url}")
+                    continue
+
+                try:
+                    logger.info(f"Scraping content from: {url}")
+
+                    # Implement a retry mechanism with different user agents
+                    user_agents = [
+                        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+                        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
+                        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+                    ]
+
+                    content = ""
+                    for ua in user_agents:
+                        try:
+                            if scraper == "bs4":
+                                session.headers.update({'User-Agent': ua})
+                                content = scrape_with_bs4(url, session)
+                            else:  # trafilatura
+                                # Use urllib to handle custom headers for trafilatura
+                                req = Request(url, headers={'User-Agent': ua})
+                                with urlopen(req) as response:
+                                    downloaded = response.read()
+
+                                # Configure trafilatura to use a specific user agent
+                                config = use_config()
+                                config.set("DEFAULT", "USER_AGENT", ua)
+
+                                content = extract(downloaded, config=config)
+
+                            if content:
+                                break
+                        except requests.exceptions.HTTPError as e:
+                            if e.response.status_code == 403:
+                                logger.warning(f"403 Forbidden error with User-Agent: {ua}. Trying next...")
+                                continue
+                            else:
+                                raise
+                        except Exception as e:
+                            logger.error(f"Error scraping {url} with User-Agent {ua}: {str(e)}")
+                            continue
+
+                    if not content:
+                        logger.warning(f"Failed to scrape content from {url} after trying multiple User-Agents")
+                        continue
+
+                    # Limit content to max_chars
+                    scraped_content.append({
+                        "title": title,
+                        "url": url,
+                        "content": content[:max_chars],
+                        "scraper": scraper
+                    })
+                    logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
+                except requests.exceptions.RequestException as e:
+                    logger.error(f"Error scraping {url}: {e}")
+                except Exception as e:
+                    logger.error(f"Unexpected error while scraping {url}: {e}")
+
+            page += 1
 
         if not scraped_content:
             logger.warning("No content scraped from search results.")
             return "No content could be scraped from the search results."
 
+        logger.info(f"Successfully scraped {len(scraped_content)} documents.")
+
         # Step 3: Assess relevance, summarize, and check for uniqueness
         relevant_documents = []
         unique_summaries = []
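The `bs4` branch above calls a `scrape_with_bs4(url, session)` helper that is defined elsewhere in app.py and is not part of this diff. A plausible sketch, assuming it simply fetches the page with the shared session and strips markup with BeautifulSoup, is:

```python
# Assumed shape of scrape_with_bs4(url, session); the real implementation in app.py
# may extract content differently.
from bs4 import BeautifulSoup

def scrape_with_bs4(url, session, timeout=10):
    """Fetch url with the given requests session and return its visible text."""
    response = session.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # Remove script/style nodes, then collapse whitespace in the remaining text.
    for tag in soup(["script", "style"]):
        tag.decompose()
    return " ".join(soup.get_text(separator=" ").split())
```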
@@ -473,13 +487,13 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
 
         logger.info(f"Reranked and filtered to top {len(reranked_docs)} unique, finance-related documents.")
 
-        # Step 5: Scrape full content for top 5 documents
-        for doc in reranked_docs[:5]:
+        # Step 5: Scrape full content for top documents (up to num_results)
+        for doc in reranked_docs[:num_results]:
            full_content = scrape_full_content(doc['url'], scraper, max_chars)
            doc['full_content'] = full_content
 
         # Step 6: LLM Summarization
-        llm_summary = llm_summarize(query, reranked_docs[:5], client, temperature=llm_temperature)
+        llm_summary = llm_summarize(query, reranked_docs[:num_results], client, temperature=llm_temperature)
 
         return llm_summary
 
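`scrape_full_content` and `llm_summarize` are also defined outside this diff. As a rough sketch under stated assumptions, `scrape_full_content(url, scraper, max_chars)` presumably re-fetches a single URL with the chosen scraper and truncates the result; the code below is an illustration, not the actual app.py implementation.

```python
# Assumed sketch of scrape_full_content(url, scraper, max_chars). The bs4 branch reuses
# the helpers sketched earlier; the trafilatura branch uses fetch_url/extract directly.
import logging
from trafilatura import fetch_url, extract

logger = logging.getLogger(__name__)

def scrape_full_content(url, scraper="trafilatura", max_chars=3000):
    """Re-scrape one URL with the selected scraper and truncate to max_chars."""
    try:
        if scraper == "bs4":
            session = requests_retry_session()       # helper defined in app.py (sketched above)
            content = scrape_with_bs4(url, session)  # helper defined in app.py (sketched above)
        else:  # trafilatura
            downloaded = fetch_url(url)
            content = extract(downloaded) if downloaded else ""
        return (content or "")[:max_chars]
    except Exception as e:
        logger.error(f"Error scraping full content from {url}: {e}")
        return ""
```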