raghavNCI committed on
Commit
2e99a5a
·
1 Parent(s): 89716e4

text extractor changes

Browse files
Files changed (1) hide show
  1. nuse_modules/google_search.py +13 -6
nuse_modules/google_search.py CHANGED
@@ -4,7 +4,17 @@ import os
4
  import requests
5
  import time
6
  from typing import List
7
- from boilerpy3 import extractors # ← switched library
 
 
 
 
 
 
 
 
 
 
8
 
9
  GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
10
  GOOGLE_CX_ID = os.getenv("GOOGLE_CX_ID")
@@ -14,12 +24,9 @@ article_extractor = extractors.ArticleExtractor()
14
 
15
 
16
  def extract_full_text(url: str) -> str:
17
- """
18
- Download a page and return its readable main text.
19
- Falls back to empty string on any failure.
20
- """
21
  try:
22
- return article_extractor.get_content_from_url(url) or ""
 
23
  except Exception as e:
24
  print(f"[SCRAPER ERROR] {url}: {e}")
25
  return ""
 
4
  import requests
5
  import time
6
  from typing import List
7
+ from boilerpy3 import extractors
8
+
9
+ article_extractor = extractors.ArticleExtractor()
10
+
11
+ HEADERS = {
12
+ "User-Agent": (
13
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
14
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
15
+ "Chrome/114.0.0.0 Safari/537.36"
16
+ )
17
+ }
18
 
19
  GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
20
  GOOGLE_CX_ID = os.getenv("GOOGLE_CX_ID")
 
24
 
25
 
def extract_full_text(url: str) -> str:
    """Download a page and return its readable main text.

    The page is fetched with the module-level ``HEADERS`` (a browser-like
    User-Agent) and a 10-second timeout, then handed to the boilerpy3
    ``article_extractor`` to strip boilerplate.

    Args:
        url: Address of the page to scrape.

    Returns:
        The extracted article text, or an empty string on any failure
        (network error, HTTP error status, or extraction error).
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        # Treat 4xx/5xx responses as failures so that error pages
        # (e.g. a "403 Forbidden" body) are not returned as article text.
        response.raise_for_status()
        return article_extractor.get_content(response.text) or ""
    except Exception as e:
        # Best-effort scraper: report the problem and fall back to "".
        print(f"[SCRAPER ERROR] {url}: {e}")
        return ""