raghavNCI
committed on
Commit
·
2e99a5a
1
Parent(s):
89716e4
text extractor changes
Browse files
nuse_modules/google_search.py
CHANGED
@@ -4,7 +4,17 @@ import os
|
|
4 |
import requests
|
5 |
import time
|
6 |
from typing import List
|
7 |
-
from boilerpy3 import extractors
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
|
10 |
GOOGLE_CX_ID = os.getenv("GOOGLE_CX_ID")
|
@@ -14,12 +24,9 @@ article_extractor = extractors.ArticleExtractor()
|
|
14 |
|
15 |
|
16 |
def extract_full_text(url: str) -> str:
|
17 |
-
"""
|
18 |
-
Download a page and return its readable main text.
|
19 |
-
Falls back to empty string on any failure.
|
20 |
-
"""
|
21 |
try:
|
22 |
-
|
|
|
23 |
except Exception as e:
|
24 |
print(f"[SCRAPER ERROR] {url}: {e}")
|
25 |
return ""
|
|
|
4 |
import requests
|
5 |
import time
|
6 |
from typing import List
|
7 |
+
from boilerpy3 import extractors
|
8 |
+
|
9 |
+
article_extractor = extractors.ArticleExtractor()
|
10 |
+
|
11 |
+
HEADERS = {
|
12 |
+
"User-Agent": (
|
13 |
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
14 |
+
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
15 |
+
"Chrome/114.0.0.0 Safari/537.36"
|
16 |
+
)
|
17 |
+
}
|
18 |
|
19 |
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
|
20 |
GOOGLE_CX_ID = os.getenv("GOOGLE_CX_ID")
|
|
|
24 |
|
25 |
|
26 |
def extract_full_text(url: str) -> str:
    """Download a page and return its readable main text.

    The page is fetched with the module-level ``HEADERS`` (a desktop-browser
    User-Agent) and a 10-second timeout, then handed to the module-level
    ``article_extractor`` to pull out the main content.

    Args:
        url: Address of the page to fetch.

    Returns:
        The extracted text, or an empty string on any failure (network
        error, non-2xx HTTP status, extraction error, or no content found).
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        # Treat HTTP error responses (403/404/5xx, ...) as failures instead of
        # extracting "article text" from the body of an error page.
        response.raise_for_status()
        return article_extractor.get_content(response.text) or ""
    except Exception as e:
        # Deliberate best-effort boundary: one bad URL must not abort the
        # caller, so report the problem and fall back to an empty string.
        print(f"[SCRAPER ERROR] {url}: {e}")
        return ""
|