import os
import time
import asyncio
import html

import httpx
import requests
import fitz  # PyMuPDF
from dotenv import load_dotenv
from googleapiclient.discovery import build
from trafilatura import extract

load_dotenv()

API_KEY = os.environ.get("GOOGLE_SEARCH_API_KEY")
CSE_ID = os.environ.get("GOOGLE_SEARCH_CSE_ID")

# Number of pages to scrape
NUM_PAGES = 10

def build_results_beautifulsoup(url_list):
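    """Scrape each URL in url_list and return a {url: extracted_text} dict.

    Despite the name, no BeautifulSoup parsing is involved: pages are fetched
    asynchronously and readable text is pulled out with trafilatura. Only the
    first NUM_PAGES pages with more than 500 characters of text are kept.
    """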
    print("Starting to scrape URLs...")
    start_time = time.perf_counter()

    # scrape URLs in list
    soups = asyncio.run(parallel_scrap(url_list))

    scraping_time = time.perf_counter() - start_time
    print(f"Scraping processing time: {scraping_time:.2f} seconds")

    result_content = {}
    count = 0

    print("Starting to process each URL...")
    for url, soup in zip(url_list, soups):
        if count >= NUM_PAGES:
            print(f"Reached the limit of {NUM_PAGES} pages. Stopping processing.")
            break

        if soup:
            print(f"Processing URL: {url}")
            
            text = extract(
                soup,
                include_tables=False,
                include_comments=False,
                output_format="txt",
            )
            # If text is None or empty, log a warning and skip
            if text is None:
                print(f"Warning: Extraction returned None for URL: {url}")
            elif len(text) > 500:
                print(f"Adding content from URL: {url}, content length: {len(text)}")
                result_content[url] = text
                count += 1
            else:
                print(f"Skipped URL: {url}, content too short (length: {len(text)})")
        else:
            print(f"Skipped URL: {url}, no soup content available.")

    print("Finished processing URLs.")
    return result_content


def build_results_extractor(url_list):
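    """Extract page text for url_list via the ExtractorAPI service.

    Falls back to build_results_beautifulsoup() on any error (e.g. a 403 from
    the API). Returns a {url: extracted_text} dict, capped at NUM_PAGES pages.
    """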
    try:
        endpoint = "https://extractorapi.com/api/v1/extractor"
        result_content = {}
        count = 0
        for url in url_list:
            if count >= NUM_PAGES:
                break
            params = {"apikey": os.environ.get("EXTRACTOR_API_KEY"), "url": url}
            r = requests.get(endpoint, params=params, timeout=30)
            if r.status_code == 200:
                text = r.json()["text"]
                if len(text) > 500:
                    result_content[url] = text
                    count += 1
            elif r.status_code == 403:
                raise Exception("Error with API; using default implementation instead")
        return result_content

    except Exception as e:
        print(e)
        return build_results_beautifulsoup(url_list)


months = {
    "January": "01",
    "February": "02",
    "March": "03",
    "April": "04",
    "May": "05",
    "June": "06",
    "July": "07",
    "August": "08",
    "September": "09",
    "October": "10",
    "November": "11",
    "December": "12",
}

domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]
skip_urls = ["youtube.com", "twitter.com", "facebook.com", "instagram.com", "x.com"]
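# Note: this module-level skip_urls list is a default set of domains to exclude;
# google_search() does not currently pass it on to google_search_urls().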


def build_date(year=2024, month="March", day=1):
    return f"{year}{months[month]}{day}"


async def get_url_data(url, client):
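    """Fetch a single URL and return its raw content.

    Returns HTML bytes for regular pages, PDF text wrapped in minimal HTML for
    PDF responses, and None on non-200 responses or request errors.
    """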
    try:
        r = await client.get(url, follow_redirects=True)
        print(f"URL: {url}, Response Code: {r.status_code}")

        if r.status_code == 200:
            content_type = r.headers.get("Content-Type", "").lower()
            # Improved PDF detection using Content-Type and file extension
            if "application/pdf" in content_type or url.lower().endswith(".pdf"):
                print(f"Detected PDF content via Content-Type or file extension at URL: {url}")
                pdf_content = await extract_pdf_text(r.content)
                return pdf_content
            else:
                return r.content
        else:
            print(f"Non-200 response for URL: {url}, status code: {r.status_code}")
            return None
    except Exception as e:
        print(f"Error fetching URL: {url}, Error: {str(e)}")
        return None


async def extract_pdf_text(content):
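    """Extract text from PDF bytes with PyMuPDF and wrap it in minimal HTML.

    The HTML wrapper keeps the result parsable by trafilatura downstream.
    Returns None if the PDF cannot be read.
    """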
    try:
        with fitz.open(stream=content, filetype="pdf") as doc:
            text = ""
            for page in doc:
                text += page.get_text()
        # Escape the extracted text so stray "<" or "&" characters cannot break
        # the HTML wrapper that trafilatura will parse later.
        html_content = f"""
        <!DOCTYPE html>
        <html>
        <body>
            <p>{html.escape(text)}</p>
        </body>
        </html>
        """
        html_bytes = html_content.encode("utf-8")
        return html_bytes  # Return in a format that is parsable by trafilatura
    except Exception as e:
        print(f"Error extracting PDF text: {str(e)}")
        return None


async def parallel_scrap(urls):
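    """Fetch all URLs concurrently with a shared httpx.AsyncClient.

    Returns a list of results aligned with the input order (None for failures).
    """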
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    async with httpx.AsyncClient(timeout=30, headers=headers) as client:
        tasks = []
        for url in urls:
            tasks.append(get_url_data(url=url, client=client))
        results = await asyncio.gather(*tasks, return_exceptions=True)
    return results


def scrap(urls):
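    """Synchronous convenience wrapper around parallel_scrap()."""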
    # get_url_data() is a coroutine, so it must be awaited; run the async
    # scraper to completion and return its results.
    return asyncio.run(parallel_scrap(urls))


def google_search_urls(
    text,
    sorted_date,
    domains_to_include,
    api_key,
    cse_id,
    num_results=10,  # Number of results to fetch per page
    total_results=30,  # Total number of results to fetch
    skip_urls=None,  # List of URLs to skip
    **kwargs,
):
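    """Collect up to total_results URLs from Google Custom Search.

    Pages through the CSE API num_results at a time, skipping URLs that match
    an entry in skip_urls and keeping only URLs that contain one of the
    domains_to_include endings (e.g. ".com"); if domains_to_include is None,
    all URLs are kept.
    """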
    if skip_urls is None:
        skip_urls = []  # Initialize as empty list if not provided
    
    service = build("customsearch", "v1", developerKey=api_key)
    url_list = []
    start_index = 1  # Initial index for the search results
    while len(url_list) < total_results:
        # Fetch a page of results
        results = service.cse().list(
            q=text,
            cx=cse_id,
            sort=sorted_date,
            start=start_index,
            num=min(num_results, total_results - len(url_list)),
            **kwargs
        ).execute()
        
        if "items" in results and len(results["items"]) > 0:
            for count, link in enumerate(results["items"]):
                url = link["link"]
                # Skip if the URL is in the skip_urls list or doesn't match the domain filter
                if url in skip_urls:
                    continue
                if (domains_to_include is None) or any(
                    ("." + domain) in url for domain in domains_to_include
                ):
                    if url not in url_list:
                        url_list.append(url)
        else:
            # No more results
            break
        
        # Move to the next page of results
        start_index += num_results
        
    return url_list[:total_results]


def google_search(topic, sorted_date, domains_to_include, scholar_mode_check):
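    """Run a Google Custom Search for topic and return {url: extracted_text}.

    sorted_date uses the CSE date-restrict syntax (e.g. "date:r:20240101:20241231");
    when scholar_mode_check is set, PDF results are excluded from the query.
    """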
    api_key = os.environ.get("GOOGLE_SEARCH_API_KEY")
    cse_id = os.environ.get("GOOGLE_SEARCH_CSE_ID")
    start_time = time.perf_counter()
    if scholar_mode_check:
        topic += " -filetype:pdf"
    url_list = google_search_urls(
        topic,
        sorted_date,
        domains_to_include,
        api_key,
        cse_id,
    )
    print("---")
    print(len(url_list))
    print(url_list)
    print("---")
    print("Google Search processing time: ", time.perf_counter() - start_time)
    result_content = build_results_beautifulsoup(url_list)
    return result_content


if __name__ == "__main__":
    res = google_search("Low Resource ", "date:r:20240101:20241231", domain_list, False)
    print(res.keys())
    print(len(res))