import asyncio
import time

import httpx
from bs4 import BeautifulSoup
from googleapiclient.discovery import build


months = {
    "January": "01",
    "February": "02",
    "March": "03",
    "April": "04",
    "May": "05",
    "June": "06",
    "July": "07",
    "August": "08",
    "September": "09",
    "October": "10",
    "November": "11",
    "December": "12",
}

domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]


def build_date(year=2024, month="March", day=1):
    # Format as YYYYMMDD; zero-pad the day so e.g. 1 March 2024 -> "20240301"
    return f"{year}{months[month]}{day:02d}"


async def get_url_data(url, client):
    # Fetch a single URL and return its parsed HTML, or None on any failure
    try:
        r = await client.get(url)
        if r.status_code == 200:
            return BeautifulSoup(r.content, "html.parser")
    except Exception:
        pass
    return None


async def parallel_scrap(urls):
    # Fetch all URLs concurrently with a shared client; failed fetches come back as None
    async with httpx.AsyncClient(timeout=30) as client:
        tasks = [get_url_data(url=url, client=client) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)
    return results


def google_search_urls(
    text,
    sorted_date,
    domains_to_skip,
    api_key,
    cse_id,
    **kwargs,
):
    # Query the Google Custom Search API and return up to 5 de-duplicated result URLs
    service = build("customsearch", "v1", developerKey=api_key)
    max_results = 5
    results = service.cse().list(q=text, cx=cse_id, sort=sorted_date, **kwargs).execute()
    url_list = []
    for count, item in enumerate(results.get("items", [])):
        if count >= max_results:
            break
        url = item["link"]
        # skip user-selected domains
        if (domains_to_skip is not None) and any(("." + domain) in url for domain in domains_to_skip):
            continue
        if url not in url_list:
            url_list.append(url)
    return url_list


def google_search(
    input,
    sorted_date,
    domains_to_skip,
):
    # Hard-coded Custom Search API credentials; alternate keys kept commented out
    api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
    # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
    # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
    # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
    # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
    cse_id = "851813e81162b4ed4"

    # get list of URLS to check
    start_time = time.perf_counter()
    url_list = google_search_urls(
        input,
        sorted_date,
        domains_to_skip,
        api_key,
        cse_id,
    )
    print("GOOGLE SEARCH PROCESSING TIME: ", time.perf_counter() - start_time)
    # Scrape URLs in list
    start_time = time.perf_counter()
    soups = asyncio.run(parallel_scrap(url_list))
    print("SCRAPING PROCESSING TIME: ", time.perf_counter() - start_time)
    result_content = {}
    # Keep only pages that were fetched and parsed successfully (skip None and exceptions)
    for url, soup in zip(url_list, soups):
        if isinstance(soup, BeautifulSoup):
            result_content[url] = soup.text
    # for key, value in result_content.items():
    #     print("-------------------URL: ", key)
    #     print(value[:30])
    return result_content
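

# Illustrative usage sketch, not part of the original module: shows how google_search
# might be invoked end to end. The query text and the domains to skip below are
# assumptions chosen for demonstration; the sort string uses the Custom Search
# "date:r:YYYYMMDD:YYYYMMDD" date-restrict syntax built from build_date().
if __name__ == "__main__":
    query = "large language model detection"  # hypothetical query
    sort_by_date = f"date:r:{build_date(2024, 'January', 1)}:{build_date(2024, 'March', 1)}"
    skip_domains = ["org", "edu"]  # hypothetical domains to exclude, matched as ".org", ".edu"
    content = google_search(query, sort_by_date, skip_domains)
    for url in content:
        print(url)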