article_writer / plagiarism.py
minko186's picture
add google search and updated prompt
70d74f0
raw
history blame
3.1 kB
import asyncio
import logging
import os
import time
from urllib.parse import urlsplit

import httpx
from bs4 import BeautifulSoup
from googleapiclient.discovery import build
# English month name -> two-digit, zero-padded month number ("January" -> "01").
# Keys are inserted in calendar order.
months = {
    month_name: f"{month_number:02d}"
    for month_number, month_name in enumerate(
        (
            "January",
            "February",
            "March",
            "April",
            "May",
            "June",
            "July",
            "August",
            "September",
            "October",
            "November",
            "December",
        ),
        start=1,
    )
}

# Top-level domain suffixes, written without the leading dot.
# NOTE(review): not referenced elsewhere in this file's visible code; presumably
# the choices offered to callers for ``domains_to_skip`` - confirm.
domain_list = "com org net int edu gov mil".split()
def build_date(year=2024, month="March", day=1):
    """Return the given date as a compact ``YYYYMMDD`` string.

    Args:
        year: Calendar year; interpolated into the result as-is.
        month: English month name; must be a key of ``months``.
        day: Day of the month, as an int or any value ``int()`` accepts
            (for example ``"7"`` or ``7.0``).

    Returns:
        The year, the two-digit month and the two-digit day concatenated
        with no separators, e.g. ``"20240301"``.

    Raises:
        KeyError: If ``month`` is not a key of ``months``.
        ValueError: If ``day`` cannot be converted to an integer.
    """
    # The month is already zero-padded by the ``months`` table; pad the day the
    # same way so the result is always fixed-width. Without the padding, day 1
    # rendered as "2024031" instead of "20240301".
    return f"{year}{months[month]}{int(day):02d}"
async def get_url_data(url, client):
    """Fetch ``url`` and parse the response body as HTML.

    This is a best-effort helper: every failure yields ``None`` instead of an
    exception, so one bad URL cannot abort a batch of fetches. Failures are
    logged so that dropped URLs are no longer invisible.

    Args:
        url: Address passed to ``client.get``.
        client: Object exposing an awaitable ``get(url)`` whose result has
            ``status_code`` and ``content`` attributes (``parallel_scrap``
            passes an ``httpx.AsyncClient``).

    Returns:
        A ``BeautifulSoup`` document built with the ``"html.parser"`` backend
        when the response status is 200; ``None`` for any other status or when
        fetching/parsing raises.
    """
    log = logging.getLogger(__name__)
    try:
        response = await client.get(url)
        if response.status_code != 200:
            log.info("Skipping %s: HTTP status %s", url, response.status_code)
            return None
        return BeautifulSoup(response.content, "html.parser")
    except Exception:
        # Deliberately broad: network errors, timeouts and parser failures are
        # all treated the same way - record them and move on.
        log.warning("Failed to fetch or parse %s", url, exc_info=True)
        return None
async def parallel_scrap(urls):
    """Fetch all of ``urls`` concurrently through one shared HTTP client.

    Args:
        urls: Iterable of addresses; each one is handed to ``get_url_data``.

    Returns:
        A list with one entry per URL, in the same order as ``urls``. Each
        entry is whatever ``get_url_data`` produced for that URL; because the
        gather call uses ``return_exceptions=True``, an entry may also be an
        exception object instead of a result.
    """
    # A single client (30 second timeout) is shared by every request.
    async with httpx.AsyncClient(timeout=30) as client:
        pending = [get_url_data(url=url, client=client) for url in urls]
        return await asyncio.gather(*pending, return_exceptions=True)
def _is_skipped_domain(url, domains_to_skip):
    """Return True when the host of ``url`` contains a skipped domain as whole labels."""
    if not domains_to_skip:
        return False
    host = (urlsplit(url).hostname or "").lower()
    # Wrapping both sides in dots forces matches onto label boundaries:
    # "com" matches "www.example.com" but "co" does not, and text in the URL
    # path or query string can no longer trigger a skip.
    dotted_host = f".{host}."
    for domain in domains_to_skip:
        label = domain.strip(".").lower()
        if label and f".{label}." in dotted_host:
            return True
    return False


def google_search_urls(
    text,
    sorted_date,
    domains_to_skip,
    api_key,
    cse_id,
    **kwargs,
):
    """Query the Custom Search service and return the result URLs to scrape.

    Args:
        text: Query string, sent as the ``q`` parameter.
        sorted_date: Value sent as the ``sort`` parameter of the request.
        domains_to_skip: Iterable of domain names (e.g. ``"com"``) whose
            results are dropped, or ``None`` to keep everything. A name
            matches when it appears as one or more whole dot-separated labels
            of the result's hostname.
        api_key: Developer key used to build the ``customsearch`` v1 service.
        cse_id: Search engine id, sent as the ``cx`` parameter.
        **kwargs: Extra keyword arguments forwarded to ``cse().list``.

    Returns:
        A list of unique URLs in result order, taken from at most the first
        five search results and excluding skipped domains. Empty when the
        response has no ``items``.
    """
    service = build("customsearch", "v1", developerKey=api_key)
    # Only the first ``max_results`` hits are examined; skipped hits still
    # count toward this cap, so fewer URLs may be returned.
    max_results = 5
    results = service.cse().list(q=text, cx=cse_id, sort=sorted_date, **kwargs).execute()

    url_list = []
    for item in (results.get("items") or [])[:max_results]:
        url = item["link"]
        # skip user selected domains
        if _is_skipped_domain(url, domains_to_skip):
            continue
        if url not in url_list:
            url_list.append(url)
    return url_list
def google_search(
    input,
    sorted_date,
    domains_to_skip,
):
    """Search for ``input`` and return the page text of the top results.

    Args:
        input: Query text handed to ``google_search_urls``.
        sorted_date: Sort expression handed to ``google_search_urls``.
        domains_to_skip: Domains whose results are ignored, or ``None``.

    Returns:
        A dict mapping each successfully scraped URL to the ``.text`` of its
        parsed page. URLs that could not be fetched or parsed are omitted.

    Environment:
        ``GOOGLE_SEARCH_API_KEY`` and ``GOOGLE_SEARCH_CSE_ID`` override the
        built-in credentials when set.
    """
    # NOTE(security): the literal fallbacks are kept only so existing
    # deployments keep working. Keys committed to source control should be
    # treated as compromised - rotate them and supply replacements through the
    # environment variables instead.
    api_key = os.environ.get(
        "GOOGLE_SEARCH_API_KEY", "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
    )
    cse_id = os.environ.get("GOOGLE_SEARCH_CSE_ID", "851813e81162b4ed4")

    # get list of URLS to check
    start_time = time.perf_counter()
    url_list = google_search_urls(
        input,
        sorted_date,
        domains_to_skip,
        api_key,
        cse_id,
    )
    print("GOOGLE SEARCH PROCESSING TIME: ", time.perf_counter() - start_time)

    # Scrape URLs in list
    start_time = time.perf_counter()
    soups = asyncio.run(parallel_scrap(url_list))
    print("SCRAPING PROCESSING TIME: ", time.perf_counter() - start_time)

    result_content = {}
    for url, soup in zip(url_list, soups):
        # parallel_scrap gathers with return_exceptions=True, so an entry can
        # be an exception object; those have no ``.text`` and must be skipped
        # along with the ``None`` entries for failed fetches.
        if not soup or isinstance(soup, BaseException):
            continue
        result_content[url] = soup.text
    return result_content