"""Scrape search results, article text and a weather widget with Selenium.

The module drives a Chrome instance for pages that need JavaScript and
falls back to plain HTTP (requests / trafilatura) where that is enough.
"""

import urllib.parse

import requests
import trafilatura
from bs4 import BeautifulSoup
from googlesearch import search
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    WebDriverException,
)
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# NOTE(review): these three literals were empty strings in the reviewed
# source, which made every request fail (no URL scheme) and made the YouTube
# check match every URL. The values below are reconstructed from the
# Google / YouTube CSS class names used further down -- please confirm.
_GOOGLE_SEARCH_URL = "https://www.google.com/search?q="
_GOOGLE_WEATHER_URL = "https://www.google.com/search?q=weather"
_YOUTUBE_HOST = "youtube.com"

# Seconds to wait for the plain-HTTP copy of the results page.
_REQUEST_TIMEOUT = 10

# Length cap applied to an article before sentence trimming (non-deep mode).
_ARTICLE_CHAR_LIMIT = 2000

# Extraction results that are "enable JavaScript" boilerplate rather than
# real article text; such pages are skipped.
_PLACEHOLDER_TEXTS = frozenset({
    "",
    "When you have eliminated the JavaScript, whatever remains must be an "
    "empty page.",
    "When you have eliminated the\nJavaScript\n, whatever remains must be an "
    "empty page.\nEnable JavaScript to see Google Maps.",
    "Something went wrong. Wait a moment and try again.\nTry again\n"
    "Please enable Javascript and refresh the page to continue",
})

_PARTS_OF_SPEECH = frozenset({
    'noun', 'adjective', 'verb', 'adverb', 'pronoun', 'preposition',
    'conjunction', 'interjection', 'exclamation', 'numeral', 'article',
    'determiner',
})

# Selectors tried, in order, on the live page when the static HTML has no
# top result. Compound class names must be written as CSS selectors:
# By.CLASS_NAME accepts a single class name only.
_ANSWER_BOX_SELECTORS = (
    ".IZ6rdc",
    ".Z0LcW.CfV8xf",
    ".ayqGOc.kno-fb-ctx.KBXm4e",
)


def setup_driver(headless=True):
    """Create a Chrome WebDriver, installing a matching chromedriver.

    Args:
        headless: When true, Chrome is started with the 'headless' argument.

    Returns:
        The started ``webdriver.Chrome`` instance. The caller must call
        ``quit()`` on it.
    """
    chrome_options = webdriver.ChromeOptions()
    if headless:
        chrome_options.add_argument('headless')
    chrome_options.add_experimental_option(
        'excludeSwitches', ['enable-logging'])
    return webdriver.Chrome(
        service=ChromeService(ChromeDriverManager().install()),
        options=chrome_options,
    )


def scrape_article(url):
    """Download ``url`` and return its extracted main text.

    Returns:
        The text produced by ``trafilatura.extract``, or ``None`` when the
        page could not be downloaded or nothing could be extracted.
    """
    downloaded = trafilatura.fetch_url(url)
    if downloaded is None:
        # Nothing was downloaded, so there is nothing to extract.
        return None
    return trafilatura.extract(downloaded)


def _trim_article(text, deep):
    """Cut ``text`` back to its last full stop and drop any '|' prefix.

    In non-deep mode only the first ``_ARTICLE_CHAR_LIMIT`` characters are
    considered. Everything after the final '.' is discarded, and if the
    result contains '|' only the part after the last '|' is kept.
    """
    window = text if deep else text[:_ARTICLE_CHAR_LIMIT]
    sentences = window.split('.')[:-1]
    return ".".join(sentences).split("|")[-1]


def get_articles(urls, deep=False, max_articles=2):
    """Collect trimmed article texts from ``urls``.

    Args:
        urls: Iterable of page URLs, tried in order.
        deep: When true the whole article is kept; otherwise only its first
            ``_ARTICLE_CHAR_LIMIT`` characters are considered.
        max_articles: Stop after this many usable articles.

    Returns:
        A list of at most ``max_articles`` non-empty strings.
    """
    articles = []
    for url in urls:
        if len(articles) >= max_articles:
            break
        text = scrape_article(url)
        if not text or text in _PLACEHOLDER_TEXTS:
            continue
        trimmed = _trim_article(text, deep)
        # Text without a single '.' trims to nothing; do not let it use up
        # one of the article slots.
        if trimmed:
            articles.append(trimmed)
    return articles


def _collect_snippets(soup):
    """Return the text of every snippet div nested inside a snippet div."""
    snippet_class = 'BNeawe s3v9rd AP7Wnd'
    return [
        inner.text
        for outer in soup.find_all('div', class_=snippet_class)
        for inner in outer.find_all('div', class_=snippet_class)
    ]


def _find_top_result_element(soup, driver):
    """Locate the top-result element in the static HTML or the live page.

    Returns:
        A BeautifulSoup tag or a Selenium element (both expose ``.text``),
        or ``None`` when no candidate is present.
    """
    element = soup.find('div', class_='BNeawe iBp4i AP7Wnd')
    if element:
        return element
    for selector in _ANSWER_BOX_SELECTORS:
        try:
            return driver.find_element(By.CSS_SELECTOR, selector)
        except NoSuchElementException:
            continue
    return None


def _append_definition(top_result, snippets):
    """Append a dictionary-style sentence when the snippets look like one.

    The first snippet must start with a part-of-speech word; the second
    snippet is then used as the meaning. ``top_result`` may be ``None``.
    """
    if len(snippets) < 2:
        return top_result
    leading_words = snippets[0].split()
    if not leading_words or leading_words[0] not in _PARTS_OF_SPEECH:
        return top_result
    pos = leading_words[0]
    article = "an" if pos[0] in "aeiou" else "a"
    definition = f'As {article} {pos} it means {snippets[1]}'
    return f'{top_result} {definition}' if top_result else definition


def _wikipedia_summary(snippets):
    """Build a summary from a snippet whose last word is 'Wikipedia'.

    When several snippets qualify, the last one is used. Returns ``None``
    when none does.
    """
    summary = None
    for text in snippets:
        words = text.split()
        if words and words[-1] == 'Wikipedia':
            summary = f'According to Wikipedia, {" ".join(words[:-1])}'
    return summary


def _youtube_transcript(driver, urls):
    """Return the transcript text of the first URL if it is a YouTube page.

    Best effort: returns an empty string when there are no URLs, the first
    one is not on YouTube, or the browser reports an error.
    """
    if not urls or _YOUTUBE_HOST not in urls[0]:
        return ""
    try:
        driver.get(urls[0])
        segments = driver.find_elements(
            By.CLASS_NAME, "ytd-transcript-segment-renderer")
        return "\n".join(segment.text for segment in segments)
    except WebDriverException:
        return ""


def web_scraper(web_query, deep=False):
    """Search the web for ``web_query`` and summarise what was found.

    Args:
        web_query: Free-text search query.
        deep: Passed on to ``get_articles``; keeps full article text.

    Returns:
        A string of the form ``"Top Results: <answer>\\n"`` followed by one
        ``"Article <n>: <text>\\n"`` line per collected article. ``<answer>``
        is ``None`` when no top result could be determined.

    Raises:
        requests.RequestException: If the static results page cannot be
            fetched.
    """
    driver = setup_driver()
    try:
        urls = list(search(web_query, num_results=10, sleep_interval=0.1))
        url = _GOOGLE_SEARCH_URL + urllib.parse.quote(web_query)
        driver.get(url)
        source = requests.get(url, timeout=_REQUEST_TIMEOUT).text
        soup = BeautifulSoup(source, 'html.parser')

        articles = get_articles(urls, deep)
        snippets = _collect_snippets(soup)

        element = _find_top_result_element(soup, driver)
        top_result = element.text if element else None
        top_result = _append_definition(top_result, snippets)
        if not top_result:
            top_result = _wikipedia_summary(snippets) or top_result

        # A transcript, when available, replaces any other top result.
        transcript = _youtube_transcript(driver, urls)
        if transcript:
            top_result = transcript
    finally:
        # Always release the browser, including on errors.
        driver.quit()

    article_text = "".join(
        f"Article {index}: {article}\n"
        for index, article in enumerate(articles, start=1)
    )
    return f"Top Results: {top_result}\n{article_text}"


def get_weather_data():
    """Read the current weather from the search-page weather widget.

    Prints a one-line summary and returns the same data.

    Returns:
        A dict with the keys ``"location"``, ``"temperature"``,
        ``"details"`` and ``"name"``.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If an expected
            widget element is missing from the page.
        IndexError: If the widget text has fewer lines than expected.
    """
    driver = setup_driver()
    try:
        driver.get(_GOOGLE_WEATHER_URL)
        data_list = driver.find_element(
            By.CLASS_NAME, 'UQt4rd').text.split('\n')
        data_list.append(driver.find_element(By.ID, 'wob_dc').text)
        location = driver.find_element(By.CLASS_NAME, "eKPi4").text
        # NOTE(review): the source also looked up the 'wob_tci' icon URL and
        # opened it, but that statement was truncated and no visible code
        # used the result, so the download is omitted here -- confirm.
    finally:
        driver.quit()

    location = location.replace("Results for\n", "")
    # Drop the last two characters of the first line (presumably the unit
    # suffix -- verify against the live widget).
    temp = data_list[0][0:-2]
    weather_details = f'{data_list[1]}, {data_list[2]}, {data_list[3]}'
    weather_name = data_list[-1]
    print(
        f'Weather in {location} is: {temp}, {weather_details}, {weather_name}')
    return {"location": location, "temperature": temp,
            "details": weather_details, "name": weather_name}


if __name__ == "__main__":
    data = get_weather_data()
    location = data["location"]
    temperature = data["temperature"]
    details = data["details"]
    name = data["name"]
    weather = f"{location} is {name} with {temperature} and {details}"
    print(weather)
    print(web_scraper("top news"))