from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import os
import time

complete_starttime = time.time()  # Track when the full run started

def setup_driver():
    """Set up the Selenium WebDriver."""
    script_dir = os.path.dirname(os.path.abspath(__file__))
    driver_path = os.path.join(script_dir, 'chromedriver.exe')
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--disable-gpu")
    driver = webdriver.Chrome(service=Service(driver_path), options=chrome_options)
    return driver
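
# Note: bundling chromedriver.exe next to the script is one option. On
# Selenium 4.6+, Selenium Manager can resolve a matching chromedriver
# automatically, so a minimal alternative sketch (an assumption, not what
# this script ships with) would be:
#
#     driver = webdriver.Chrome(options=chrome_options)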

def process_selenium_row(index, selenium_rows, driver):
    """Extract dynamic data using Selenium by clicking on the row."""
    max_retries = 3
    for attempt in range(max_retries):
        try:
            row = selenium_rows[index]
            row.click()
            # Wait for the linked articles (class="xZCHj") to load
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "xZCHj"))
            )
            links = driver.find_elements(By.CLASS_NAME, "xZCHj")
            dynamic_data = {
                "article": [
                    {
                        "href": link.get_attribute("href"),
                        "title": link.text
                    }
                    for link in links
                ]
            }
            if dynamic_data["article"]:
                return dynamic_data
        except Exception as e:
            print(f"Error processing row {index} (Attempt {attempt + 1}): {e}")
            # Re-fetch the rows in case the DOM changed and the references went stale
            selenium_rows = driver.find_elements(By.CSS_SELECTOR, '[jsname="oKdM2c"]')
    print(f"Failed to process row {index} after {max_retries} attempts.")
    return {"article": []}

def scrape_google_trends(driver, url):
    """Scrape Google Trends data and return the combined static/dynamic rows."""
    all_data = []
    try:
        driver.get(url)
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '[jsname="oKdM2c"]'))
        )
        # Parse the static table content with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, "html.parser")
        tables = soup.select('[jsname="cC57zf"]')
        for table in tables:
            rows_bs = table.find_all("tr")
            selenium_rows = driver.find_elements(By.CSS_SELECTOR, '[jsname="oKdM2c"]')
            for index, row_bs in enumerate(rows_bs):
                static_data = [
                    [div.get_text(strip=True) for div in cell.find_all("div")]
                    for cell in row_bs.find_all("td")[1:4]
                ]
                print(static_data)
                dynamic_data = process_selenium_row(index, selenium_rows, driver)
                combined_row = {
                    "static_data": static_data,
                    "dynamic_data": dynamic_data
                }
                all_data.append(combined_row)
        return all_data
    except Exception as e:
        print(f"An error occurred: {e}")
        return all_data
    finally:
        driver.quit()

def crawl_url(url):
    """Main function to be called from another script."""
    driver = setup_driver()
    return scrape_google_trends(driver, url)

if __name__ == "__main__":
    results = crawl_url(url="https://trends.google.com/trends/trendingsearches/daily?geo=AT&category=2")
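    # Persisting the scraped rows is not part of the original script; this is
    # a minimal sketch using the standard-library json module. The output
    # filename "trends.json" is an assumption.
    import json
    with open("trends.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)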