from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import os
import time

complete_starttime = time.time()  # Track when the full run started

def setup_driver():
    """Set up the Selenium WebDriver."""
    script_dir = os.path.dirname(os.path.abspath(__file__))
    driver_path = os.path.join(script_dir, 'chromedriver.exe')
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--disable-gpu")
    driver = webdriver.Chrome(service=Service(driver_path), options=chrome_options)
    return driver
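
# Note: bundling chromedriver.exe next to the script is one option. On
# Selenium 4.6+, Selenium Manager can resolve a matching chromedriver
# automatically, so a minimal alternative sketch (an assumption, not what
# this script ships with) would be:
#
#     driver = webdriver.Chrome(options=chrome_options)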

def process_selenium_row(index, selenium_rows, driver):
    """Extract dynamic data using Selenium by clicking on the row."""
    max_retries = 3
    for attempt in range(max_retries):
        try:
            row = selenium_rows[index]
            row.click()
            # Wait for the linked articles (class="xZCHj") to load
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "xZCHj"))
            )
            links = driver.find_elements(By.CLASS_NAME, "xZCHj")
            dynamic_data = {
                "article": [
                    {
                        "href": link.get_attribute("href"),
                        "title": link.text
                    }
                    for link in links
                ]
            }
            if dynamic_data["article"]:
                return dynamic_data
        except Exception as e:
            print(f"Error processing row {index} (Attempt {attempt + 1}): {e}")
            # Re-fetch the rows in case the DOM changed and the references went stale
            selenium_rows = driver.find_elements(By.CSS_SELECTOR, '[jsname="oKdM2c"]')
    print(f"Failed to process row {index} after {max_retries} attempts.")
    return {"article": []}

def scrape_google_trends(driver, url):
    """Scrape Google Trends data and return the combined static/dynamic rows."""
    all_data = []
    try:
        driver.get(url)
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '[jsname="oKdM2c"]'))
        )
        # Parse the static table content with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, "html.parser")
        tables = soup.select('[jsname="cC57zf"]')
        for table in tables:
            rows_bs = table.find_all("tr")
            selenium_rows = driver.find_elements(By.CSS_SELECTOR, '[jsname="oKdM2c"]')
            for index, row_bs in enumerate(rows_bs):
                static_data = [
                    [div.get_text(strip=True) for div in cell.find_all("div")]
                    for cell in row_bs.find_all("td")[1:4]
                ]
                print(static_data)
                dynamic_data = process_selenium_row(index, selenium_rows, driver)
                combined_row = {
                    "static_data": static_data,
                    "dynamic_data": dynamic_data
                }
                all_data.append(combined_row)
        return all_data
    except Exception as e:
        print(f"An error occurred: {e}")
        return all_data
    finally:
        driver.quit()

def crawl_url(url):
    """Main function to be called from another script."""
    driver = setup_driver()
    return scrape_google_trends(driver, url)

if __name__ == "__main__":
    results = crawl_url(url="https://trends.google.com/trends/trendingsearches/daily?geo=AT&category=2")
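    # Persisting the scraped rows is not part of the original script; this is
    # a minimal sketch using the standard-library json module. The output
    # filename "trends.json" is an assumption.
    import json
    with open("trends.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)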