Spaces:

tensora
/

webcrawler

Running

App Files Files Community

Add1E commited on Jan 13

Commit

95b3aff

verified ·

1 Parent(s): f952acb

Update trend_crawl.py

Browse files

Files changed (1) hide show

trend_crawl.py +21 -14

trend_crawl.py CHANGED Viewed

@@ -31,37 +31,44 @@ def setup_driver():
 def process_selenium_row(index, selenium_rows, driver):
     """Extract dynamic data using Selenium by clicking on the row."""
-    max_retries = 5  # Increase retries
     for attempt in range(max_retries):
         try:
-            selenium_rows = driver.find_elements(By.CSS_SELECTOR, '[jsname="oKdM2c"]')  # Refresh rows
             row = selenium_rows[index]
-            driver.execute_script("arguments[0].click();", row)  # Use JavaScript click
-            time.sleep(1)  # Adjust delay
-            # Wait for dynamic content
-            WebDriverWait(driver, 15).until(
                 EC.presence_of_all_elements_located((By.CLASS_NAME, "xZCHj"))
             )
-            links = driver.find_elements(By.CLASS_NAME, "xZCHj")
             dynamic_data = {
                 "article": [
                     {
-                        "href": link.get_attribute("href"),
-                        "title": link.text
                     }
-                    for link in links
                 ]
             }
-            if dynamic_data["article"]:
-                return dynamic_data
         except Exception as e:
             print(f"Error processing row {index} (Attempt {attempt + 1}): {e}")
-            time.sleep(2)  # Add delay before retry
     print(f"Failed to process row {index} after {max_retries} attempts.")
     return {"article": []}
 def scrape_google_trends(driver, url):
     """Scrape Google Trends data and save to JSON."""
     all_data = []

 def process_selenium_row(index, selenium_rows, driver):
     """Extract dynamic data using Selenium by clicking on the row."""
+    max_retries = 3
     for attempt in range(max_retries):
         try:
+            # Refresh the rows before processing
+            selenium_rows = driver.find_elements(By.CSS_SELECTOR, '[jsname="oKdM2c"]')
             row = selenium_rows[index]
+            driver.execute_script("arguments[0].click();", row)  # Use JavaScript click for stability
+            # Wait for the articles to load dynamically
+            WebDriverWait(driver, 10).until(
                 EC.presence_of_all_elements_located((By.CLASS_NAME, "xZCHj"))
             )
+            # Fetch only the newly loaded articles
+            articles = driver.find_elements(By.CLASS_NAME, "xZCHj")
+            # Extract data from the current row only
             dynamic_data = {
                 "article": [
                     {
+                        "href": article.get_attribute("href"),
+                        "title": article.text
                     }
+                    for article in articles
                 ]
             }
+            # Clear previously fetched articles and return current ones
+            return dynamic_data
         except Exception as e:
             print(f"Error processing row {index} (Attempt {attempt + 1}): {e}")
+            time.sleep(1)  # Add delay before retry
     print(f"Failed to process row {index} after {max_retries} attempts.")
     return {"article": []}
 def scrape_google_trends(driver, url):
     """Scrape Google Trends data and save to JSON."""
     all_data = []