Add1E commited on
Commit
95b3aff
·
verified ·
1 Parent(s): f952acb

Update trend_crawl.py

Browse files
Files changed (1) hide show
  1. trend_crawl.py +21 -14
trend_crawl.py CHANGED
@@ -31,37 +31,44 @@ def setup_driver():
31
 
32
  def process_selenium_row(index, selenium_rows, driver):
33
  """Extract dynamic data using Selenium by clicking on the row."""
34
- max_retries = 5 # Increase retries
35
  for attempt in range(max_retries):
36
  try:
37
- selenium_rows = driver.find_elements(By.CSS_SELECTOR, '[jsname="oKdM2c"]') # Refresh rows
 
38
  row = selenium_rows[index]
39
- driver.execute_script("arguments[0].click();", row) # Use JavaScript click
40
- time.sleep(1) # Adjust delay
41
-
42
- # Wait for dynamic content
43
- WebDriverWait(driver, 15).until(
44
  EC.presence_of_all_elements_located((By.CLASS_NAME, "xZCHj"))
45
  )
46
- links = driver.find_elements(By.CLASS_NAME, "xZCHj")
 
 
 
 
47
  dynamic_data = {
48
  "article": [
49
  {
50
- "href": link.get_attribute("href"),
51
- "title": link.text
52
  }
53
- for link in links
54
  ]
55
  }
56
- if dynamic_data["article"]:
57
- return dynamic_data
 
 
58
  except Exception as e:
59
  print(f"Error processing row {index} (Attempt {attempt + 1}): {e}")
60
- time.sleep(2) # Add delay before retry
61
 
62
  print(f"Failed to process row {index} after {max_retries} attempts.")
63
  return {"article": []}
64
 
 
65
  def scrape_google_trends(driver, url):
66
  """Scrape Google Trends data and save to JSON."""
67
  all_data = []
 
31
 
32
  def process_selenium_row(index, selenium_rows, driver):
33
  """Extract dynamic data using Selenium by clicking on the row."""
34
+ max_retries = 3
35
  for attempt in range(max_retries):
36
  try:
37
+ # Refresh the rows before processing
38
+ selenium_rows = driver.find_elements(By.CSS_SELECTOR, '[jsname="oKdM2c"]')
39
  row = selenium_rows[index]
40
+ driver.execute_script("arguments[0].click();", row) # Use JavaScript click for stability
41
+
42
+ # Wait for the articles to load dynamically
43
+ WebDriverWait(driver, 10).until(
 
44
  EC.presence_of_all_elements_located((By.CLASS_NAME, "xZCHj"))
45
  )
46
+
47
+ # Fetch only the newly loaded articles
48
+ articles = driver.find_elements(By.CLASS_NAME, "xZCHj")
49
+
50
+ # Extract data from the current row only
51
  dynamic_data = {
52
  "article": [
53
  {
54
+ "href": article.get_attribute("href"),
55
+ "title": article.text
56
  }
57
+ for article in articles
58
  ]
59
  }
60
+
61
+ # Clear previously fetched articles and return current ones
62
+ return dynamic_data
63
+
64
  except Exception as e:
65
  print(f"Error processing row {index} (Attempt {attempt + 1}): {e}")
66
+ time.sleep(1) # Add delay before retry
67
 
68
  print(f"Failed to process row {index} after {max_retries} attempts.")
69
  return {"article": []}
70
 
71
+
72
  def scrape_google_trends(driver, url):
73
  """Scrape Google Trends data and save to JSON."""
74
  all_data = []