Add1E committed on
Commit 1140b38 · verified · 1 Parent(s): e0c925d

Delete trend_crawl2.py

Files changed (1)
1. trend_crawl2.py +0 -220
trend_crawl2.py DELETED
@@ -1,220 +0,0 @@
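# Scrapes Google Trends (German locale) per category: static table data is
# parsed with BeautifulSoup, while the article links behind each row are
# fetched through Selenium clicks.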
from selenium.common.exceptions import ElementClickInterceptedException
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import time

# Configure Chrome options
def setup_driver():
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--lang=de")
    return webdriver.Chrome(options=options)
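
# Note (added remark, not in the original file): with Selenium >= 4.6 the
# matching chromedriver binary is resolved automatically via Selenium
# Manager; older setups need a chromedriver on PATH.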


def click_and_scrape(driver, url):
    """Click each category li element and scrape data for it."""
    result_dict = {}
    try:
        driver.get(url)

        # The category button can be covered by an overlay while the page
        # settles, so retry the click a few times.
        for attempt in range(4):
            try:
                button = WebDriverWait(driver, 20).until(
                    EC.element_to_be_clickable((
                        By.XPATH,
                        "//button[@aria-label='Alle Kategorien, Kategorie auswählen']"
                    ))
                )
                print("Button located.")

                driver.execute_script("arguments[0].scrollIntoView();", button)
                button.click()
                print("Button clicked successfully.")
                break
            except ElementClickInterceptedException:
                print(f"Attempt {attempt + 1}: Click intercepted. Retrying...")

        try:
            ul_element = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    "//ul[@aria-label='Kategorie']"
                ))
            )
            li_elements = ul_element.find_elements(By.TAG_NAME, "li")
        except Exception as e:
            print(f"Error locating ul_element: {e}")
            return result_dict  # li_elements is unbound past this point

        # Keep the li at index 2 and everything from index 4 onward
        # (indices 0, 1 and 3 are skipped).
        selected_elements = [li_elements[2]] + li_elements[4:]
        for index, li in enumerate(selected_elements):
            try:
                driver.execute_script("arguments[0].scrollIntoView();", li)
                driver.execute_script("arguments[0].click();", li)
                print(f"Clicked LI {index} using JavaScript.")
                time.sleep(2)
                try:
                    span = li.find_element(By.CLASS_NAME, "W7g1Rb-rymPhb-fpDzbe-fmcmS")
                    span_content = span.get_attribute("innerText")
                    print(f"Extracted span content for LI {index}: {span_content}")
                    result_dict[span_content] = scrape_google_trends(driver)
                except Exception as e:
                    print(f"Could not find or extract span content in LI {index}: {e}")
                    result_dict[f"iteration_{index}"] = []

            except Exception as e:
                print(f"Error interacting with LI {index}: {e}")

    except Exception as e:
        print(f"Error during click and scrape: {e}")

    finally:
        driver.quit()

    return result_dict

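# Shape of the structure returned by click_and_scrape (illustrative sketch;
# the category keys come from the live page, e.g. German labels):
# {
#     "<category label>": [
#         {
#             "static_data": [[...], [...], [...]],
#             "dynamic_data": {"article": [{"href": "...", "title": "..."}]}
#         },
#         ...
#     ]
# }
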
def process_selenium_row(index, rows, driver):
    """Extract dynamic data using Selenium by clicking on the row."""
    max_retries = 3
    last_error = None
    for attempt in range(max_retries):
        try:
            driver.execute_script("arguments[0].click();", rows[index])

            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "xZCHj"))
            )

            # Keep only the first three linked articles for this row.
            articles = driver.find_elements(By.CLASS_NAME, "xZCHj")[:3]
            return {
                "article": [
                    {
                        "href": article.get_attribute("href"),
                        "title": article.text
                    }
                    for article in articles
                ]
            }

        except Exception as e:
            last_error = e

    print(f"Failed to process row {index} after {max_retries} attempts: {last_error}")
    return {"article": []}

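# Caveat: process_selenium_row assumes the rows parsed by BeautifulSoup and
# the live Selenium row handles come from the same rendered DOM, so index i
# refers to the same trend entry in both.
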
def scrape_google_trends(driver):
    """Scrape data dynamically from the current page."""
    all_data = []
    try:
        WebDriverWait(driver, 2).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '[jsname="oKdM2c"]'))
        )
        # Parse the static table content from the page source with
        # BeautifulSoup, but keep live Selenium handles to the same rows
        # for the click-through in process_selenium_row.
        soup = BeautifulSoup(driver.page_source, "html.parser")
        selenium_rows = driver.find_elements(By.CSS_SELECTOR, '[jsname="oKdM2c"]')
        tables = soup.select('[jsname="cC57zf"]')

        for table in tables:
            rows_bs = table.find_all("tr")
            for index, row_bs in enumerate(rows_bs):
                static_data = [
                    [div.get_text(strip=True) for div in cell.find_all("div")]
                    for cell in row_bs.find_all("td")[1:4]
                ]
                dynamic_data = process_selenium_row(index, selenium_rows, driver)
                all_data.append({
                    "static_data": static_data,
                    "dynamic_data": dynamic_data
                })

        return all_data

    except Exception as e:
        # Dump the rendered page to ease debugging when selectors change.
        with open("page_source_debug.html", "w", encoding="utf-8") as f:
            f.write(driver.page_source)
        print(f"An error occurred during scraping: {e}")
        return []


def process_li_element(index, li_data, url):
    """Open the page in a fresh driver, click a single li element and scrape it."""
    driver = setup_driver()
    try:
        driver.get(url)
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH, "//ul[contains(@aria-label, 'Kategorie') or contains(@aria-label, 'Category')]"))
        )
        ul_element = driver.find_element(By.XPATH, "//ul[contains(@aria-label, 'Kategorie') or contains(@aria-label, 'Category')]")
        li_elements = ul_element.find_elements(By.TAG_NAME, "li")
        selected_li = li_elements[li_data['index']]
        driver.execute_script("arguments[0].scrollIntoView();", selected_li)
        driver.execute_script("arguments[0].click();", selected_li)
        time.sleep(2)

        span_content = selected_li.find_element(By.CLASS_NAME, "W7g1Rb-rymPhb-fpDzbe-fmcmS").get_attribute("innerText")
        print(f"LI {li_data['index']} clicked: {span_content}")

        data = scrape_google_trends(driver)
        return {span_content: data}
    except Exception as e:
        print(f"Error processing LI {index}: {e}")
        return {}
    finally:
        driver.quit()

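# Note on the parallelism below: WebDriver instances are not thread-safe,
# which is why each process_li_element call creates and quits its own
# driver instead of sharing one across the ThreadPoolExecutor workers.
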
def crawl_url(url):
    """Click each li element and scrape data in parallel."""
    driver = setup_driver()
    result_dict = {}
    try:
        driver.get(url)
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH, "//ul[contains(@aria-label, 'Kategorie') or contains(@aria-label, 'Category')]"))
        )

        ul_element = driver.find_element(By.XPATH, "//ul[contains(@aria-label, 'Kategorie') or contains(@aria-label, 'Category')]")
        li_elements = ul_element.find_elements(By.TAG_NAME, "li")
        # Start at index 2 and skip index 3, mirroring the selection
        # in click_and_scrape.
        selected_elements = [{"index": i} for i in range(2, len(li_elements)) if i != 3]

        with ThreadPoolExecutor() as executor:
            futures = [executor.submit(process_li_element, idx, li_data, url) for idx, li_data in enumerate(selected_elements)]
            for future in as_completed(futures):
                result_dict.update(future.result())
    except Exception as e:
        print(f"Error during click and scrape: {e}")
    finally:
        driver.quit()

    return result_dict
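
# Example usage (a minimal sketch, not part of the original file; the URL
# and output filename are illustrative assumptions):
if __name__ == "__main__":
    trends = crawl_url("https://trends.google.com/trending?geo=DE")
    with open("trends.json", "w", encoding="utf-8") as f:
        json.dump(trends, f, ensure_ascii=False, indent=2)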