Thamed-Chowdhury committed
Commit 8ca1efe · verified · 1 Parent(s): 948ae5f

Update Prothom_alo_fully_scraped.py

Files changed (1)
  1. Prothom_alo_fully_scraped.py +86 -80
Prothom_alo_fully_scraped.py CHANGED
@@ -1,80 +1,86 @@
-def get_data(number):
-    print("Running Prothom_alo_fully_scraped")
-    ##Necessary imports
-    from selenium import webdriver
-    from selenium.webdriver import chrome
-    from selenium.webdriver import ChromeOptions
-    options = ChromeOptions()
-    options.add_argument("--headless=new")
-    driver = webdriver.Chrome(options=options)
-    ## Finding Elements by XPATH
-    from selenium.webdriver.common.by import By
-
-    driver.get("https://en.prothomalo.com/search?q=road%20accident%20dhaka",)
-
-    import time
-    news_list=[]
-    news_link=[]
-    l=0
-    for i in range(number):
-        if i<15:
-
-            txt=driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]/div/div/div[2]/div/h3/a')
-            news_list.append(txt.text)
-            news_link.append(txt.get_attribute("href"))
-        else:
-            if (i-15)%10==0:
-                time.sleep(5)
-                last_height = driver.execute_script("return document.body.scrollHeight")
-                driver.execute_script(f"window.scrollTo(0, {last_height-1200})")
-                try:
-
-                    driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]/span').click()
-                except:
-                    l=1
-                if l==1:
-                    time.sleep(5)
-                    try:
-                        driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]').click()
-                    except:
-                        time.sleep(5)
-                        driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]').click()
-                    l=0
-                time.sleep(5)
-            txt=driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]/div/div/div[2]/div/h3/a')
-            news_list.append(txt.text)
-            news_link.append(txt.get_attribute("href"))
-
-    ###### Scraping Publish Date and Description ######
-
-    publish_date=[]
-    text=[]
-    for i in range (len(news_link)):
-        driver.get(news_link[i])
-        try:
-            publish_date.append(driver.find_element('xpath','/html/body/div/div[6]/div/div/div/div[1]/div[1]/div[1]/div[2]/div[2]/div[1]/time/span').text)
-            tmp=""
-            elements = driver.find_elements(By.TAG_NAME, 'p')
-            for e in elements:
-                tmp=tmp+e.text
-            text.append(tmp)
-        except:
-            publish_date.append("Not available")
-            text.append("Not Available")
-        time.sleep(3)
-
-    #### Converting the list to a pandas dataframe by converting the list to a dictionary ###
-    dict={'News Title':news_list,'News Link':news_link,'Publish Date':publish_date, 'Description':text}
-    import pandas as pd
-    df=pd.DataFrame(dict)
-    df2=df.copy()
-
-
-    for p in range(len(df2)):
-        if df2['Publish Date'][p]=="Not available":
-            df2.drop([p],inplace=True)
-    #df2.reset_index()
-    df2["Date + Desc"]=df2["Publish Date"] + df2["Description"]
-    df2.reset_index(drop=True,inplace=True)
-    return df2
-    #df3.to_csv('Prothom_Alo_Description.txt', index=False)
+def get_data(number):
+    print("Running Prothom_alo_fully_scraped")
+    ##Necessary imports
+    from selenium import webdriver
+    from selenium.webdriver import chrome
+    from selenium.webdriver import ChromeOptions
+    options = ChromeOptions()
+    options.add_argument("enable-automation");
+    options.add_argument("--window-size=1920,1080");
+    options.add_argument("--no-sandbox");
+    options.add_argument("--disable-extensions");
+    options.add_argument("--dns-prefetch-disable");
+    options.add_argument("--disable-gpu");
+    options.add_argument("--headless=new")
+    driver = webdriver.Chrome(options=options)
+    ## Finding Elements by XPATH
+    from selenium.webdriver.common.by import By
+
+    driver.get("https://en.prothomalo.com/search?q=road%20accident%20dhaka",)
+
+    import time
+    news_list=[]
+    news_link=[]
+    l=0
+    for i in range(number):
+        if i<15:
+
+            txt=driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]/div/div/div[2]/div/h3/a')
+            news_list.append(txt.text)
+            news_link.append(txt.get_attribute("href"))
+        else:
+            if (i-15)%10==0:
+                time.sleep(5)
+                last_height = driver.execute_script("return document.body.scrollHeight")
+                driver.execute_script(f"window.scrollTo(0, {last_height-1200})")
+                try:
+
+                    driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]/span').click()
+                except:
+                    l=1
+                if l==1:
+                    time.sleep(5)
+                    try:
+                        driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]').click()
+                    except:
+                        time.sleep(5)
+                        driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]').click()
+                    l=0
+                time.sleep(5)
+            txt=driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]/div/div/div[2]/div/h3/a')
+            news_list.append(txt.text)
+            news_link.append(txt.get_attribute("href"))
+
+    ###### Scraping Publish Date and Description ######
+
+    publish_date=[]
+    text=[]
+    for i in range (len(news_link)):
+        driver.get(news_link[i])
+        try:
+            publish_date.append(driver.find_element('xpath','/html/body/div/div[6]/div/div/div/div[1]/div[1]/div[1]/div[2]/div[2]/div[1]/time/span').text)
+            tmp=""
+            elements = driver.find_elements(By.TAG_NAME, 'p')
+            for e in elements:
+                tmp=tmp+e.text
+            text.append(tmp)
+        except:
+            publish_date.append("Not available")
+            text.append("Not Available")
+        time.sleep(3)
+
+    #### Converting the list to a pandas dataframe by converting the list to a dictionary ###
+    dict={'News Title':news_list,'News Link':news_link,'Publish Date':publish_date, 'Description':text}
+    import pandas as pd
+    df=pd.DataFrame(dict)
+    df2=df.copy()
+
+
+    for p in range(len(df2)):
+        if df2['Publish Date'][p]=="Not available":
+            df2.drop([p],inplace=True)
+    #df2.reset_index()
+    df2["Date + Desc"]=df2["Publish Date"] + df2["Description"]
+    df2.reset_index(drop=True,inplace=True)
+    return df2
+    #df3.to_csv('Prothom_Alo_Description.txt', index=False)
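
The change itself is confined to the ChromeOptions block: the commit adds six flags (enable-automation, --window-size=1920,1080, --no-sandbox, --disable-extensions, --dns-prefetch-disable, --disable-gpu) ahead of the existing --headless=new. A minimal sketch of the resulting driver setup, isolated from the scraper and assuming Selenium 4 (where webdriver.Chrome accepts an options keyword):

    # Sketch only: reproduces the option set this commit moves to,
    # without the scraping logic.
    from selenium import webdriver
    from selenium.webdriver import ChromeOptions

    options = ChromeOptions()
    options.add_argument("enable-automation")
    options.add_argument("--window-size=1920,1080")   # fixed viewport for headless runs
    options.add_argument("--no-sandbox")              # commonly required in containers/CI
    options.add_argument("--disable-extensions")
    options.add_argument("--dns-prefetch-disable")
    options.add_argument("--disable-gpu")
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)
    driver.get("https://en.prothomalo.com/search?q=road%20accident%20dhaka")
    print(driver.title)
    driver.quit()

The fixed 1920x1080 window is the flag most likely to matter here: headless Chrome defaults to a small viewport, and a different layout may render fewer result cards, which can break the index-based absolute XPaths the script relies on.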
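Unrelated to this commit, the script still paces itself with fixed time.sleep(5) calls around the "load more" clicks. A hedged sketch of an explicit-wait alternative; click_when_ready is a hypothetical helper, not part of this repository:

    # Hypothetical helper (not in the commit): wait until the element at
    # `xpath` is clickable, then click it, instead of sleeping a fixed 5 s.
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    def click_when_ready(driver, xpath, timeout=10):
        WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable((By.XPATH, xpath))
        ).click()

An explicit wait returns as soon as the element is ready and raises TimeoutException if it never appears, so runs tend to be both faster and less flaky than fixed sleeps.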
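Also untouched by the commit: the final cleanup drops "Not available" rows one by one inside a for loop. A hypothetical equivalent using a single boolean mask (column names match the script; the dummy rows are illustration only):

    import pandas as pd

    # Two dummy rows standing in for scraped results.
    df = pd.DataFrame({
        "News Title":   ["title A", "title B"],
        "News Link":    ["link A", "link B"],
        "Publish Date": ["12 Mar 2023", "Not available"],
        "Description":  ["desc A", "desc B"],
    })
    # Keep only rows whose date was actually scraped, then rebuild the index.
    df2 = df[df["Publish Date"] != "Not available"].copy()
    df2["Date + Desc"] = df2["Publish Date"] + df2["Description"]
    df2.reset_index(drop=True, inplace=True)

Filtering with a mask avoids mutating the frame while iterating over it and sidesteps the stale positional indices that row-by-row drop() calls can leave behind.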