Thamed-Chowdhury committed on
Commit
948ae5f
·
verified ·
1 Parent(s): 2e5199f

Update Daily_Star_fully_scraped.py

Browse files
Files changed (1) hide show
  1. Daily_Star_fully_scraped.py +85 -79
Daily_Star_fully_scraped.py CHANGED
@@ -1,79 +1,85 @@
1
- def get_data(number):
2
-
3
-
4
- ##Necessary imports
5
- from selenium import webdriver
6
- from selenium.webdriver import chrome
7
- from selenium.webdriver import ChromeOptions
8
- options = ChromeOptions()
9
- options.add_argument("--headless=new")
10
- driver = webdriver.Chrome(options=options)
11
- ## Finding Elements by XPATH
12
- from selenium.webdriver.common.by import By
13
-
14
- driver.get("https://www.thedailystar.net/tags/road-accident")
15
-
16
- """
17
- Browsing with browser open codes:
18
- ##Necessary imports
19
- from selenium import webdriver
20
- from selenium.webdriver import chrome
21
-
22
- driver = webdriver.Chrome()
23
- ## Finding Elements by XPATH
24
- from selenium.webdriver.common.by import By
25
- driver.get("https://en.prothomalo.com/topic/Road-accident")
26
- """
27
- import time
28
- news_list=[]
29
- news_link=[]
30
- publish_date=[]
31
- for i in range(number):
32
- #time.sleep(5)
33
- if (i+1)!=0 and (i+1)%10==0:
34
- last_height = driver.execute_script("return document.body.scrollHeight")
35
- driver.execute_script(f"window.scrollTo(0, {last_height-950})")
36
- driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[5]/div/div/div[1]/div/div[2]/ul/li/a').click()
37
- time.sleep(10)
38
- txt=driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[5]/div/div/div[1]/div/div[1]/div[{i+1}]/div[2]/div[2]/div[2]/h3/a')
39
- publish_date.append(driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[5]/div/div/div[1]/div/div[1]/div[{i+1}]/div[2]/div[1]').text)
40
- news_list.append(txt.text)
41
- news_link.append(txt.get_attribute("href"))
42
-
43
-
44
-
45
- #### Converting the list to a pandas dataframe by converting the list to a dictionary ###
46
- dict={'News Title':news_list,'News Link':news_link,'Publish Date':publish_date}
47
- import pandas as pd
48
- df=pd.DataFrame(dict)
49
-
50
-
51
- ############################################### Description Exctraction #################################################
52
- from newspaper import Article
53
-
54
-
55
- text=[]
56
- for i in range(len(df)):
57
- url = df['News Link'][i]
58
- article = Article(url)
59
- article.download()
60
- article.parse()
61
-
62
- text.append(article.text)
63
-
64
- df2=df.assign(Description=text)
65
-
66
-
67
- for p in range(len(df2)):
68
- if df2['Publish Date'][p]=="Not available":
69
- df2.drop([p],inplace=True)
70
- #df2.reset_index()
71
-
72
- df2.reset_index(drop=True,inplace=True)
73
-
74
- df2["Date + Desc"]=df2['Publish Date'] + ". News Description:"+ df2['Description']
75
-
76
-
77
-
78
- return df2
79
- #df3.to_csv('Prothom_Alo_Description.txt', index=False)
 
 
 
 
 
 
 
1
def get_data(number):
    """Scrape road-accident articles from The Daily Star tag page.

    Drives a headless Chrome browser over
    https://www.thedailystar.net/tags/road-accident, reading `number`
    article cards (title, link, publish date), then downloads and parses
    each article body with ``newspaper.Article``.

    Parameters
    ----------
    number : int
        How many article cards to read. After every 10th card the
        listing's "load more" control is clicked so the next batch of
        cards is rendered before it is read.

    Returns
    -------
    pandas.DataFrame
        Columns 'News Title', 'News Link', 'Publish Date',
        'Description', plus a combined 'Date + Desc' column. Rows whose
        publish date scraped as "Not available" are dropped and the
        index is reset.
    """
    # Imports stay function-local so merely importing this module does
    # not require selenium/newspaper to be installed.
    import time

    import pandas as pd
    from newspaper import Article
    from selenium import webdriver
    from selenium.webdriver import ChromeOptions

    options = ChromeOptions()
    options.add_argument("enable-automation")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-extensions")
    options.add_argument("--dns-prefetch-disable")
    options.add_argument("--disable-gpu")
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)

    news_list = []
    news_link = []
    publish_date = []
    try:
        driver.get("https://www.thedailystar.net/tags/road-accident")

        for i in range(number):
            # The listing renders 10 cards per page: at each multiple of
            # 10, scroll near the bottom and click "load more" so the
            # next batch exists in the DOM before we read it.
            if (i + 1) != 0 and (i + 1) % 10 == 0:
                last_height = driver.execute_script(
                    "return document.body.scrollHeight"
                )
                driver.execute_script(f"window.scrollTo(0, {last_height-950})")
                driver.find_element(
                    'xpath',
                    '/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[5]/div/div/div[1]/div/div[2]/ul/li/a',
                ).click()
                time.sleep(10)  # give the site time to append new cards

            # NOTE(review): absolute XPaths are brittle; they match the
            # tag page's current layout -- reconfirm after any redesign.
            txt = driver.find_element(
                'xpath',
                f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[5]/div/div/div[1]/div/div[1]/div[{i+1}]/div[2]/div[2]/div[2]/h3/a',
            )
            publish_date.append(
                driver.find_element(
                    'xpath',
                    f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[5]/div/div/div[1]/div/div[1]/div[{i+1}]/div[2]/div[1]',
                ).text
            )
            news_list.append(txt.text)
            news_link.append(txt.get_attribute("href"))
    finally:
        # Always release the browser, even when an XPath lookup fails
        # (previously a Chrome process leaked on every error).
        driver.quit()

    # Build the DataFrame; 'data' avoids shadowing the builtin `dict`.
    data = {
        'News Title': news_list,
        'News Link': news_link,
        'Publish Date': publish_date,
    }
    df = pd.DataFrame(data)

    # ----------------------- Description extraction -----------------------
    # Fetch every article body with newspaper. A failed download raises
    # here, matching the original behaviour (no silent skipping).
    text = []
    for i in range(len(df)):
        url = df['News Link'][i]
        article = Article(url)
        article.download()
        article.parse()
        text.append(article.text)

    df2 = df.assign(Description=text)

    # Drop rows whose publish date could not be scraped.
    for p in range(len(df2)):
        if df2['Publish Date'][p] == "Not available":
            df2.drop([p], inplace=True)

    df2.reset_index(drop=True, inplace=True)

    df2["Date + Desc"] = df2['Publish Date'] + ". News Description:" + df2['Description']

    return df2