Thamed-Chowdhury committed
Commit 15386dc · verified · 1 parent: 6ada8e5

Update Dhaka_Tribune_Fully_Scraped.py

Files changed (1): Dhaka_Tribune_Fully_Scraped.py (+82 -81)
Dhaka_Tribune_Fully_Scraped.py CHANGED
@@ -1,82 +1,83 @@
-def get_data(number):
-    ##Necessary imports
-    from selenium import webdriver
-    from selenium.webdriver import chrome
-    from selenium.webdriver import ChromeOptions
-    import math
-    options = ChromeOptions()
-    options.add_argument("--headless=new")
-    driver = webdriver.Chrome(options=options)
-    ## Finding Elements by XPATH
-    from selenium.webdriver.common.by import By
-
-
-    driver.get("https://www.dhakatribune.com/topic/road-accident")
-
-    #### Scraping News Title and News Link ####
-    import time
-    news_list=[]
-    news_link=[]
-    publish_date=[]
-    row_counter=0
-    news_counter=0
-    for i in range(number):
-        if i==0:
-            row_counter=1
-        else:
-            row_counter=math.ceil(i/4)
-        news_counter=i%4+1
-        #time.sleep(5)
-        if (i+1)!=0 and (i+1)%20==0:
-            last_height = driver.execute_script("return document.body.scrollHeight")
-            driver.execute_script(f"window.scrollTo(0, {last_height})")
-            driver.find_element('xpath',f'/html/body/div[3]/div/div[2]/div/div/div[2]/div/div/div/div[2]/div/div[2]/button').click()
-            time.sleep(10)
-        txt=driver.find_element('xpath',f'/html/body/div[3]/div/div[2]/div/div/div[2]/div/div/div/div[2]/div/div[1]/div[{row_counter}]/div[{news_counter}]/div/div[2]/div/div/div/h2/a')
-        #publish_date.append(driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[5]/div/div/div[1]/div/div[1]/div[{i+1}]/div[1]').text)
-        news_list.append(txt.text)
-        news_link.append(txt.get_attribute("href"))
-
-    ###### Scraping Publish Date ######
-    publish_date=[]
-    for i in range (len(news_link)):
-        driver.get(news_link[i])
-        time.sleep(6)
-        driver.execute_script("window.stop();")
-        try:
-            publish_date.append(driver.find_element('xpath','/html/body/div[3]/div/div[2]/div/div/div[2]/div/div[1]/div/div[1]/div/div/div/div/div/div/div[2]/div/div/div[3]/div/div[1]/div/div[2]/span[1]').text)
-        except:
-            publish_date.append("Not available")
-
-    #### Converting the list to a pandas dataframe by converting the list to a dictionary ###
-    dict={'News Title':news_list,'News Link':news_link,'Publish Date':publish_date}
-    import pandas as pd
-    df=pd.DataFrame(dict)
-
-
-    ############################################ Description Extraction ###################################################
-
-    from newspaper import Article
-    text=[]
-    for i in range(len(df)):
-        url = df['News Link'][i]
-        article = Article(url)
-        article.download()
-        article.parse()
-
-        text.append(article.text)
-
-
-    df2=df.assign(Description=text)
-    for p in range(len(df2)):
-        if df2['Publish Date'][p]=="Not available":
-            df2.drop([p],inplace=True)
-
-    df2.reset_index(drop=True,inplace=True)
-    df2["Date + Desc"]=df2['Publish Date'] + ". News Description:"+ df2['Description']
-
-
-
-    return df2
-
+def get_data(number):
+    ##Necessary imports
+    from selenium import webdriver
+    from selenium.webdriver import chrome
+    from selenium.webdriver import ChromeOptions
+    import math
+    options = ChromeOptions()
+    options.add_argument("--headless=new")
+    driver = webdriver.Chrome(options=options)
+    ## Finding Elements by XPATH
+    from selenium.webdriver.common.by import By
+
+
+    driver.get("https://www.dhakatribune.com/topic/road-accident")
+
+    #### Scraping News Title and News Link ####
+    print(driver.current_url)
+    import time
+    news_list=[]
+    news_link=[]
+    publish_date=[]
+    row_counter=0
+    news_counter=0
+    for i in range(number):
+        if i==0:
+            row_counter=1
+        else:
+            row_counter=math.ceil(i/4)
+        news_counter=i%4+1
+        #time.sleep(5)
+        if (i+1)!=0 and (i+1)%20==0:
+            last_height = driver.execute_script("return document.body.scrollHeight")
+            driver.execute_script(f"window.scrollTo(0, {last_height})")
+            driver.find_element('xpath',f'/html/body/div[3]/div/div[2]/div/div/div[2]/div/div/div/div[2]/div/div[2]/button').click()
+            time.sleep(10)
+        txt=driver.find_element('xpath',f'/html/body/div[3]/div/div[2]/div/div/div[2]/div/div/div/div[2]/div/div[1]/div[{row_counter}]/div[{news_counter}]/div/div[2]/div/div/div/h2')
+        #publish_date.append(driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[5]/div/div/div[1]/div/div[1]/div[{i+1}]/div[1]').text)
+        news_list.append(txt.text)
+        news_link.append(txt.get_attribute("href"))
+
+    ###### Scraping Publish Date ######
+    publish_date=[]
+    for i in range (len(news_link)):
+        driver.get(news_link[i])
+        time.sleep(6)
+        driver.execute_script("window.stop();")
+        try:
+            publish_date.append(driver.find_element('xpath','/html/body/div[3]/div/div[2]/div/div/div[2]/div/div[1]/div/div[1]/div/div/div/div/div/div/div[2]/div/div/div[3]/div/div[1]/div/div[2]/span[1]').text)
+        except:
+            publish_date.append("Not available")
+
+    #### Converting the list to a pandas dataframe by converting the list to a dictionary ###
+    dict={'News Title':news_list,'News Link':news_link,'Publish Date':publish_date}
+    import pandas as pd
+    df=pd.DataFrame(dict)
+
+
+    ############################################ Description Extraction ###################################################
+
+    from newspaper import Article
+    text=[]
+    for i in range(len(df)):
+        url = df['News Link'][i]
+        article = Article(url)
+        article.download()
+        article.parse()
+
+        text.append(article.text)
+
+
+    df2=df.assign(Description=text)
+    for p in range(len(df2)):
+        if df2['Publish Date'][p]=="Not available":
+            df2.drop([p],inplace=True)
+
+    df2.reset_index(drop=True,inplace=True)
+    df2["Date + Desc"]=df2['Publish Date'] + ". News Description:"+ df2['Description']
+
+
+
+    return df2
+
 #df3.to_csv('Dhaka_Tribune_Description.txt', index=False)
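
For reference, the pagination in this function relies on fixed time.sleep pauses around the "load more" click. An explicit wait that blocks only until the button is actually clickable tends to be faster and less flaky. Below is a minimal sketch, not the commit's code: it assumes Selenium 4, reuses the button XPath from the diff, and the 20-second timeout is an arbitrary illustration.

from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

options = ChromeOptions()
options.add_argument("--headless=new")
driver = webdriver.Chrome(options=options)
driver.get("https://www.dhakatribune.com/topic/road-accident")

# Button XPath copied from the diff; it breaks if the page layout changes.
LOAD_MORE_XPATH = '/html/body/div[3]/div/div[2]/div/div/div[2]/div/div/div/div[2]/div/div[2]/button'

wait = WebDriverWait(driver, 20)  # 20 s timeout is illustrative only
button = wait.until(EC.element_to_be_clickable((By.XPATH, LOAD_MORE_XPATH)))
driver.execute_script("arguments[0].scrollIntoView();", button)  # replaces the manual scrollTo
button.click()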
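The changed selector now stops at the h2 heading, while the loop still calls get_attribute("href") on the element it returns, and a heading carries no href of its own. Assuming the link is still an anchor nested inside the heading, as the pre-change /h2/a selector implies, both title and link can be read from the heading element. A hedged sketch, with heading_xpath standing in for the long selector built in the loop:

# heading_xpath is a placeholder for the /.../h2 selector from the loop.
heading = driver.find_element(By.XPATH, heading_xpath)
title = heading.text
# Descend to the nested anchor (assumption: the <a> sits inside the <h2>,
# as the pre-change /h2/a selector implies).
anchor = heading.find_element(By.TAG_NAME, "a")
link = anchor.get_attribute("href")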
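The description pass calls article.download() and article.parse() on every link with no guard, so a single unreachable URL raises and aborts the whole batch. A defensive sketch, not the commit's code; the broad except mirrors the function's own "Not available" fallback for missing dates:

from newspaper import Article

def fetch_descriptions(links):
    # Returns one description per link; failures become "Not available"
    # so the caller can drop them the same way missing dates are dropped.
    text = []
    for url in links:
        try:
            article = Article(url)
            article.download()  # network fetch; raises on failure
            article.parse()
            text.append(article.text)
        except Exception:
            text.append("Not available")
    return text

Rows whose description comes back as "Not available" can then be filtered out alongside the missing publish dates before building the "Date + Desc" column.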