Thamed-Chowdhury committed on
Commit
501a143
·
verified ·
1 Parent(s): ed2dff9

Update Daily_Star_fully_scraped.py

Browse files
Files changed (1) hide show
  1. Daily_Star_fully_scraped.py +92 -92
Daily_Star_fully_scraped.py CHANGED
@@ -1,92 +1,92 @@
1
def get_data(number):
    """Scrape up to ``number`` accident/fire news items from The Daily Star.

    Parameters
    ----------
    number : int
        How many headlines to collect (the first 8 come from the landing
        page; the rest are loaded via the "load more" pagination link).

    Returns
    -------
    pandas.DataFrame
        Columns: 'News Title', 'News Link', 'Publish Date', 'Description'
        and 'Date + Desc'.  Rows whose publish date could not be scraped
        are dropped and the index is reset.
    """
    print("Running Daily_Star_Fully_Scraped")
    # Imports are kept function-local (as in the original) so importing this
    # module does not require selenium/newspaper to be installed.
    import time

    import pandas as pd
    from newspaper import Article
    from selenium import webdriver
    from selenium.webdriver import ChromeOptions

    options = ChromeOptions()
    options.add_argument("enable-automation")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-extensions")
    options.add_argument("--dns-prefetch-disable")
    options.add_argument("--disable-gpu")
    # options.add_argument("--headless=new")  # headless disabled in this revision
    driver = webdriver.Chrome(options=options)
    driver.set_page_load_timeout(10)  # limit page loading time to 10 seconds

    news_list = []
    news_link = []
    try:
        driver.get("https://www.thedailystar.net/news/bangladesh/accidents-fires")

        # The first 8 headlines live in a separate container (child divs 2..9).
        for i in range(2, 10):
            anchor = driver.find_element(
                'xpath',
                f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[1]/div/div/div/div[{i}]/div/div/h3/a',
            )
            news_list.append(anchor.text)
            news_link.append(anchor.get_attribute("href"))

        # Remaining headlines are paginated: before every 10th item, scroll
        # near the bottom and click the "load more" link, then wait for the
        # next batch to render.
        remaining = number - 8
        if remaining > 0:
            for i in range(remaining):
                if (i + 1) % 10 == 0:  # (i+1) is never 0, so no extra check needed
                    last_height = driver.execute_script("return document.body.scrollHeight")
                    driver.execute_script(f"window.scrollTo(0, {last_height - 950})")
                    driver.find_element(
                        'xpath',
                        '/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[3]/div/div/div[1]/div/div[2]/ul/li/a',
                    ).click()
                    time.sleep(10)
                anchor = driver.find_element(
                    'xpath',
                    f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[3]/div/div/div[1]/div/div[1]/div[{i + 1}]/div[2]/h3/a',
                )
                news_list.append(anchor.text)
                news_link.append(anchor.get_attribute("href"))

        ###### Scraping publish date (best effort) ######
        publish_date = []
        for link in news_link:
            try:
                driver.get(link)
            except Exception:
                # Most likely a page-load timeout: back off once and retry.
                time.sleep(30)
                driver.get(link)
            time.sleep(3)
            driver.execute_script("window.stop();")  # stop any still-loading assets
            try:
                publish_date.append(
                    driver.find_element(
                        'xpath',
                        '/html/body/div[3]/div[2]/div/div/div[2]/main/div/div[2]/div[1]/div[2]/div/div[1]/div[1]/div/div/div[1]/div[2]/div[2]',
                    ).text
                )
            except Exception:
                # Sentinel value; these rows are dropped below.
                publish_date.append("Not available")
    finally:
        # The original leaked the browser process; always release it.
        driver.quit()

    # Build the frame ('data', not 'dict', so the builtin is not shadowed).
    data = {'News Title': news_list, 'News Link': news_link, 'Publish Date': publish_date}
    df = pd.DataFrame(data)
    if number <= 8:
        df = df.head(number)

    ##################################### Description Extraction #####################################
    print('Description Extraction Started')
    text = []
    for url in df['News Link']:
        article = Article(url)
        article.download()
        article.parse()
        text.append(article.text)

    df2 = df.assign(Description=text)

    # Drop rows whose publish date could not be scraped (single vectorized
    # mask instead of the original per-row drop loop).
    df2 = df2[df2['Publish Date'] != "Not available"]
    df2.reset_index(drop=True, inplace=True)

    df2["Date + Desc"] = df2['Publish Date'] + ". News Description:" + df2['Description']
    return df2
 
1
def get_data(number):
    """Scrape up to ``number`` accident/fire news items from The Daily Star.

    Runs Chrome headless. The first 8 headlines come from the landing page;
    the rest are loaded via the "load more" pagination link.

    Parameters
    ----------
    number : int
        How many headlines to collect.

    Returns
    -------
    pandas.DataFrame
        Columns: 'News Title', 'News Link', 'Publish Date', 'Description'
        and 'Date + Desc'.  Rows whose publish date could not be scraped
        are dropped and the index is reset.
    """
    print("Running Daily_Star_Fully_Scraped")
    # Imports are kept function-local (as in the original) so importing this
    # module does not require selenium/newspaper to be installed.
    import time

    import pandas as pd
    from newspaper import Article
    from selenium import webdriver
    from selenium.webdriver import ChromeOptions

    options = ChromeOptions()
    options.add_argument("enable-automation")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-extensions")
    options.add_argument("--dns-prefetch-disable")
    options.add_argument("--disable-gpu")
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)
    driver.set_page_load_timeout(10)  # limit page loading time to 10 seconds

    news_list = []
    news_link = []
    try:
        driver.get("https://www.thedailystar.net/news/bangladesh/accidents-fires")

        # The first 8 headlines live in a separate container (child divs 2..9).
        for i in range(2, 10):
            anchor = driver.find_element(
                'xpath',
                f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[1]/div/div/div/div[{i}]/div/div/h3/a',
            )
            news_list.append(anchor.text)
            news_link.append(anchor.get_attribute("href"))

        # Remaining headlines are paginated: before every 10th item, scroll
        # near the bottom and click the "load more" link, then wait for the
        # next batch to render.
        remaining = number - 8
        if remaining > 0:
            for i in range(remaining):
                if (i + 1) % 10 == 0:  # (i+1) is never 0, so no extra check needed
                    last_height = driver.execute_script("return document.body.scrollHeight")
                    driver.execute_script(f"window.scrollTo(0, {last_height - 950})")
                    driver.find_element(
                        'xpath',
                        '/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[3]/div/div/div[1]/div/div[2]/ul/li/a',
                    ).click()
                    time.sleep(10)
                anchor = driver.find_element(
                    'xpath',
                    f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[3]/div/div/div[1]/div/div[1]/div[{i + 1}]/div[2]/h3/a',
                )
                news_list.append(anchor.text)
                news_link.append(anchor.get_attribute("href"))

        ###### Scraping publish date (best effort) ######
        publish_date = []
        for link in news_link:
            try:
                driver.get(link)
            except Exception:
                # Most likely a page-load timeout: back off once and retry.
                time.sleep(30)
                driver.get(link)
            time.sleep(3)
            driver.execute_script("window.stop();")  # stop any still-loading assets
            try:
                publish_date.append(
                    driver.find_element(
                        'xpath',
                        '/html/body/div[3]/div[2]/div/div/div[2]/main/div/div[2]/div[1]/div[2]/div/div[1]/div[1]/div/div/div[1]/div[2]/div[2]',
                    ).text
                )
            except Exception:
                # Sentinel value; these rows are dropped below.
                publish_date.append("Not available")
    finally:
        # The original leaked the browser process; always release it.
        driver.quit()

    # Build the frame ('data', not 'dict', so the builtin is not shadowed).
    data = {'News Title': news_list, 'News Link': news_link, 'Publish Date': publish_date}
    df = pd.DataFrame(data)
    if number <= 8:
        df = df.head(number)

    ##################################### Description Extraction #####################################
    print('Description Extraction Started')
    text = []
    for url in df['News Link']:
        article = Article(url)
        article.download()
        article.parse()
        text.append(article.text)

    df2 = df.assign(Description=text)

    # Drop rows whose publish date could not be scraped (single vectorized
    # mask instead of the original per-row drop loop).
    df2 = df2[df2['Publish Date'] != "Not available"]
    df2.reset_index(drop=True, inplace=True)

    df2["Date + Desc"] = df2['Publish Date'] + ". News Description:" + df2['Description']
    return df2