Thamed-Chowdhury committed on
Commit
701b049
·
verified ·
1 Parent(s): 7c00c8d

Update Daily_Star_fully_scraped.py

Browse files
Files changed (1) hide show
  1. Daily_Star_fully_scraped.py +65 -65
Daily_Star_fully_scraped.py CHANGED
@@ -1,66 +1,66 @@
1
- def get_data(number):
2
- print("Running Daily_Star_Fully_Scraped")
3
- ##Necessary imports
4
- from selenium import webdriver
5
- from selenium.webdriver import chrome
6
- from selenium.webdriver import ChromeOptions
7
- options = ChromeOptions()
8
- options.add_argument("enable-automation")
9
- options.add_argument("--window-size=1920,1080")
10
- options.add_argument("--no-sandbox")
11
- options.add_argument("--disable-extensions")
12
- options.add_argument("--dns-prefetch-disable")
13
- options.add_argument("--disable-gpu")
14
- # options.add_argument("--headless=new")
15
- driver = webdriver.Chrome(options=options)
16
- # Set a timeout for the page to load (in seconds)
17
- driver.set_page_load_timeout(10) # Limit page loading time to 10 seconds
18
-
19
- ## Finding Elements by XPATH
20
- from selenium.webdriver.common.by import By
21
- driver.get("https://www.thedailystar.net/news/bangladesh/accidents-fires")
22
- ### Extracting first 8 news seperately
23
- import time
24
- news_list=[]
25
- news_link=[]
26
- for i in range(2,10):
27
- txt=driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[1]/div/div/div/div[{i}]/div/div/h3/a')
28
- news_list.append(txt.text)
29
- news_link.append(txt.get_attribute("href"))
30
- # Rest of the News_title and news link extraction
31
- number2=number-8
32
- import time
33
- if number2>0:
34
- for i in range(number2):
35
- #time.sleep(5)
36
- if (i+1)!=0 and (i+1)%10==0:
37
- last_height = driver.execute_script("return document.body.scrollHeight")
38
- driver.execute_script(f"window.scrollTo(0, {last_height-950})")
39
- driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[3]/div/div/div[1]/div/div[2]/ul/li/a').click()
40
- time.sleep(10)
41
- txt=driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[3]/div/div/div[1]/div/div[1]/div[{i+1}]/div[2]/h3/a')
42
- news_list.append(txt.text)
43
- news_link.append(txt.get_attribute("href"))
44
- # Goose3 extraction
45
- for i in range(len(news_link)):
46
- from deep_translator import GoogleTranslator
47
- from goose3 import Goose
48
- from datetime import datetime
49
- g = Goose()
50
- description=[]
51
- News_title=[]
52
- publish_date=[]
53
- for i in range(len(news_link)):
54
- article = g.extract(url=news_link[i])
55
- News_title.append(article.title)
56
- description.append(article.cleaned_text)
57
- publish_date.append(article.publish_date)
58
- # Convert the dates to "day-month-year" format
59
- formatted_dates = [datetime.fromisoformat(date).strftime('%d-%m-%Y') for date in publish_date]
60
-
61
- #### Converting the list to a pandas dataframe by converting the list to a dictionary ###
62
- dict={'News Title':News_title,'News Link':news_link,'Publish Date':formatted_dates, 'Description':description}
63
- import pandas as pd
64
- df=pd.DataFrame(dict)
65
- return df
66
 
 
def get_data(number):
    """Scrape accident/fire news from The Daily Star and return a DataFrame.

    Parameters
    ----------
    number : int
        How many headlines to collect. The first 8 come from the page's
        featured panel; any remainder is pulled from the paginated list.

    Returns
    -------
    pandas.DataFrame
        Columns: 'News Title', 'News Link', 'Publish Date' (dd-mm-YYYY),
        'Description'.
    """
    print("Running Daily_Star_Fully_Scraped")

    # Heavy dependencies are imported lazily so importing this module stays cheap.
    import time
    from datetime import datetime

    import pandas as pd
    from goose3 import Goose
    from selenium import webdriver
    from selenium.webdriver import ChromeOptions

    options = ChromeOptions()
    options.add_argument("enable-automation")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-extensions")
    options.add_argument("--dns-prefetch-disable")
    options.add_argument("--disable-gpu")
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)
    # Set a timeout for the page to load (in seconds)
    driver.set_page_load_timeout(10)  # limit page loading time to 10 seconds

    news_link = []
    try:
        driver.get("https://www.thedailystar.net/news/bangladesh/accidents-fires")

        # The first 8 stories sit in a separate featured block (divs 2..9).
        for i in range(2, 10):
            anchor = driver.find_element(
                'xpath',
                f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[1]/div/div/div/div[{i}]/div/div/h3/a',
            )
            news_link.append(anchor.get_attribute("href"))

        # Remaining headlines come from the paginated list below; a
        # "load more" link must be clicked after every 10 items.
        remaining = number - 8
        for i in range(max(remaining, 0)):
            if (i + 1) % 10 == 0:
                last_height = driver.execute_script("return document.body.scrollHeight")
                driver.execute_script(f"window.scrollTo(0, {last_height - 950})")
                driver.find_element(
                    'xpath',
                    '/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[3]/div/div/div[1]/div/div[2]/ul/li/a',
                ).click()
                time.sleep(10)  # give the next page of results time to render
            anchor = driver.find_element(
                'xpath',
                f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[3]/div/div/div[1]/div/div[1]/div[{i + 1}]/div[2]/h3/a',
            )
            news_link.append(anchor.get_attribute("href"))
    finally:
        # Always release the browser, even if the scrape fails midway
        # (the original leaked a Chrome process per call).
        driver.quit()

    # Goose3 extraction: pull title/body/date for each collected link.
    g = Goose()
    News_title = []
    description = []
    publish_date = []
    for url in news_link:
        article = g.extract(url=url)
        News_title.append(article.title)
        description.append(article.cleaned_text)
        publish_date.append(article.publish_date)

    # Convert the dates to "day-month-year" format.
    # NOTE(review): assumes goose3 yields an ISO-8601 *string* for every
    # article; a missing/None publish date would raise here — confirm.
    formatted_dates = [
        datetime.fromisoformat(date).strftime('%d-%m-%Y') for date in publish_date
    ]

    # Build the result frame (avoid shadowing the builtin `dict`).
    data = {
        'News Title': News_title,
        'News Link': news_link,
        'Publish Date': formatted_dates,
        'Description': description,
    }
    return pd.DataFrame(data)