# Necessary imports
import time

import pandas as pd
from newspaper import Article
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException


def get_data(number):
    # Launch a headless Chrome session
    options = ChromeOptions()
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)

    # Open the Dhaka Tribune "road accident" topic page
    driver.get("https://www.dhakatribune.com/topic/road-accident")

    #### Scraping News Title and News Link ####
    news_list = []
    news_link = []
    for i in range(number):
        # Map the running index i to the (row, column) pair used in the card's
        # XPath, assuming four article cards per row as in the original column counter.
        row_counter = i // 4 + 1
        news_counter = i % 4 + 1

        # Every 20 articles, scroll to the bottom and click the "Load More"
        # button so the next batch of cards is rendered.
        if (i + 1) % 20 == 0:
            last_height = driver.execute_script("return document.body.scrollHeight")
            driver.execute_script(f"window.scrollTo(0, {last_height})")
            driver.find_element(By.XPATH, '/html/body/div[3]/div/div[2]/div/div/div[2]/div/div/div/div[2]/div/div[2]/button').click()
            time.sleep(10)

        txt = driver.find_element(By.XPATH, f'/html/body/div[3]/div/div[2]/div/div/div[2]/div/div/div/div[2]/div/div[1]/div[{row_counter}]/div[{news_counter}]/div/div[2]/div/div/div/h2/a')
        news_list.append(txt.text)
        news_link.append(txt.get_attribute("href"))

    ###### Scraping Publish Date ######
    publish_date = []
    for link in news_link:
        driver.get(link)
        time.sleep(6)
        driver.execute_script("window.stop();")
        try:
            publish_date.append(driver.find_element(By.XPATH, '/html/body/div[3]/div/div[2]/div/div/div[2]/div/div[1]/div/div[1]/div/div/div/div/div/div/div[2]/div/div/div[3]/div/div[1]/div/div[2]/span[1]').text)
        except NoSuchElementException:
            publish_date.append("Not available")

    # Close the browser once scraping is done
    driver.quit()

    #### Converting the lists to a pandas DataFrame ####
    df = pd.DataFrame({'News Title': news_list, 'News Link': news_link, 'Publish Date': publish_date})

    ############ Description Extraction ############
    # Download and parse each article with newspaper3k to get its full text
    text = []
    for i in range(len(df)):
        url = df['News Link'][i]
        article = Article(url)
        article.download()
        article.parse()
        text.append(article.text)
    df2 = df.assign(Description=text)

    # Drop the rows whose publish date could not be scraped
    df2 = df2[df2['Publish Date'] != "Not available"].reset_index(drop=True)
    df2["Date + Desc"] = df2['Publish Date'] + ". News Description:" + df2['Description']
    return df2

# df3.to_csv('Dhaka_Tribune_Description.txt', index=False)
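
# A minimal usage sketch (an assumption, not part of the original script): it
# presumes a matching ChromeDriver is available on PATH and that selenium,
# newspaper3k, and pandas are installed. It scrapes the first 20 articles,
# previews the result, and saves it under the filename from the comment above.
if __name__ == "__main__":
    accidents_df = get_data(20)  # 20 articles is an illustrative choice
    print(accidents_df.head())
    accidents_df.to_csv('Dhaka_Tribune_Description.txt', index=False)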