Spaces:

Thamed-Chowdhury
/

Automated_Accident_Dataset

Sleeping

App Files Files Community

Automated_Accident_Dataset / Prothom_alo_fully_scraped.py

Thamed-Chowdhury

Update Prothom_alo_fully_scraped.py

8ca1efe verified 11 months ago

raw

history blame

3.47 kB

	def get_data(number):
	print("Running Prothom_alo_fully_scraped")
	##Necessary imports
	from selenium import webdriver
	from selenium.webdriver import chrome
	from selenium.webdriver import ChromeOptions
	options = ChromeOptions()
	options.add_argument("enable-automation");
	options.add_argument("--window-size=1920,1080");
	options.add_argument("--no-sandbox");
	options.add_argument("--disable-extensions");
	options.add_argument("--dns-prefetch-disable");
	options.add_argument("--disable-gpu");
	options.add_argument("--headless=new")
	driver = webdriver.Chrome(options=options)
	## Finding Elements by XPATH
	from selenium.webdriver.common.by import By

	driver.get("https://en.prothomalo.com/search?q=road%20accident%20dhaka",)

	import time
	news_list=[]
	news_link=[]
	l=0
	for i in range(number):
	if i<15:

	txt=driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]/div/div/div[2]/div/h3/a')
	news_list.append(txt.text)
	news_link.append(txt.get_attribute("href"))
	else:
	if (i-15)%10==0:
	time.sleep(5)
	last_height = driver.execute_script("return document.body.scrollHeight")
	driver.execute_script(f"window.scrollTo(0, {last_height-1200})")
	try:

	driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]/span').click()
	except:
	l=1
	if l==1:
	time.sleep(5)
	try:
	driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]').click()
	except:
	time.sleep(5)
	driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]').click()
	l=0
	time.sleep(5)
	txt=driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]/div/div/div[2]/div/h3/a')
	news_list.append(txt.text)
	news_link.append(txt.get_attribute("href"))

	###### Scraping Publish Date and Description ######

	publish_date=[]
	text=[]
	for i in range (len(news_link)):
	driver.get(news_link[i])
	try:
	publish_date.append(driver.find_element('xpath','/html/body/div/div[6]/div/div/div/div[1]/div[1]/div[1]/div[2]/div[2]/div[1]/time/span').text)
	tmp=""
	elements = driver.find_elements(By.TAG_NAME, 'p')
	for e in elements:
	tmp=tmp+e.text
	text.append(tmp)
	except:
	publish_date.append("Not available")
	text.append("Not Available")
	time.sleep(3)

	#### Converting the list to a pandas dataframe by converting the list to a dictionary ###
	dict={'News Title':news_list,'News Link':news_link,'Publish Date':publish_date, 'Description':text}
	import pandas as pd
	df=pd.DataFrame(dict)
	df2=df.copy()


	for p in range(len(df2)):
	if df2['Publish Date'][p]=="Not available":
	df2.drop([p],inplace=True)
	#df2.reset_index()
	df2["Date + Desc"]=df2["Publish Date"] + df2["Description"]
	df2.reset_index(drop=True,inplace=True)
	return df2
	#df3.to_csv('Prothom_Alo_Description.txt', index=False)