|
def get_data(number): |
|
print("Running Daily_Star_Fully_Scraped") |
|
|
|
from selenium import webdriver |
|
from selenium.webdriver import chrome |
|
from selenium.webdriver import ChromeOptions |
|
options = ChromeOptions() |
|
options.add_argument("enable-automation") |
|
options.add_argument("--window-size=1920,1080") |
|
options.add_argument("--no-sandbox") |
|
options.add_argument("--disable-extensions") |
|
options.add_argument("--dns-prefetch-disable") |
|
options.add_argument("--disable-gpu") |
|
options.add_argument("--headless=new") |
|
driver = webdriver.Chrome(options=options) |
|
|
|
driver.set_page_load_timeout(10) |
|
|
|
|
|
from selenium.webdriver.common.by import By |
|
driver.get("https://www.thedailystar.net/news/bangladesh/accidents-fires") |
|
|
|
import time |
|
news_list=[] |
|
news_link=[] |
|
for i in range(2,10): |
|
txt=driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[1]/div/div/div/div[{i}]/div/div/h3/a') |
|
news_list.append(txt.text) |
|
news_link.append(txt.get_attribute("href")) |
|
|
|
number2=number-8 |
|
import time |
|
if number2>0: |
|
for i in range(number2): |
|
|
|
if (i+1)!=0 and (i+1)%10==0: |
|
last_height = driver.execute_script("return document.body.scrollHeight") |
|
driver.execute_script(f"window.scrollTo(0, {last_height-950})") |
|
driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[3]/div/div/div[1]/div/div[2]/ul/li/a').click() |
|
time.sleep(10) |
|
txt=driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[3]/div/div/div[1]/div/div[1]/div[{i+1}]/div[2]/h3/a') |
|
news_list.append(txt.text) |
|
news_link.append(txt.get_attribute("href")) |
|
|
|
for i in range(len(news_link)): |
|
from deep_translator import GoogleTranslator |
|
from goose3 import Goose |
|
from datetime import datetime |
|
g = Goose() |
|
description=[] |
|
News_title=[] |
|
publish_date=[] |
|
for i in range(len(news_link)): |
|
article = g.extract(url=news_link[i]) |
|
News_title.append(article.title) |
|
description.append(article.cleaned_text) |
|
publish_date.append(article.publish_date) |
|
|
|
formatted_dates = [datetime.fromisoformat(date).strftime('%d-%m-%Y') for date in publish_date] |
|
|
|
|
|
dict={'News Title':News_title,'News Link':news_link,'Publish Date':formatted_dates, 'Description':description} |
|
import pandas as pd |
|
df=pd.DataFrame(dict) |
|
return df |
|
|