Spaces:

Thamed-Chowdhury
/

Automated_Accident_Dataset

Sleeping

File size: 3,264 Bytes

0ecab8f

def get_data(number):
    print("Running Prothom_alo_fully_scraped")
    ##Necessary imports
    from selenium import webdriver
    from selenium.webdriver import chrome
    from selenium.webdriver import ChromeOptions
    options = ChromeOptions()
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)
    ## Finding Elements by XPATH
    from selenium.webdriver.common.by import By

    driver.get("https://en.prothomalo.com/search?q=road%20accident%20dhaka",)

    import time
    news_list=[]
    news_link=[]
    l=0
    for i in range(number):
        if i<15:
            
            txt=driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]/div/div/div[2]/div/h3/a')
            news_list.append(txt.text)
            news_link.append(txt.get_attribute("href"))
        else:
            if (i-15)%10==0:
                time.sleep(5)
                last_height = driver.execute_script("return document.body.scrollHeight")
                driver.execute_script(f"window.scrollTo(0, {last_height-1200})")
                try:
                    
                    driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]/span').click()
                except:
                    l=1
                if l==1:
                    time.sleep(5)
                    try:
                        driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]').click()
                    except:
                        time.sleep(5)
                        driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]').click()
                    l=0
                time.sleep(5)
            txt=driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]/div/div/div[2]/div/h3/a')
            news_list.append(txt.text)
            news_link.append(txt.get_attribute("href"))   

    ###### Scraping Publish Date and Description ######

    publish_date=[]
    text=[]
    for i in range (len(news_link)):
        driver.get(news_link[i])
        try:
            publish_date.append(driver.find_element('xpath','/html/body/div/div[6]/div/div/div/div[1]/div[1]/div[1]/div[2]/div[2]/div[1]/time/span').text)
            tmp=""
            elements = driver.find_elements(By.TAG_NAME, 'p')
            for e in elements:
                tmp=tmp+e.text
            text.append(tmp)
        except:
            publish_date.append("Not available")
            text.append("Not Available")
        time.sleep(3)

    #### Converting the list to a pandas dataframe by converting the list to a dictionary  ###
    dict={'News Title':news_list,'News Link':news_link,'Publish Date':publish_date, 'Description':text}
    import pandas as pd
    df=pd.DataFrame(dict)
    df2=df.copy()


    for p in range(len(df2)):
        if df2['Publish Date'][p]=="Not available":
            df2.drop([p],inplace=True)
    #df2.reset_index()
    df2["Date + Desc"]=df2["Publish Date"] + df2["Description"]
    df2.reset_index(drop=True,inplace=True)
    return df2
    #df3.to_csv('Prothom_Alo_Description.txt',  index=False)