Thamed-Chowdhury committed
Commit 8e1d875 · verified · 1 Parent(s): af1cd50

Upload 5 files

Daily_Star_fully_scraped.py CHANGED
@@ -1,92 +1,65 @@
- def get_data(number):
-     print("Running Daily_Star_Fully_Scraped")
-     ## Necessary imports
-     from selenium import webdriver
-     from selenium.webdriver import chrome
-     from selenium.webdriver import ChromeOptions
-     options = ChromeOptions()
-     options.add_argument("enable-automation")
-     options.add_argument("--window-size=1920,1080")
-     options.add_argument("--no-sandbox")
-     options.add_argument("--disable-extensions")
-     options.add_argument("--dns-prefetch-disable")
-     options.add_argument("--disable-gpu")
-     options.add_argument("--headless=new")
-     driver = webdriver.Chrome(options=options)
-     # Set a timeout for the page to load (in seconds)
-     driver.set_page_load_timeout(10)  # Limit page loading time to 10 seconds
-
-     ## Finding elements by XPath
-     from selenium.webdriver.common.by import By
-     driver.get("https://www.thedailystar.net/news/bangladesh/accidents-fires")
-     ### Extracting the first 8 news items separately
-     import time
-     news_list=[]
-     news_link=[]
-     for i in range(2,10):
-         txt=driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[1]/div/div/div/div[{i}]/div/div/h3/a')
-         news_list.append(txt.text)
-         news_link.append(txt.get_attribute("href"))
-     # Rest of the news title and news link extraction
-     number2=number-8
-     import time
-     if number2>0:
-         for i in range(number2):
-             #time.sleep(5)
-             if (i+1)!=0 and (i+1)%10==0:
-                 last_height = driver.execute_script("return document.body.scrollHeight")
-                 driver.execute_script(f"window.scrollTo(0, {last_height-950})")
-                 driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[3]/div/div/div[1]/div/div[2]/ul/li/a').click()
-                 time.sleep(10)
-             txt=driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[3]/div/div/div[1]/div/div[1]/div[{i+1}]/div[2]/h3/a')
-             news_list.append(txt.text)
-             news_link.append(txt.get_attribute("href"))
-     ###### Scraping publish date ######
-
-     publish_date=[]
-     for i in range(len(news_link)):
-         try:
-             driver.get(news_link[i])
-         except:
-             time.sleep(30)
-             driver.get(news_link[i])
-         time.sleep(3)
-         driver.execute_script("window.stop();")
-         try:
-             publish_date.append(driver.find_element('xpath','/html/body/div[3]/div[2]/div/div/div[2]/main/div/div[2]/div[1]/div[2]/div/div[1]/div[1]/div/div/div[1]/div[2]/div[2]').text)
-         except:
-             publish_date.append("Not available")
-     #### Converting the lists to a pandas DataFrame via a dictionary ####
-     dict={'News Title':news_list,'News Link':news_link,'Publish Date':publish_date}
-     import pandas as pd
-     df=pd.DataFrame(dict)
-     if(number <=8 ):
-         df = df.head(number)
-
-     ############################## Description Extraction ##############################
-     print('Description Extraction Started')
-     from newspaper import Article
-
-
-     text=[]
-     for i in range(len(df)):
-         url = df['News Link'][i]
-         article = Article(url)
-         article.download()
-         article.parse()
-
-         text.append(article.text)
-
-     df2=df.assign(Description=text)
-
-
-     for p in range(len(df2)):
-         if df2['Publish Date'][p]=="Not available":
-             df2.drop([p],inplace=True)
-     #df2.reset_index()
-
-     df2.reset_index(drop=True,inplace=True)
-
-     df2["Date + Desc"]=df2['Publish Date'] + ". News Description:"+ df2['Description']
-     return df2
-
 
+ def get_data(number):
+     print("Running Daily_Star_Fully_Scraped")
+     ## Necessary imports
+     from selenium import webdriver
+     from selenium.webdriver import chrome
+     from selenium.webdriver import ChromeOptions
+     options = ChromeOptions()
+     options.add_argument("enable-automation")
+     options.add_argument("--window-size=1920,1080")
+     options.add_argument("--no-sandbox")
+     options.add_argument("--disable-extensions")
+     options.add_argument("--dns-prefetch-disable")
+     options.add_argument("--disable-gpu")
+     # options.add_argument("--headless=new")
+     driver = webdriver.Chrome(options=options)
+     # Set a timeout for the page to load (in seconds)
+     driver.set_page_load_timeout(10)  # Limit page loading time to 10 seconds
+
+     ## Finding elements by XPath
+     from selenium.webdriver.common.by import By
+     driver.get("https://www.thedailystar.net/news/bangladesh/accidents-fires")
+     ### Extracting the first 8 news items separately
+     import time
+     news_list=[]
+     news_link=[]
+     for i in range(2,10):
+         txt=driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[1]/div/div/div/div[{i}]/div/div/h3/a')
+         news_list.append(txt.text)
+         news_link.append(txt.get_attribute("href"))
+     # Rest of the news title and news link extraction
+     number2=number-8
+     import time
+     if number2>0:
+         for i in range(number2):
+             #time.sleep(5)
+             if (i+1)!=0 and (i+1)%10==0:
+                 last_height = driver.execute_script("return document.body.scrollHeight")
+                 driver.execute_script(f"window.scrollTo(0, {last_height-950})")
+                 driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[3]/div/div/div[1]/div/div[2]/ul/li/a').click()
+                 time.sleep(10)
+             txt=driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[3]/div/div/div[1]/div/div[1]/div[{i+1}]/div[2]/h3/a')
+             news_list.append(txt.text)
+             news_link.append(txt.get_attribute("href"))
+     # Goose3 extraction
+     from deep_translator import GoogleTranslator
+     from goose3 import Goose
+     from datetime import datetime
+     g = Goose()
+     description=[]
+     News_title=[]
+     publish_date=[]
+     for i in range(len(news_link)):
+         article = g.extract(url=news_link[i])
+         News_title.append(article.title)
+         description.append(article.cleaned_text)
+         publish_date.append(article.publish_date)
+     # Convert the dates to "day-month-year" format
+     formatted_dates = [datetime.fromisoformat(date).strftime('%d-%m-%Y') for date in publish_date]
+
+     #### Converting the lists to a pandas DataFrame via a dictionary ####
+     dict={'News Title':News_title,'News Link':news_link,'Publish Date':formatted_dates, 'Description':description}
+     import pandas as pd
+     df=pd.DataFrame(dict)
+     return df
+
 
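Both versions format dates as day-month-year, but the new goose3 path does it with datetime.fromisoformat over whatever article.publish_date returns. A minimal sketch of that step, assuming goose3 hands back ISO-8601 strings; on pages where it returns None, the list comprehension above would raise, so the sketch guards for that case:

from datetime import datetime

def format_dates(publish_dates):
    # Convert ISO-8601 strings (e.g. "2024-05-01T10:30:00+06:00") to
    # day-month-year, tolerating missing dates instead of raising mid-scrape.
    formatted = []
    for date in publish_dates:
        if date:
            formatted.append(datetime.fromisoformat(date).strftime('%d-%m-%Y'))
        else:
            formatted.append("Not available")
    return formatted

print(format_dates(["2024-05-01T10:30:00+06:00", None]))  # ['01-05-2024', 'Not available']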
Dhaka_Tribune_Fully_Scraped.py CHANGED
@@ -1,89 +1,68 @@
- def get_data(number):
-     ## Necessary imports
-     from selenium import webdriver
-     from selenium.webdriver import chrome
-     from selenium.webdriver import ChromeOptions
-     import math
-     options = ChromeOptions()
-     options.add_argument("enable-automation");
-     options.add_argument("--window-size=1920,1080");
-     options.add_argument("--no-sandbox");
-     options.add_argument("--disable-extensions");
-     options.add_argument("--dns-prefetch-disable");
-     options.add_argument("--disable-gpu");
-     options.add_argument("--headless=new")
-     driver = webdriver.Chrome(options=options)
-     ## Finding elements by XPath
-     from selenium.webdriver.common.by import By
-
-
-     driver.get("https://www.dhakatribune.com/topic/road-accident")
-
-     #### Scraping news title and news link ####
-     print(driver.current_url)
-     import time
-     news_list=[]
-     news_link=[]
-     publish_date=[]
-     row_counter=0
-     news_counter=0
-     for i in range(number):
-         if i==0:
-             row_counter=1
-         else:
-             row_counter=math.ceil(i/4)
-         news_counter=i%4+1
-         #time.sleep(5)
-         if (i+1)!=0 and (i+1)%20==0:
-             last_height = driver.execute_script("return document.body.scrollHeight")
-             driver.execute_script(f"window.scrollTo(0, {last_height})")
-             driver.find_element('xpath',f'/html/body/div[3]/div/div[2]/div/div/div[2]/div/div/div/div[2]/div/div[2]/button').click()
-             time.sleep(10)
-         txt=driver.find_element('xpath',f'/html/body/div[3]/div/div[2]/div/div/div[2]/div/div/div/div[2]/div/div[1]/div[{row_counter}]/div[{news_counter}]/div/div[2]/div/div/div/h2')
-         #publish_date.append(driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[5]/div/div/div[1]/div/div[1]/div[{i+1}]/div[1]').text)
-         news_list.append(txt.text)
-         news_link.append(txt.get_attribute("href"))
-
-     ###### Scraping publish date ######
-     publish_date=[]
-     for i in range(len(news_link)):
-         driver.get(news_link[i])
-         time.sleep(6)
-         driver.execute_script("window.stop();")
-         try:
-             publish_date.append(driver.find_element('xpath','/html/body/div[3]/div/div[2]/div/div/div[2]/div/div[1]/div/div[1]/div/div/div/div/div/div/div[2]/div/div/div[3]/div/div[1]/div/div[2]/span[1]').text)
-         except:
-             publish_date.append("Not available")
-
-     #### Converting the lists to a pandas DataFrame via a dictionary ####
-     dict={'News Title':news_list,'News Link':news_link,'Publish Date':publish_date}
-     import pandas as pd
-     df=pd.DataFrame(dict)
-
-
-     ############################## Description Extraction ##############################
-
-     from newspaper import Article
-     text=[]
-     for i in range(len(df)):
-         url = df['News Link'][i]
-         article = Article(url)
-         article.download()
-         article.parse()
-
-         text.append(article.text)
-
-
-     df2=df.assign(Description=text)
-     for p in range(len(df2)):
-         if df2['Publish Date'][p]=="Not available":
-             df2.drop([p],inplace=True)
-
-     df2.reset_index(drop=True,inplace=True)
-     df2["Date + Desc"]=df2['Publish Date'] + ". News Description:"+ df2['Description']
-
-
-
-     return df2
-
-     #df3.to_csv('Dhaka_Tribune_Description.txt', index=False)
 
+ def get_data(number):
+     # Dhaka Tribune implementation
+     ## Necessary imports
+     from selenium import webdriver
+     from selenium.webdriver import chrome
+     from selenium.webdriver import ChromeOptions
+     import math
+     options = ChromeOptions()
+     options.add_argument("enable-automation")
+     options.add_argument("--window-size=1920,1080")
+     options.add_argument("--no-sandbox")
+     options.add_argument("--disable-extensions")
+     options.add_argument("--dns-prefetch-disable")
+     options.add_argument("--disable-gpu")
+     #options.setPageLoadStrategy(PageLoadStrategy.NORMAL);
+     options.add_argument("--headless=new")
+     driver = webdriver.Chrome(options=options)
+     ## Finding elements by XPath
+     from selenium.webdriver.common.by import By
+
+
+     driver.get("https://www.dhakatribune.com/topic/road-accident")
+
+     #### Scraping news title and news link ####
+     import time
+     news_list=[]
+     news_link=[]
+     publish_date=[]
+     row_counter=0
+     news_counter=0
+     for i in range(number):
+         if i==0:
+             row_counter=1
+         else:
+             row_counter=math.ceil(i/4)
+         news_counter=i%4+1
+         #time.sleep(5)
+         if (i+1)!=0 and (i+1)%20==0:
+             last_height = driver.execute_script("return document.body.scrollHeight")
+             driver.execute_script(f"window.scrollTo(0, {last_height})")
+             driver.find_element('xpath',f'/html/body/div[3]/div/div[2]/div/div/div[2]/div/div/div/div[2]/div/div[2]/button').click()
+             time.sleep(10)
+         txt=driver.find_element('xpath',f'/html/body/div[3]/div/div[2]/div/div/div[2]/div/div/div/div[2]/div/div[1]/div[{row_counter}]/div[{news_counter}]/div/div[2]/div/div/div/h2/a')
+         #publish_date.append(driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[5]/div/div/div[1]/div/div[1]/div[{i+1}]/div[1]').text)
+         news_list.append(txt.text)
+         news_link.append(txt.get_attribute("href"))
+
+     # Goose3 extraction
+     from deep_translator import GoogleTranslator
+     from goose3 import Goose
+     from datetime import datetime
+     g = Goose()
+     description=[]
+     News_title=[]
+     publish_date=[]
+     for i in range(len(news_link)):
+         article = g.extract(url=news_link[i])
+         News_title.append(article.title)
+         description.append(article.cleaned_text)
+         publish_date.append(article.publish_date)
+     # Convert the dates to "day-month-year" format
+     formatted_dates = [datetime.fromisoformat(date).strftime('%d-%m-%Y') for date in publish_date]
+
+     #### Converting the lists to a pandas DataFrame via a dictionary ####
+     dict={'News Title':News_title,'News Link':news_link,'Publish Date':formatted_dates, 'Description':description}
+     import pandas as pd
+     df=pd.DataFrame(dict)
+     return df
 
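The title loop above converts the flat index i into a (row, column) pair because the topic page lays its cards out four to a row. A standalone sketch of that arithmetic (values illustrative) makes the mapping easy to inspect; note that i=0 and i=4 both resolve to row 1, column 1, so the first card appears to be visited twice:

import math

# Reproduce the grid arithmetic from the loop above for the first nine indices.
for i in range(9):
    row = 1 if i == 0 else math.ceil(i / 4)
    col = i % 4 + 1
    print(f"i={i} -> row {row}, col {col}")
# i=0 -> (1,1), i=1 -> (1,2), i=2 -> (1,3), i=3 -> (1,4), i=4 -> (1,1), i=5 -> (2,2) ...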
LLM_automation_GPT.py CHANGED
@@ -75,7 +75,7 @@ def create_data(description):
          dj2.append(res(i))
      ### Finding vehicle
      def res2(i):
-         response=chain.invoke({"question" : df2['Date + Desc'][i]+" Only name the type of vehicles involved in the accident. If multiple vehicles are involved, separate them by hyphens (-). Example answers: Bus, Truck-Bus etc. If no vehicles are mentioned, your answer will be: Not Available. Your answer should only contain the vehicle name; do not include any extra sentences."})
+         response=chain.invoke({"question" : df2['Description'][i]+" Only name the type of vehicles involved in the accident. If multiple vehicles are involved, separate them by hyphens (-). Example answers: Bus, Truck-Bus etc. If no vehicles are mentioned, your answer will be: Not Available. Your answer should only contain the vehicle name; do not include any extra sentences."})
          return response
      #### vehicle list contains all vehicles involved:
      vehicles=[]
@@ -121,6 +121,6 @@ def create_data(description):
      df2["Road_Characteristic"]=Road_Characteristic
      df2["Pedestrian_Involved"]=Pedestrian_Involved
      df2["Vehicles Involved"]=vehicles
-     df3=df2.drop(columns=['Description','Date + Desc','Report Type'])
+     df3=df2.drop(columns=['Description','Report Type'])
      return df3
 
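The only functional change in this file is the prompt's context column: the scraper rewrite removed Date + Desc, so the question is now built from Description alone. A minimal sketch of the pattern, with chain standing in for the prompt-plus-LLM pipeline this file constructs elsewhere (the helper name ask_vehicles is hypothetical):

VEHICLE_INSTRUCTION = (
    " Only name the type of vehicles involved in the accident. "
    "If multiple vehicles are involved, separate them by hyphens (-). "
    "Example answers: Bus, Truck-Bus etc. If no vehicles are mentioned, "
    "your answer will be: Not Available."
)

def ask_vehicles(chain, df2, i):
    # The scraped article text is prepended and the instruction appended,
    # then the combined string is sent to the chain as one "question".
    return chain.invoke({"question": df2['Description'][i] + VEHICLE_INSTRUCTION})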
LLM_automation_Groq.py CHANGED
@@ -15,7 +15,10 @@ def create_data(description):
      ### Set all api keys:

      #os.environ["LANGCHAIN_TRACING_V2"]="true" ### Will automatically trace our code using LangSmith
-     os.environ["GROQ_API_KEY"]="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" #### Will be used for monitoring the calls to and from the LLM (both free and paid)
+     #os.environ["GROQ_API_KEY"]="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" #### Will be used for monitoring the calls to and from the LLM (both free and paid)
+     os.environ["GROQ_API_KEY"]="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"
+
+

      ### Create Prompt Template:
      prompt=ChatPromptTemplate.from_messages(
@@ -80,7 +83,7 @@ def create_data(description):

      ### A function to invoke the llm. For some reason phi3 doesn't give accurate results sometimes if used directly in dj.append()
      def res2(i):
-         response=chain.invoke({"question" : df2['Date + Desc'][i]+" Only name the type of vehicles involved in the accident. If multiple vehicles are involved, separate them by hyphens (-). Example answers: Bus, Truck-Bus etc. If no vehicles are mentioned, your answer will be: Not Available. Your answer should only contain the vehicle name; do not include any extra sentences."})
+         response=chain.invoke({"question" : df2['Description'][i]+" Only name the type of vehicles involved in the accident. If multiple vehicles are involved, separate them by hyphens (-). Example answers: Bus, Truck-Bus etc. If no vehicles are mentioned, your answer will be: Not Available. Your answer should only contain the vehicle name; do not include any extra sentences."})
          return response
      #### dj2 list contains all column values separated by commas:
      vehicles=[]
@@ -124,7 +127,7 @@ def create_data(description):
      df2["Road_Characteristic"]=Road_Characteristic
      df2["Pedestrian_Involved"]=Pedestrian_Involved
      df2["Vehicles_involved"]=vehicles
-     df3=df2.drop(columns=['Description','Date + Desc','Report Type'])
+     df3=df2.drop(columns=['Description','Report Type'])
      return df3

 
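This hunk swaps one hardcoded Groq key for another, so a live secret stays in version control either way. A short sketch of the usual alternative, assuming the key is supplied by the runtime environment (for example as a Space secret) rather than the source file:

import os

# Read the key from the environment instead of committing it to the repo,
# and fail fast with a clear message when it was never configured.
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    raise RuntimeError("GROQ_API_KEY is not set; configure it as a deployment secret.")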
Prothom_alo_fully_scraped.py CHANGED
@@ -1,106 +1,79 @@
- def get_data(number):
-     print("Running Prothom_alo_fully_scraped")
-     ## Necessary imports
-     from deep_translator import GoogleTranslator
-     from selenium import webdriver
-     from selenium.webdriver import chrome
-     from selenium.webdriver import ChromeOptions
-     from datetime import datetime, timedelta
-     import re
-     options = ChromeOptions()
-     options.add_argument("enable-automation")
-     options.add_argument("--window-size=1920,1080")
-     options.add_argument("--no-sandbox")
-     options.add_argument("--disable-extensions")
-     options.add_argument("--dns-prefetch-disable")
-     options.add_argument("--disable-gpu")
-     options.add_argument("--headless=new")
-     driver = webdriver.Chrome(options=options)
-     ## Finding elements by XPath
-     from selenium.webdriver.common.by import By
-     import time, math
-     driver.get("https://www.prothomalo.com/topic/%E0%A6%B8%E0%A7%9C%E0%A6%95-%E0%A6%A6%E0%A7%81%E0%A6%B0%E0%A7%8D%E0%A6%98%E0%A6%9F%E0%A6%A8%E0%A6%BE")
-     time.sleep(15)
-     news_list=[]
-     news_link=[]
-     publish_date=[]
-     if number<=15:
-         print("Entered number<=15 condition")
-         txt=driver.find_elements(By.CLASS_NAME, "title-link")
-         date=driver.find_elements(By.TAG_NAME, "time")
-         for i in range(number):
-             news_list.append(txt[i].text)
-             news_link.append(txt[i].get_attribute("href"))
-             publish_date.append(date[i].text)
-
-     else:
-         clck=int((number-25)/15 + 2)
-         for i in range(clck):
-             print(i)
-             time.sleep(10)
-             last_height = driver.execute_script("return document.body.scrollHeight")
-             driver.execute_script(f"window.scrollTo(0, {last_height-1050})")
-             button=driver.find_elements(By.CLASS_NAME, "tNj8k")
-             button[0].click()
-             time.sleep(5)
-         txt=driver.find_elements(By.CLASS_NAME, "title-link")
-         date=driver.find_elements(By.TAG_NAME, "time")
-         for i in range(number):
-             news_list.append(txt[i].text)
-             news_link.append(txt[i].get_attribute("href"))
-             publish_date.append(date[i].text)
-
-     ###### Scraping description, modified for translation ######
-     print("Description modified Translation block")
-     text=[]
-     for i in range(len(news_link)):
-         print(i,news_link[i])
-         driver.get(news_link[i])
-         try:
-             tmp=""
-             elements = driver.find_elements(By.TAG_NAME, 'p')
-             for i in range(len(elements)):
-                 if i>2 and len(tmp+elements[i].text) < 2000:
-                     tmp=tmp+elements[i].text
-
-             text.append(tmp)
-         except:
-             text.append("Not Available")
-         time.sleep(5)
-     ## Translation
-     print("Translation Started")
-     for i in range(len(news_list)):
-         news_list[i] = GoogleTranslator(source='auto', target='en').translate(text=news_list[i])
-         text[i] = GoogleTranslator(source='auto', target='en').translate(text=text[i])
-         publish_date[i] = GoogleTranslator(source='auto', target='en').translate(text=publish_date[i])
-
-     #### Converting the lists to a pandas DataFrame via a dictionary ####
-     dict={'News Title':news_list,'News Link':news_link,'Publish Date':publish_date, 'Description':text}
-     import pandas as pd
-     df=pd.DataFrame(dict)
-     df2=df.copy()
-
-
-     for p in range(len(df2)):
-         if df2['Publish Date'][p]=="Not available":
-             df2.drop([p],inplace=True)
-     #df2.reset_index()
-     df2["Date + Desc"]=df2["Publish Date"] + df2["Description"]
-     df2.reset_index(drop=True,inplace=True)
-     # Function to convert relative time to date and format as day-month-year
-     def convert_relative_time_to_date(time_str):
-         if 'hours ago' in time_str or 'hour ago' in time_str:
-             hours = int(re.search(r'(\d+)', time_str).group(1))
-             return (datetime.now() - timedelta(hours=hours)).strftime('%d-%m-%Y')
-         elif 'days ago' in time_str or 'day ago' in time_str:
-             days = int(re.search(r'(\d+)', time_str).group(1))
-             return (datetime.now() - timedelta(days=days)).strftime('%d-%m-%Y')
-         else:
-             # If it's already a date string, return it in day-month-year format
-             return pd.to_datetime(time_str).strftime('%d-%m-%Y')
-
-     # Apply the function to the DataFrame
-     df2['Publish Date'] = df2['Publish Date'].apply(convert_relative_time_to_date)
-
-     return df2
-     #df3.to_csv('Prothom_Alo_Description.txt', index=False)
 
+ def get_data(number):
+     print("Running Prothom_alo_fully_scraped")
+     ## Necessary imports
+     from deep_translator import GoogleTranslator
+     from selenium import webdriver
+     from selenium.webdriver import chrome
+     from selenium.webdriver import ChromeOptions
+     from datetime import datetime, timedelta
+     import re
+     #PROXY = "45.251.231.113:5678"
+     options = ChromeOptions()
+     #options.add_argument('--proxy-server=%s' % PROXY)
+     #options.add_argument("--headless=new")
+     driver = webdriver.Chrome(options=options)
+     ## Finding elements by XPath
+     from selenium.webdriver.common.by import By
+     import time, math
+     driver.get("https://www.prothomalo.com/topic/%E0%A6%B8%E0%A7%9C%E0%A6%95-%E0%A6%A6%E0%A7%81%E0%A6%B0%E0%A7%8D%E0%A6%98%E0%A6%9F%E0%A6%A8%E0%A6%BE")
+     time.sleep(15)
+     news_list=[]
+     news_link=[]
+     publish_date=[]
+     if number<=15:
+         txt=driver.find_elements(By.CLASS_NAME, "title-link")
+         date=driver.find_elements(By.TAG_NAME, "time")
+         for i in range(number):
+             news_list.append(txt[i].text)
+             news_link.append(txt[i].get_attribute("href"))
+             publish_date.append(date[i].text)
+
+     else:
+         clck=int((number-25)/15 + 2)
+         for i in range(clck):
+             print(i)
+             time.sleep(10)
+             last_height = driver.execute_script("return document.body.scrollHeight")
+             driver.execute_script(f"window.scrollTo(0, {last_height-1050})")
+             button=driver.find_elements(By.CLASS_NAME, "tNj8k")
+             button[0].click()
+             time.sleep(5)
+         txt=driver.find_elements(By.CLASS_NAME, "title-link")
+         date=driver.find_elements(By.TAG_NAME, "time")
+         for i in range(number):
+             news_list.append(txt[i].text)
+             news_link.append(txt[i].get_attribute("href"))
+             publish_date.append(date[i].text)
+
+     ###### Scraping description, modified for translation ######
+     from deep_translator import GoogleTranslator
+     from goose3 import Goose
+     from datetime import datetime
+     g = Goose()
+     description=[]
+     News_title=[]
+     publish_date=[]
+     for i in range(len(news_link)):
+         print(i)
+         article = g.extract(url=news_link[i])
+         ### Only for Prothom Alo ###
+         # Access the articleBody field
+         data = article.schema
+         article_body = data.get('articleBody')
+         print("Para length", len(article_body))
+         if(len(article_body)>=2200):
+             article_body=article_body[0:2200]
+         bangla_title=article.title
+         english_title = GoogleTranslator(source='auto', target='en').translate(text=bangla_title)
+         News_title.append(english_title)
+         text = GoogleTranslator(source='auto', target='en').translate(text=article_body)
+         description.append(text)
+         publish_date.append(article.publish_date)
+     # Convert the dates to "day-month-year" format
+     formatted_dates = [datetime.fromisoformat(date).strftime('%d-%m-%Y') for date in publish_date]
+
+     #### Converting the lists to a pandas DataFrame via a dictionary ####
+     dict={'News Title':News_title,'News Link':news_link,'Publish Date':formatted_dates, 'Description':description}
+     import pandas as pd
+     df=pd.DataFrame(dict)
+     return df
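
The new pipeline reads articleBody out of the page's JSON-LD via article.schema and truncates it to 2,200 characters before translating, which keeps each GoogleTranslator call inside the service's request-size limit but silently drops the tail of long articles. A chunked alternative, sketched on the assumption that requests of roughly 2,000 characters are safe (the exact cap, and the helper name translate_long, are assumptions here):

from deep_translator import GoogleTranslator

def translate_long(text, chunk_size=2000):
    # Translate fixed-size slices and rejoin them instead of truncating.
    # Slicing at raw character offsets can split a word; a production
    # version would prefer sentence boundaries.
    translator = GoogleTranslator(source='auto', target='en')
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    return " ".join(translator.translate(text=chunk) for chunk in chunks)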