Thamed-Chowdhury committed
Commit 8ca1efe · verified · 1 Parent(s): 948ae5f

Update Prothom_alo_fully_scraped.py

Files changed (1)
  1. Prothom_alo_fully_scraped.py +86 -80
Prothom_alo_fully_scraped.py CHANGED
@@ -1,80 +1,86 @@
-def get_data(number):
-    print("Running Prothom_alo_fully_scraped")
-    ##Necessary imports
-    from selenium import webdriver
-    from selenium.webdriver import chrome
-    from selenium.webdriver import ChromeOptions
-    options = ChromeOptions()
-    options.add_argument("--headless=new")
-    driver = webdriver.Chrome(options=options)
-    ## Finding Elements by XPATH
-    from selenium.webdriver.common.by import By
-
-    driver.get("https://en.prothomalo.com/search?q=road%20accident%20dhaka",)
-
-    import time
-    news_list=[]
-    news_link=[]
-    l=0
-    for i in range(number):
-        if i<15:
-
-            txt=driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]/div/div/div[2]/div/h3/a')
-            news_list.append(txt.text)
-            news_link.append(txt.get_attribute("href"))
-        else:
-            if (i-15)%10==0:
-                time.sleep(5)
-                last_height = driver.execute_script("return document.body.scrollHeight")
-                driver.execute_script(f"window.scrollTo(0, {last_height-1200})")
-                try:
-
-                    driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]/span').click()
-                except:
-                    l=1
-                if l==1:
-                    time.sleep(5)
-                    try:
-                        driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]').click()
-                    except:
-                        time.sleep(5)
-                        driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]').click()
-                    l=0
-                time.sleep(5)
-            txt=driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]/div/div/div[2]/div/h3/a')
-            news_list.append(txt.text)
-            news_link.append(txt.get_attribute("href"))
-
-    ###### Scraping Publish Date and Description ######
-
-    publish_date=[]
-    text=[]
-    for i in range (len(news_link)):
-        driver.get(news_link[i])
-        try:
-            publish_date.append(driver.find_element('xpath','/html/body/div/div[6]/div/div/div/div[1]/div[1]/div[1]/div[2]/div[2]/div[1]/time/span').text)
-            tmp=""
-            elements = driver.find_elements(By.TAG_NAME, 'p')
-            for e in elements:
-                tmp=tmp+e.text
-            text.append(tmp)
-        except:
-            publish_date.append("Not available")
-            text.append("Not Available")
-        time.sleep(3)
-
-    #### Converting the list to a pandas dataframe by converting the list to a dictionary ###
-    dict={'News Title':news_list,'News Link':news_link,'Publish Date':publish_date, 'Description':text}
-    import pandas as pd
-    df=pd.DataFrame(dict)
-    df2=df.copy()
-
-
-    for p in range(len(df2)):
-        if df2['Publish Date'][p]=="Not available":
-            df2.drop([p],inplace=True)
-    #df2.reset_index()
-    df2["Date + Desc"]=df2["Publish Date"] + df2["Description"]
-    df2.reset_index(drop=True,inplace=True)
-    return df2
-    #df3.to_csv('Prothom_Alo_Description.txt', index=False)
+def get_data(number):
+    print("Running Prothom_alo_fully_scraped")
+    ##Necessary imports
+    from selenium import webdriver
+    from selenium.webdriver import chrome
+    from selenium.webdriver import ChromeOptions
+    options = ChromeOptions()
+    options.add_argument("enable-automation");
+    options.add_argument("--window-size=1920,1080");
+    options.add_argument("--no-sandbox");
+    options.add_argument("--disable-extensions");
+    options.add_argument("--dns-prefetch-disable");
+    options.add_argument("--disable-gpu");
+    options.add_argument("--headless=new")
+    driver = webdriver.Chrome(options=options)
+    ## Finding Elements by XPATH
+    from selenium.webdriver.common.by import By
+
+    driver.get("https://en.prothomalo.com/search?q=road%20accident%20dhaka",)
+
+    import time
+    news_list=[]
+    news_link=[]
+    l=0
+    for i in range(number):
+        if i<15:
+
+            txt=driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]/div/div/div[2]/div/h3/a')
+            news_list.append(txt.text)
+            news_link.append(txt.get_attribute("href"))
+        else:
+            if (i-15)%10==0:
+                time.sleep(5)
+                last_height = driver.execute_script("return document.body.scrollHeight")
+                driver.execute_script(f"window.scrollTo(0, {last_height-1200})")
+                try:
+
+                    driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]/span').click()
+                except:
+                    l=1
+                if l==1:
+                    time.sleep(5)
+                    try:
+                        driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]').click()
+                    except:
+                        time.sleep(5)
+                        driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]').click()
+                    l=0
+                time.sleep(5)
+            txt=driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]/div/div/div[2]/div/h3/a')
+            news_list.append(txt.text)
+            news_link.append(txt.get_attribute("href"))
+
+    ###### Scraping Publish Date and Description ######
+
+    publish_date=[]
+    text=[]
+    for i in range (len(news_link)):
+        driver.get(news_link[i])
+        try:
+            publish_date.append(driver.find_element('xpath','/html/body/div/div[6]/div/div/div/div[1]/div[1]/div[1]/div[2]/div[2]/div[1]/time/span').text)
+            tmp=""
+            elements = driver.find_elements(By.TAG_NAME, 'p')
+            for e in elements:
+                tmp=tmp+e.text
+            text.append(tmp)
+        except:
+            publish_date.append("Not available")
+            text.append("Not Available")
+        time.sleep(3)
+
+    #### Converting the list to a pandas dataframe by converting the list to a dictionary ###
+    dict={'News Title':news_list,'News Link':news_link,'Publish Date':publish_date, 'Description':text}
+    import pandas as pd
+    df=pd.DataFrame(dict)
+    df2=df.copy()
+
+
+    for p in range(len(df2)):
+        if df2['Publish Date'][p]=="Not available":
+            df2.drop([p],inplace=True)
+    #df2.reset_index()
+    df2["Date + Desc"]=df2["Publish Date"] + df2["Description"]
+    df2.reset_index(drop=True,inplace=True)
+    return df2
+    #df3.to_csv('Prothom_Alo_Description.txt', index=False)
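
The change itself is confined to the ChromeOptions block: the commit adds six flags (enable-automation, --window-size=1920,1080, --no-sandbox, --disable-extensions, --dns-prefetch-disable, --disable-gpu) ahead of the existing --headless=new. A minimal sketch of the resulting driver setup, isolated from the scraper and assuming Selenium 4 (where webdriver.Chrome accepts an options keyword):

    # Sketch only: reproduces the option set this commit moves to,
    # without the scraping logic.
    from selenium import webdriver
    from selenium.webdriver import ChromeOptions

    options = ChromeOptions()
    options.add_argument("enable-automation")
    options.add_argument("--window-size=1920,1080")   # fixed viewport for headless runs
    options.add_argument("--no-sandbox")              # commonly required in containers/CI
    options.add_argument("--disable-extensions")
    options.add_argument("--dns-prefetch-disable")
    options.add_argument("--disable-gpu")
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)
    driver.get("https://en.prothomalo.com/search?q=road%20accident%20dhaka")
    print(driver.title)
    driver.quit()

The fixed 1920x1080 window is the flag most likely to matter here: headless Chrome defaults to a small viewport, and a different layout may render fewer result cards, which can break the index-based absolute XPaths the script relies on.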
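Unrelated to this commit, the script still paces itself with fixed time.sleep(5) calls around the "load more" clicks. A hedged sketch of an explicit-wait alternative; click_when_ready is a hypothetical helper, not part of this repository:

    # Hypothetical helper (not in the commit): wait until the element at
    # `xpath` is clickable, then click it, instead of sleeping a fixed 5 s.
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    def click_when_ready(driver, xpath, timeout=10):
        WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable((By.XPATH, xpath))
        ).click()

An explicit wait returns as soon as the element is ready and raises TimeoutException if it never appears, so runs tend to be both faster and less flaky than fixed sleeps.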
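Also untouched by the commit: the final cleanup drops "Not available" rows one by one inside a for loop. A hypothetical equivalent using a single boolean mask (column names match the script; the dummy rows are illustration only):

    import pandas as pd

    # Two dummy rows standing in for scraped results.
    df = pd.DataFrame({
        "News Title":   ["title A", "title B"],
        "News Link":    ["link A", "link B"],
        "Publish Date": ["12 Mar 2023", "Not available"],
        "Description":  ["desc A", "desc B"],
    })
    # Keep only rows whose date was actually scraped, then rebuild the index.
    df2 = df[df["Publish Date"] != "Not available"].copy()
    df2["Date + Desc"] = df2["Publish Date"] + df2["Description"]
    df2.reset_index(drop=True, inplace=True)

Filtering with a mask avoids mutating the frame while iterating over it and sidesteps the stale positional indices that row-by-row drop() calls can leave behind.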