Thamed-Chowdhury committed on
Commit
b08ae77
·
verified ·
1 Parent(s): 5488250

Delete Dhaka_Tribune_Fully_Scraped.py

Browse files
Files changed (1) hide show
  1. Dhaka_Tribune_Fully_Scraped.py +0 -69
Dhaka_Tribune_Fully_Scraped.py DELETED
@@ -1,69 +0,0 @@
1
def _format_publish_date(raw_date):
    """Return an ISO-8601 date string reformatted as ``DD-MM-YYYY``.

    Args:
        raw_date: The publish date as reported by the article extractor.
            Expected to be an ISO-8601 string; may be ``None`` or empty when
            the page exposes no date.

    Returns:
        The date formatted as ``day-month-year``, or ``None`` when the value
        is missing or cannot be parsed, so that a single undated article does
        not abort the whole scrape.
    """
    from datetime import datetime

    if not raw_date:
        return None
    try:
        return datetime.fromisoformat(raw_date).strftime('%d-%m-%Y')
    except (TypeError, ValueError):
        return None


def _scrape_news_links(number):
    """Collect the links of the first ``number`` stories on the topic page.

    Drives a headless Chrome session against the Dhaka Tribune road-accident
    topic page and reads the ``href`` of each story headline.

    Args:
        number: How many story links to collect (must be positive).

    Returns:
        A list of story URLs, in page order.
    """
    import time

    from selenium import webdriver
    from selenium.webdriver import ChromeOptions
    from selenium.webdriver.common.by import By

    load_more_xpath = '/html/body/div[3]/div/div[2]/div/div/div[2]/div/div/div/div[2]/div/div[2]/button'

    options = ChromeOptions()
    for flag in (
        "enable-automation",
        "--window-size=1920,1080",
        "--no-sandbox",
        "--disable-extensions",
        "--dns-prefetch-disable",
        "--disable-gpu",
        "--headless=new",
    ):
        options.add_argument(flag)

    driver = webdriver.Chrome(options=options)
    try:
        driver.get("https://www.dhakatribune.com/topic/road-accident")

        news_link = []
        for i in range(number):
            # Stories are addressed as div[row]/div[column] with four columns
            # per row; both indices are 1-based in XPath.
            row_counter = i // 4 + 1
            news_counter = i % 4 + 1

            # Every 20th story: scroll to the bottom and press the button that
            # follows the grid (presumably "load more" -- verify against the
            # live page), then give the new batch time to render.
            if (i + 1) % 20 == 0:
                last_height = driver.execute_script("return document.body.scrollHeight")
                driver.execute_script(f"window.scrollTo(0, {last_height})")
                driver.find_element(By.XPATH, load_more_xpath).click()
                time.sleep(10)

            headline = driver.find_element(
                By.XPATH,
                f'/html/body/div[3]/div/div[2]/div/div/div[2]/div/div/div/div[2]/div/div[1]/div[{row_counter}]/div[{news_counter}]/div/div[2]/div/div/div/h2/a',
            )
            news_link.append(headline.get_attribute("href"))
        return news_link
    finally:
        # Always release the browser process, even when a lookup fails.
        driver.quit()


def _extract_articles(news_link):
    """Download each linked article and pull out title, body and date.

    Args:
        news_link: Iterable of article URLs.

    Returns:
        A ``(titles, descriptions, formatted_dates)`` tuple of equally long
        lists, aligned with ``news_link``.
    """
    from goose3 import Goose

    titles = []
    descriptions = []
    formatted_dates = []
    with Goose() as g:
        for url in news_link:
            article = g.extract(url=url)
            titles.append(article.title)
            descriptions.append(article.cleaned_text)
            # Convert the date to "day-month-year" format.
            formatted_dates.append(_format_publish_date(article.publish_date))
    return titles, descriptions, formatted_dates


def get_data(number):
    """Scrape road-accident news from Dhaka Tribune into a DataFrame.

    Args:
        number: How many stories to scrape from the topic listing. Values
            less than or equal to zero yield an empty frame without starting
            a browser.

    Returns:
        A pandas DataFrame with the columns ``'News Title'``, ``'News Link'``,
        ``'Publish Date'`` (``DD-MM-YYYY`` or ``None`` when unavailable) and
        ``'Description'``, one row per story.
    """
    import pandas as pd

    if number <= 0:
        return pd.DataFrame(
            {'News Title': [], 'News Link': [], 'Publish Date': [], 'Description': []}
        )

    news_link = _scrape_news_links(number)
    titles, descriptions, formatted_dates = _extract_articles(news_link)

    # Build the frame from a column-name -> values mapping.
    columns = {
        'News Title': titles,
        'News Link': news_link,
        'Publish Date': formatted_dates,
        'Description': descriptions,
    }
    return pd.DataFrame(columns)