Thamed-Chowdhury committed on
Commit 702dc43 · verified · 1 Parent(s): f2e4cf7

Upload 2 files

Daily_Star_fully_scraped.py CHANGED
@@ -1,85 +1,90 @@
- def get_data(number):
-
-
-     ##Necessary imports
-     from selenium import webdriver
-     from selenium.webdriver import chrome
-     from selenium.webdriver import ChromeOptions
-     options = ChromeOptions()
-     options.add_argument("enable-automation");
-     options.add_argument("--window-size=1920,1080");
-     options.add_argument("--no-sandbox");
-     options.add_argument("--disable-extensions");
-     options.add_argument("--dns-prefetch-disable");
-     options.add_argument("--disable-gpu");
-     options.add_argument("--headless=new")
-     driver = webdriver.Chrome(options=options)
-     ## Finding Elements by XPATH
-     from selenium.webdriver.common.by import By
-
-     driver.get("https://www.thedailystar.net/tags/road-accident")
-
-     """
-     Code for browsing with the browser window open:
-     ##Necessary imports
-     from selenium import webdriver
-     from selenium.webdriver import chrome
-
-     driver = webdriver.Chrome()
-     ## Finding Elements by XPATH
-     from selenium.webdriver.common.by import By
-     driver.get("https://en.prothomalo.com/topic/Road-accident")
-     """
-     import time
-     news_list=[]
-     news_link=[]
-     publish_date=[]
-     for i in range(number):
-         #time.sleep(5)
-         if (i+1)!=0 and (i+1)%10==0:
-             last_height = driver.execute_script("return document.body.scrollHeight")
-             driver.execute_script(f"window.scrollTo(0, {last_height-950})")
-             driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[5]/div/div/div[1]/div/div[2]/ul/li/a').click()
-             time.sleep(10)
-         txt=driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[5]/div/div/div[1]/div/div[1]/div[{i+1}]/div[2]/div[2]/div[2]/h3/a')
-         publish_date.append(driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[5]/div/div/div[1]/div/div[1]/div[{i+1}]/div[1]').text)
-         news_list.append(txt.text)
-         news_link.append(txt.get_attribute("href"))
-
-     #### Converting the list to a pandas dataframe by converting the list to a dictionary ###
-     dict={'News Title':news_list,'News Link':news_link,'Publish Date':publish_date}
-     import pandas as pd
-     df=pd.DataFrame(dict)
-
-     ############################################### Description Extraction #################################################
-     from newspaper import Article
-
-     text=[]
-     for i in range(len(df)):
-         url = df['News Link'][i]
-         article = Article(url)
-         article.download()
-         article.parse()
-
-         text.append(article.text)
-
-     df2=df.assign(Description=text)
-
-     for p in range(len(df2)):
-         if df2['Publish Date'][p]=="Not available":
-             df2.drop([p],inplace=True)
-             #df2.reset_index()
-
-     df2.reset_index(drop=True,inplace=True)
-
-     df2["Date + Desc"]=df2['Publish Date'] + ". News Description:"+ df2['Description']
-
-     return df2
-     #df3.to_csv('Prothom_Alo_Description.txt', index=False)
 
 
 
 
 
 
+ def get_data(number):
+     print("Running Daily_Star_Fully_Scraped")
+     ##Necessary imports
+     from selenium import webdriver
+     from selenium.webdriver import chrome
+     from selenium.webdriver import ChromeOptions
+     options = ChromeOptions()
+     options.add_argument("enable-automation")
+     options.add_argument("--window-size=1920,1080")
+     options.add_argument("--no-sandbox")
+     options.add_argument("--disable-extensions")
+     options.add_argument("--dns-prefetch-disable")
+     options.add_argument("--disable-gpu")
+     options.add_argument("--headless=new")
+     driver = webdriver.Chrome(options=options)
+     ## Finding Elements by XPATH
+     from selenium.webdriver.common.by import By
+
+     driver.get("https://www.thedailystar.net/news/bangladesh/accidents-fires")
+     ### Extracting the first 8 news items separately
+     import time
+     news_list=[]
+     news_link=[]
+     for i in range(2,10):
+         txt=driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[1]/div/div/div/div[{i}]/div/div/h3/a')
+         news_list.append(txt.text)
+         news_link.append(txt.get_attribute("href"))
+     # Rest of the news title and news link extraction
+     if(number>8):
+         number=number-8
+         import time
+         for i in range(number):
+             #time.sleep(5)
+             if (i+1)!=0 and (i+1)%10==0:
+                 last_height = driver.execute_script("return document.body.scrollHeight")
+                 driver.execute_script(f"window.scrollTo(0, {last_height-950})")
+                 driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[3]/div/div/div[1]/div/div[2]/ul/li/a').click()
+                 time.sleep(10)
+             txt=driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[3]/div/div/div[1]/div/div[1]/div[{i+1}]/div[2]/h3/a')
+             news_list.append(txt.text)
+             news_link.append(txt.get_attribute("href"))
+     ###### Scraping Publish Date ######
+     publish_date = []
+     for i in range(len(news_link)):
+         driver.get(news_link[i])
+
+         # Wait for 6 seconds, then stop the page load
+         time.sleep(6)
+         driver.execute_script("window.stop();")
+
+         try:
+             # Locate and extract the publish date element
+             publish_date_element = driver.find_element('xpath', '/html/body/div[3]/div[2]/div/div/div[2]/main/div/div[2]/div[1]/div[2]/div/div[1]/div[1]/div/div/div[1]/div[2]/div[2]')
+             publish_date.append(publish_date_element.text)
+         except:
+             publish_date.append("Not available")
+     #### Converting the list to a pandas dataframe by converting the list to a dictionary ###
+     dict={'News Title':news_list,'News Link':news_link,'Publish Date':publish_date}
+     import pandas as pd
+     df=pd.DataFrame(dict)
+     if(number <=8 ):
+         df = df.head(number)
+
+     ############################################### Description Extraction #################################################
+     print('Description Extraction Started')
+     from newspaper import Article
+
+     text=[]
+     for i in range(len(df)):
+         url = df['News Link'][i]
+         article = Article(url)
+         article.download()
+         article.parse()
+
+         text.append(article.text)
+
+     df2=df.assign(Description=text)
+
+     for p in range(len(df2)):
+         if df2['Publish Date'][p]=="Not available":
+             df2.drop([p],inplace=True)
+             #df2.reset_index()
+
+     df2.reset_index(drop=True,inplace=True)
+
+     df2["Date + Desc"]=df2['Publish Date'] + ". News Description:"+ df2['Description']
+     return df2
+
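Both versions of this scraper locate elements through long absolute XPaths and pace themselves with fixed time.sleep calls, so a layout change or a slow load breaks the run. Below is a minimal sketch of the same "click load-more, then read the headlines" step using Selenium's explicit waits; the CSS selectors ("ul.pager a", "h3 a") are illustrative assumptions, not The Daily Star's actual markup.

# A sketch only: the CSS selectors below are hypothetical placeholders,
# not The Daily Star's real markup.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://www.thedailystar.net/news/bangladesh/accidents-fires")
wait = WebDriverWait(driver, 15)  # poll for up to 15 s instead of sleeping blindly

# Block until the load-more link is clickable, then click it.
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "ul.pager a"))).click()

# Block until headline anchors are present, then read them in one pass.
headlines = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "h3 a")))
news_list = [a.text for a in headlines]
news_link = [a.get_attribute("href") for a in headlines]
driver.quit()

An explicit wait returns as soon as its condition is met, so it is faster on good connections and more tolerant of slow ones than a fixed sleep.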
Prothom_alo_fully_scraped.py CHANGED
@@ -1,86 +1,80 @@
- def get_data(number):
-     print("Running Prothom_alo_fully_scraped")
-     ##Necessary imports
-     from selenium import webdriver
-     from selenium.webdriver import chrome
-     from selenium.webdriver import ChromeOptions
-     options = ChromeOptions()
-     options.add_argument("enable-automation");
-     options.add_argument("--window-size=1920,1080");
-     options.add_argument("--no-sandbox");
-     options.add_argument("--disable-extensions");
-     options.add_argument("--dns-prefetch-disable");
-     options.add_argument("--disable-gpu");
-     options.add_argument("--headless=new")
-     driver = webdriver.Chrome(options=options)
-     ## Finding Elements by XPATH
-     from selenium.webdriver.common.by import By
-
-     driver.get("https://en.prothomalo.com/search?q=road%20accident%20dhaka",)
-
-     import time
-     news_list=[]
-     news_link=[]
-     l=0
-     for i in range(number):
-         if i<15:
-             txt=driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]/div/div/div[2]/div/h3/a')
-             news_list.append(txt.text)
-             news_link.append(txt.get_attribute("href"))
-         else:
-             if (i-15)%10==0:
-                 time.sleep(5)
-                 last_height = driver.execute_script("return document.body.scrollHeight")
-                 driver.execute_script(f"window.scrollTo(0, {last_height-1200})")
-                 try:
-                     driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]/span').click()
-                 except:
-                     l=1
-                 if l==1:
-                     time.sleep(5)
-                     try:
-                         driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]').click()
-                     except:
-                         time.sleep(5)
-                         driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]').click()
-                     l=0
-                 time.sleep(5)
-             txt=driver.find_element('xpath',f'/html/body/div/div[6]/div/div/div[1]/div[3]/div[{i+1}]/div/div/div[2]/div/h3/a')
-             news_list.append(txt.text)
-             news_link.append(txt.get_attribute("href"))
-
-     ###### Scraping Publish Date and Description ######
-
-     publish_date=[]
-     text=[]
-     for i in range (len(news_link)):
-         driver.get(news_link[i])
-         try:
-             publish_date.append(driver.find_element('xpath','/html/body/div/div[6]/div/div/div/div[1]/div[1]/div[1]/div[2]/div[2]/div[1]/time/span').text)
-             tmp=""
-             elements = driver.find_elements(By.TAG_NAME, 'p')
-             for e in elements:
-                 tmp=tmp+e.text
-             text.append(tmp)
-         except:
-             publish_date.append("Not available")
-             text.append("Not Available")
-         time.sleep(3)
-
-     #### Converting the list to a pandas dataframe by converting the list to a dictionary ###
-     dict={'News Title':news_list,'News Link':news_link,'Publish Date':publish_date, 'Description':text}
-     import pandas as pd
-     df=pd.DataFrame(dict)
-     df2=df.copy()
-
-     for p in range(len(df2)):
-         if df2['Publish Date'][p]=="Not available":
-             df2.drop([p],inplace=True)
-             #df2.reset_index()
-     df2["Date + Desc"]=df2["Publish Date"] + df2["Description"]
-     df2.reset_index(drop=True,inplace=True)
-     return df2
-     #df3.to_csv('Prothom_Alo_Description.txt', index=False)
 
+ def get_data(number):
+     print("Running Prothom_alo_fully_scraped")
+     ##Necessary imports
+     from deep_translator import GoogleTranslator
+     from selenium import webdriver
+     from selenium.webdriver import chrome
+     from selenium.webdriver import ChromeOptions
+     options = ChromeOptions()
+     options.add_argument("--headless=new")
+     driver = webdriver.Chrome(options=options)
+     ## Finding Elements by XPATH
+     from selenium.webdriver.common.by import By
+
+     driver.get("https://www.prothomalo.com/topic/%E0%A6%B8%E0%A7%9C%E0%A6%95-%E0%A6%A6%E0%A7%81%E0%A6%B0%E0%A7%8D%E0%A6%98%E0%A6%9F%E0%A6%A8%E0%A6%BE")
+
+     import time, math
+     news_list=[]
+     news_link=[]
+     publish_date=[]
+     if number<=15:
+         txt=driver.find_elements(By.CLASS_NAME, "title-link")
+         date=driver.find_elements(By.TAG_NAME, "time")
+         for i in range(number):
+             news_list.append(txt[i].text)
+             news_link.append(txt[i].get_attribute("href"))
+             publish_date.append(date[i].text)
+
+     else:
+         clck=int((number-25)/15 + 2)
+         for i in range(clck):
+             print(i)
+             time.sleep(10)
+             last_height = driver.execute_script("return document.body.scrollHeight")
+             driver.execute_script(f"window.scrollTo(0, {last_height-850})")
+             button=driver.find_elements(By.CLASS_NAME, "tNj8k")
+             button[0].click()
+             time.sleep(5)
+         txt=driver.find_elements(By.CLASS_NAME, "title-link")
+         date=driver.find_elements(By.TAG_NAME, "time")
+         for i in range(number):
+             news_list.append(txt[i].text)
+             news_link.append(txt[i].get_attribute("href"))
+             publish_date.append(date[i].text)
+
+     ###### Scraping Description, modified for translation ######
+     text=[]
+     for i in range (len(news_link)):
+         driver.get(news_link[i])
+         try:
+             tmp=""
+             elements = driver.find_elements(By.TAG_NAME, 'p')
+             for i in range(len(elements)):
+                 if i>2 and len(tmp+elements[i].text) < 2000:
+                     tmp=tmp+elements[i].text
+
+             text.append(tmp)
+         except:
+             text.append("Not Available")
+         time.sleep(5)
+     ## Translation
+     for i in range(len(news_list)):
+         news_list[i] = GoogleTranslator(source='auto', target='en').translate(text=news_list[i])
+         text[i] = GoogleTranslator(source='auto', target='en').translate(text=text[i])
+         publish_date[i] = GoogleTranslator(source='auto', target='en').translate(text=publish_date[i])
+
+     #### Converting the list to a pandas dataframe by converting the list to a dictionary ###
+     dict={'News Title':news_list,'News Link':news_link,'Publish Date':publish_date, 'Description':text}
+     import pandas as pd
+     df=pd.DataFrame(dict)
+     df2=df.copy()
+
+     for p in range(len(df2)):
+         if df2['Publish Date'][p]=="Not available":
+             df2.drop([p],inplace=True)
+             #df2.reset_index()
+     df2["Date + Desc"]=df2["Publish Date"] + df2["Description"]
+     df2.reset_index(drop=True,inplace=True)
+     return df2
+     #df3.to_csv('Prothom_Alo_Description.txt', index=False)
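The new Prothom Alo version caps each description at 2,000 characters before handing it to GoogleTranslator, presumably because the translation backend rejects very long payloads (deep_translator's GoogleTranslator refuses texts over roughly 5,000 characters per call). A sketch of one way to lift that cap by translating in chunks; the translate_long helper and the 4,000-character chunk size are assumptions, not part of the script.

# A sketch under stated assumptions: translate_long() is a hypothetical
# helper, and 4000 is a conservative guess at a safe per-request size.
from deep_translator import GoogleTranslator

def translate_long(text, chunk_size=4000):
    """Translate text of arbitrary length by sending it in chunks."""
    translator = GoogleTranslator(source='auto', target='en')
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    return "".join(translator.translate(text=chunk) for chunk in chunks)

# Usage: would replace the per-item calls in the translation loop above, e.g.
# text[i] = translate_long(text[i])

Splitting at fixed offsets can cut a sentence in half, which degrades translation quality; splitting on sentence boundaries (for example, at the nearest danda or full stop) would be more faithful.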