Thamed-Chowdhury committed
Commit 8e1d875 · verified · 1 Parent(s): af1cd50

Upload 5 files

Daily_Star_fully_scraped.py CHANGED
@@ -1,92 +1,65 @@
- def get_data(number):
-     print("Running Daily_Star_Fully_Scraped")
-     ## Necessary imports
-     from selenium import webdriver
-     from selenium.webdriver import chrome
-     from selenium.webdriver import ChromeOptions
-     options = ChromeOptions()
-     options.add_argument("enable-automation")
-     options.add_argument("--window-size=1920,1080")
-     options.add_argument("--no-sandbox")
-     options.add_argument("--disable-extensions")
-     options.add_argument("--dns-prefetch-disable")
-     options.add_argument("--disable-gpu")
-     options.add_argument("--headless=new")
-     driver = webdriver.Chrome(options=options)
-     # Set a timeout for the page to load (in seconds)
-     driver.set_page_load_timeout(10)  # Limit page loading time to 10 seconds
-
-     ## Finding elements by XPath
-     from selenium.webdriver.common.by import By
-     driver.get("https://www.thedailystar.net/news/bangladesh/accidents-fires")
-     ### Extracting the first 8 news items separately
-     import time
-     news_list=[]
-     news_link=[]
-     for i in range(2,10):
-         txt=driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[1]/div/div/div/div[{i}]/div/div/h3/a')
-         news_list.append(txt.text)
-         news_link.append(txt.get_attribute("href"))
-     # Rest of the news title and news link extraction
-     number2=number-8
-     import time
-     if number2>0:
-         for i in range(number2):
-             #time.sleep(5)
-             if (i+1)!=0 and (i+1)%10==0:
-                 last_height = driver.execute_script("return document.body.scrollHeight")
-                 driver.execute_script(f"window.scrollTo(0, {last_height-950})")
-                 driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[3]/div/div/div[1]/div/div[2]/ul/li/a').click()
-                 time.sleep(10)
-             txt=driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[3]/div/div/div[1]/div/div[1]/div[{i+1}]/div[2]/h3/a')
-             news_list.append(txt.text)
-             news_link.append(txt.get_attribute("href"))
-     ###### Scraping publish date ######
-
-     publish_date=[]
-     for i in range(len(news_link)):
-         try:
-             driver.get(news_link[i])
-         except:
-             time.sleep(30)
-             driver.get(news_link[i])
-         time.sleep(3)
-         driver.execute_script("window.stop();")
-         try:
-             publish_date.append(driver.find_element('xpath','/html/body/div[3]/div[2]/div/div/div[2]/main/div/div[2]/div[1]/div[2]/div/div[1]/div[1]/div/div/div[1]/div[2]/div[2]').text)
-         except:
-             publish_date.append("Not available")
-     #### Converting the lists to a pandas DataFrame via a dictionary ####
-     dict={'News Title':news_list,'News Link':news_link,'Publish Date':publish_date}
-     import pandas as pd
-     df=pd.DataFrame(dict)
-     if(number <=8 ):
-         df = df.head(number)
-
-     ############################## Description Extraction ##############################
-     print('Description Extraction Started')
-     from newspaper import Article
-
-
-     text=[]
-     for i in range(len(df)):
-         url = df['News Link'][i]
-         article = Article(url)
-         article.download()
-         article.parse()
-
-         text.append(article.text)
-
-     df2=df.assign(Description=text)
-
-
-     for p in range(len(df2)):
-         if df2['Publish Date'][p]=="Not available":
-             df2.drop([p],inplace=True)
-     #df2.reset_index()
-
-     df2.reset_index(drop=True,inplace=True)
-
-     df2["Date + Desc"]=df2['Publish Date'] + ". News Description:"+ df2['Description']
-     return df2
-
 
+ def get_data(number):
+     print("Running Daily_Star_Fully_Scraped")
+     ## Necessary imports
+     from selenium import webdriver
+     from selenium.webdriver import chrome
+     from selenium.webdriver import ChromeOptions
+     options = ChromeOptions()
+     options.add_argument("enable-automation")
+     options.add_argument("--window-size=1920,1080")
+     options.add_argument("--no-sandbox")
+     options.add_argument("--disable-extensions")
+     options.add_argument("--dns-prefetch-disable")
+     options.add_argument("--disable-gpu")
+     # options.add_argument("--headless=new")
+     driver = webdriver.Chrome(options=options)
+     # Set a timeout for the page to load (in seconds)
+     driver.set_page_load_timeout(10)  # Limit page loading time to 10 seconds
+
+     ## Finding elements by XPath
+     from selenium.webdriver.common.by import By
+     driver.get("https://www.thedailystar.net/news/bangladesh/accidents-fires")
+     ### Extracting the first 8 news items separately
+     import time
+     news_list=[]
+     news_link=[]
+     for i in range(2,10):
+         txt=driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[1]/div/div/div/div[{i}]/div/div/h3/a')
+         news_list.append(txt.text)
+         news_link.append(txt.get_attribute("href"))
+     # Rest of the news title and news link extraction
+     number2=number-8
+     import time
+     if number2>0:
+         for i in range(number2):
+             #time.sleep(5)
+             if (i+1)!=0 and (i+1)%10==0:
+                 last_height = driver.execute_script("return document.body.scrollHeight")
+                 driver.execute_script(f"window.scrollTo(0, {last_height-950})")
+                 driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[3]/div/div/div[1]/div/div[2]/ul/li/a').click()
+                 time.sleep(10)
+             txt=driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[3]/div/div/div[1]/div/div[1]/div[{i+1}]/div[2]/h3/a')
+             news_list.append(txt.text)
+             news_link.append(txt.get_attribute("href"))
+     # Goose3 extraction
+     from deep_translator import GoogleTranslator
+     from goose3 import Goose
+     from datetime import datetime
+     g = Goose()
+     description=[]
+     News_title=[]
+     publish_date=[]
+     for i in range(len(news_link)):
+         article = g.extract(url=news_link[i])
+         News_title.append(article.title)
+         description.append(article.cleaned_text)
+         publish_date.append(article.publish_date)
+     # Convert the dates to "day-month-year" format
+     formatted_dates = [datetime.fromisoformat(date).strftime('%d-%m-%Y') for date in publish_date]
+
+     #### Converting the lists to a pandas DataFrame via a dictionary ####
+     dict={'News Title':News_title,'News Link':news_link,'Publish Date':formatted_dates, 'Description':description}
+     import pandas as pd
+     df=pd.DataFrame(dict)
+     return df
+
 
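Both versions format dates as day-month-year, but the new goose3 path does it with datetime.fromisoformat over whatever article.publish_date returns. A minimal sketch of that step, assuming goose3 hands back ISO-8601 strings; on pages where it returns None, the list comprehension above would raise, so the sketch guards for that case:

from datetime import datetime

def format_dates(publish_dates):
    # Convert ISO-8601 strings (e.g. "2024-05-01T10:30:00+06:00") to
    # day-month-year, tolerating missing dates instead of raising mid-scrape.
    formatted = []
    for date in publish_dates:
        if date:
            formatted.append(datetime.fromisoformat(date).strftime('%d-%m-%Y'))
        else:
            formatted.append("Not available")
    return formatted

print(format_dates(["2024-05-01T10:30:00+06:00", None]))  # ['01-05-2024', 'Not available']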
Dhaka_Tribune_Fully_Scraped.py CHANGED
@@ -1,89 +1,68 @@
- def get_data(number):
-     ## Necessary imports
-     from selenium import webdriver
-     from selenium.webdriver import chrome
-     from selenium.webdriver import ChromeOptions
-     import math
-     options = ChromeOptions()
-     options.add_argument("enable-automation");
-     options.add_argument("--window-size=1920,1080");
-     options.add_argument("--no-sandbox");
-     options.add_argument("--disable-extensions");
-     options.add_argument("--dns-prefetch-disable");
-     options.add_argument("--disable-gpu");
-     options.add_argument("--headless=new")
-     driver = webdriver.Chrome(options=options)
-     ## Finding elements by XPath
-     from selenium.webdriver.common.by import By
-
-
-     driver.get("https://www.dhakatribune.com/topic/road-accident")
-
-     #### Scraping news title and news link ####
-     print(driver.current_url)
-     import time
-     news_list=[]
-     news_link=[]
-     publish_date=[]
-     row_counter=0
-     news_counter=0
-     for i in range(number):
-         if i==0:
-             row_counter=1
-         else:
-             row_counter=math.ceil(i/4)
-         news_counter=i%4+1
-         #time.sleep(5)
-         if (i+1)!=0 and (i+1)%20==0:
-             last_height = driver.execute_script("return document.body.scrollHeight")
-             driver.execute_script(f"window.scrollTo(0, {last_height})")
-             driver.find_element('xpath',f'/html/body/div[3]/div/div[2]/div/div/div[2]/div/div/div/div[2]/div/div[2]/button').click()
-             time.sleep(10)
-         txt=driver.find_element('xpath',f'/html/body/div[3]/div/div[2]/div/div/div[2]/div/div/div/div[2]/div/div[1]/div[{row_counter}]/div[{news_counter}]/div/div[2]/div/div/div/h2')
-         #publish_date.append(driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[5]/div/div/div[1]/div/div[1]/div[{i+1}]/div[1]').text)
-         news_list.append(txt.text)
-         news_link.append(txt.get_attribute("href"))
-
-     ###### Scraping publish date ######
-     publish_date=[]
-     for i in range(len(news_link)):
-         driver.get(news_link[i])
-         time.sleep(6)
-         driver.execute_script("window.stop();")
-         try:
-             publish_date.append(driver.find_element('xpath','/html/body/div[3]/div/div[2]/div/div/div[2]/div/div[1]/div/div[1]/div/div/div/div/div/div/div[2]/div/div/div[3]/div/div[1]/div/div[2]/span[1]').text)
-         except:
-             publish_date.append("Not available")
-
-     #### Converting the lists to a pandas DataFrame via a dictionary ####
-     dict={'News Title':news_list,'News Link':news_link,'Publish Date':publish_date}
-     import pandas as pd
-     df=pd.DataFrame(dict)
-
-
-     ############################## Description Extraction ##############################
-
-     from newspaper import Article
-     text=[]
-     for i in range(len(df)):
-         url = df['News Link'][i]
-         article = Article(url)
-         article.download()
-         article.parse()
-
-         text.append(article.text)
-
-
-     df2=df.assign(Description=text)
-     for p in range(len(df2)):
-         if df2['Publish Date'][p]=="Not available":
-             df2.drop([p],inplace=True)
-
-     df2.reset_index(drop=True,inplace=True)
-     df2["Date + Desc"]=df2['Publish Date'] + ". News Description:"+ df2['Description']
-
-
-
-     return df2
-
-     #df3.to_csv('Dhaka_Tribune_Description.txt', index=False)
 
+ def get_data(number):
+     # Dhaka Tribune implementation
+     ## Necessary imports
+     from selenium import webdriver
+     from selenium.webdriver import chrome
+     from selenium.webdriver import ChromeOptions
+     import math
+     options = ChromeOptions()
+     options.add_argument("enable-automation")
+     options.add_argument("--window-size=1920,1080")
+     options.add_argument("--no-sandbox")
+     options.add_argument("--disable-extensions")
+     options.add_argument("--dns-prefetch-disable")
+     options.add_argument("--disable-gpu")
+     #options.setPageLoadStrategy(PageLoadStrategy.NORMAL);
+     options.add_argument("--headless=new")
+     driver = webdriver.Chrome(options=options)
+     ## Finding elements by XPath
+     from selenium.webdriver.common.by import By
+
+
+     driver.get("https://www.dhakatribune.com/topic/road-accident")
+
+     #### Scraping news title and news link ####
+     import time
+     news_list=[]
+     news_link=[]
+     publish_date=[]
+     row_counter=0
+     news_counter=0
+     for i in range(number):
+         if i==0:
+             row_counter=1
+         else:
+             row_counter=math.ceil(i/4)
+         news_counter=i%4+1
+         #time.sleep(5)
+         if (i+1)!=0 and (i+1)%20==0:
+             last_height = driver.execute_script("return document.body.scrollHeight")
+             driver.execute_script(f"window.scrollTo(0, {last_height})")
+             driver.find_element('xpath',f'/html/body/div[3]/div/div[2]/div/div/div[2]/div/div/div/div[2]/div/div[2]/button').click()
+             time.sleep(10)
+         txt=driver.find_element('xpath',f'/html/body/div[3]/div/div[2]/div/div/div[2]/div/div/div/div[2]/div/div[1]/div[{row_counter}]/div[{news_counter}]/div/div[2]/div/div/div/h2/a')
+         #publish_date.append(driver.find_element('xpath',f'/html/body/div[3]/div/div/div/div[2]/main/div/div[2]/div/div[5]/div/div/div[1]/div/div[1]/div[{i+1}]/div[1]').text)
+         news_list.append(txt.text)
+         news_link.append(txt.get_attribute("href"))
+
+     # Goose3 extraction
+     from deep_translator import GoogleTranslator
+     from goose3 import Goose
+     from datetime import datetime
+     g = Goose()
+     description=[]
+     News_title=[]
+     publish_date=[]
+     for i in range(len(news_link)):
+         article = g.extract(url=news_link[i])
+         News_title.append(article.title)
+         description.append(article.cleaned_text)
+         publish_date.append(article.publish_date)
+     # Convert the dates to "day-month-year" format
+     formatted_dates = [datetime.fromisoformat(date).strftime('%d-%m-%Y') for date in publish_date]
+
+     #### Converting the lists to a pandas DataFrame via a dictionary ####
+     dict={'News Title':News_title,'News Link':news_link,'Publish Date':formatted_dates, 'Description':description}
+     import pandas as pd
+     df=pd.DataFrame(dict)
+     return df
 
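The title loop above converts the flat index i into a (row, column) pair because the topic page lays its cards out four to a row. A standalone sketch of that arithmetic (values illustrative) makes the mapping easy to inspect; note that i=0 and i=4 both resolve to row 1, column 1, so the first card appears to be visited twice:

import math

# Reproduce the grid arithmetic from the loop above for the first nine indices.
for i in range(9):
    row = 1 if i == 0 else math.ceil(i / 4)
    col = i % 4 + 1
    print(f"i={i} -> row {row}, col {col}")
# i=0 -> (1,1), i=1 -> (1,2), i=2 -> (1,3), i=3 -> (1,4), i=4 -> (1,1), i=5 -> (2,2) ...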
LLM_automation_GPT.py CHANGED
@@ -75,7 +75,7 @@ def create_data(description):
          dj2.append(res(i))
      ### Finding vehicle
      def res2(i):
-         response=chain.invoke({"question" : df2['Date + Desc'][i]+" Only name the type of vehicles involved in the accident. If multiple vehicles are involved, separate them by hyphens (-). Example answers: Bus, Truck-Bus etc. If no vehicles are mentioned, your answer will be: Not Available. Your answer should only contain the vehicle name; do not include any extra sentences."})
+         response=chain.invoke({"question" : df2['Description'][i]+" Only name the type of vehicles involved in the accident. If multiple vehicles are involved, separate them by hyphens (-). Example answers: Bus, Truck-Bus etc. If no vehicles are mentioned, your answer will be: Not Available. Your answer should only contain the vehicle name; do not include any extra sentences."})
          return response
      #### vehicle list contains all vehicles involved:
      vehicles=[]
@@ -121,6 +121,6 @@ def create_data(description):
      df2["Road_Characteristic"]=Road_Characteristic
      df2["Pedestrian_Involved"]=Pedestrian_Involved
      df2["Vehicles Involved"]=vehicles
-     df3=df2.drop(columns=['Description','Date + Desc','Report Type'])
+     df3=df2.drop(columns=['Description','Report Type'])
      return df3
 
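The only functional change in this file is the prompt's context column: the scraper rewrite removed Date + Desc, so the question is now built from Description alone. A minimal sketch of the pattern, with chain standing in for the prompt-plus-LLM pipeline this file constructs elsewhere (the helper name ask_vehicles is hypothetical):

VEHICLE_INSTRUCTION = (
    " Only name the type of vehicles involved in the accident. "
    "If multiple vehicles are involved, separate them by hyphens (-). "
    "Example answers: Bus, Truck-Bus etc. If no vehicles are mentioned, "
    "your answer will be: Not Available."
)

def ask_vehicles(chain, df2, i):
    # The scraped article text is prepended and the instruction appended,
    # then the combined string is sent to the chain as one "question".
    return chain.invoke({"question": df2['Description'][i] + VEHICLE_INSTRUCTION})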
LLM_automation_Groq.py CHANGED
@@ -15,7 +15,10 @@ def create_data(description):
      ### Set all api keys:

      #os.environ["LANGCHAIN_TRACING_V2"]="true" ### Will automatically trace our code using LangSmith
-     os.environ["GROQ_API_KEY"]="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" #### Will be used for monitoring the calls to and from the LLM (both free and paid)
+     #os.environ["GROQ_API_KEY"]="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" #### Will be used for monitoring the calls to and from the LLM (both free and paid)
+     os.environ["GROQ_API_KEY"]="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"
+
+

      ### Create Prompt Template:
      prompt=ChatPromptTemplate.from_messages(
@@ -80,7 +83,7 @@ def create_data(description):

      ### A function to invoke the llm. For some reason phi3 doesn't give accurate results sometimes if used directly in dj.append()
      def res2(i):
-         response=chain.invoke({"question" : df2['Date + Desc'][i]+" Only name the type of vehicles involved in the accident. If multiple vehicles are involved, separate them by hyphens (-). Example answers: Bus, Truck-Bus etc. If no vehicles are mentioned, your answer will be: Not Available. Your answer should only contain the vehicle name; do not include any extra sentences."})
+         response=chain.invoke({"question" : df2['Description'][i]+" Only name the type of vehicles involved in the accident. If multiple vehicles are involved, separate them by hyphens (-). Example answers: Bus, Truck-Bus etc. If no vehicles are mentioned, your answer will be: Not Available. Your answer should only contain the vehicle name; do not include any extra sentences."})
          return response
      #### dj2 list contains all column values separated by commas:
      vehicles=[]
@@ -124,7 +127,7 @@ def create_data(description):
      df2["Road_Characteristic"]=Road_Characteristic
      df2["Pedestrian_Involved"]=Pedestrian_Involved
      df2["Vehicles_involved"]=vehicles
-     df3=df2.drop(columns=['Description','Date + Desc','Report Type'])
+     df3=df2.drop(columns=['Description','Report Type'])
      return df3

 
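This hunk swaps one hardcoded Groq key for another, so a live secret stays in version control either way. A short sketch of the usual alternative, assuming the key is supplied by the runtime environment (for example as a Space secret) rather than the source file:

import os

# Read the key from the environment instead of committing it to the repo,
# and fail fast with a clear message when it was never configured.
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    raise RuntimeError("GROQ_API_KEY is not set; configure it as a deployment secret.")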
Prothom_alo_fully_scraped.py CHANGED
@@ -1,106 +1,79 @@
- def get_data(number):
-     print("Running Prothom_alo_fully_scraped")
-     ## Necessary imports
-     from deep_translator import GoogleTranslator
-     from selenium import webdriver
-     from selenium.webdriver import chrome
-     from selenium.webdriver import ChromeOptions
-     from datetime import datetime, timedelta
-     import re
-     options = ChromeOptions()
-     options.add_argument("enable-automation")
-     options.add_argument("--window-size=1920,1080")
-     options.add_argument("--no-sandbox")
-     options.add_argument("--disable-extensions")
-     options.add_argument("--dns-prefetch-disable")
-     options.add_argument("--disable-gpu")
-     options.add_argument("--headless=new")
-     driver = webdriver.Chrome(options=options)
-     ## Finding elements by XPath
-     from selenium.webdriver.common.by import By
-     import time, math
-     driver.get("https://www.prothomalo.com/topic/%E0%A6%B8%E0%A7%9C%E0%A6%95-%E0%A6%A6%E0%A7%81%E0%A6%B0%E0%A7%8D%E0%A6%98%E0%A6%9F%E0%A6%A8%E0%A6%BE")
-     time.sleep(15)
-     news_list=[]
-     news_link=[]
-     publish_date=[]
-     if number<=15:
-         print("Entered number<=15 condition")
-         txt=driver.find_elements(By.CLASS_NAME, "title-link")
-         date=driver.find_elements(By.TAG_NAME, "time")
-         for i in range(number):
-             news_list.append(txt[i].text)
-             news_link.append(txt[i].get_attribute("href"))
-             publish_date.append(date[i].text)
-
-     else:
-         clck=int((number-25)/15 + 2)
-         for i in range(clck):
-             print(i)
-             time.sleep(10)
-             last_height = driver.execute_script("return document.body.scrollHeight")
-             driver.execute_script(f"window.scrollTo(0, {last_height-1050})")
-             button=driver.find_elements(By.CLASS_NAME, "tNj8k")
-             button[0].click()
-             time.sleep(5)
-         txt=driver.find_elements(By.CLASS_NAME, "title-link")
-         date=driver.find_elements(By.TAG_NAME, "time")
-         for i in range(number):
-             news_list.append(txt[i].text)
-             news_link.append(txt[i].get_attribute("href"))
-             publish_date.append(date[i].text)
-
-     ###### Scraping description, modified for translation ######
-     print("Description modified Translation block")
-     text=[]
-     for i in range(len(news_link)):
-         print(i,news_link[i])
-         driver.get(news_link[i])
-         try:
-             tmp=""
-             elements = driver.find_elements(By.TAG_NAME, 'p')
-             for i in range(len(elements)):
-                 if i>2 and len(tmp+elements[i].text) < 2000:
-                     tmp=tmp+elements[i].text
-
-             text.append(tmp)
-         except:
-             text.append("Not Available")
-         time.sleep(5)
-     ## Translation
-     print("Translation Started")
-     for i in range(len(news_list)):
-         news_list[i] = GoogleTranslator(source='auto', target='en').translate(text=news_list[i])
-         text[i] = GoogleTranslator(source='auto', target='en').translate(text=text[i])
-         publish_date[i] = GoogleTranslator(source='auto', target='en').translate(text=publish_date[i])
-
-     #### Converting the lists to a pandas DataFrame via a dictionary ####
-     dict={'News Title':news_list,'News Link':news_link,'Publish Date':publish_date, 'Description':text}
-     import pandas as pd
-     df=pd.DataFrame(dict)
-     df2=df.copy()
-
-
-     for p in range(len(df2)):
-         if df2['Publish Date'][p]=="Not available":
-             df2.drop([p],inplace=True)
-     #df2.reset_index()
-     df2["Date + Desc"]=df2["Publish Date"] + df2["Description"]
-     df2.reset_index(drop=True,inplace=True)
-     # Function to convert relative time to date and format as day-month-year
-     def convert_relative_time_to_date(time_str):
-         if 'hours ago' in time_str or 'hour ago' in time_str:
-             hours = int(re.search(r'(\d+)', time_str).group(1))
-             return (datetime.now() - timedelta(hours=hours)).strftime('%d-%m-%Y')
-         elif 'days ago' in time_str or 'day ago' in time_str:
-             days = int(re.search(r'(\d+)', time_str).group(1))
-             return (datetime.now() - timedelta(days=days)).strftime('%d-%m-%Y')
-         else:
-             # If it's already a date string, return it in day-month-year format
-             return pd.to_datetime(time_str).strftime('%d-%m-%Y')
-
-     # Apply the function to the DataFrame
-     df2['Publish Date'] = df2['Publish Date'].apply(convert_relative_time_to_date)
-
-     return df2
-     #df3.to_csv('Prothom_Alo_Description.txt', index=False)
 
+ def get_data(number):
+     print("Running Prothom_alo_fully_scraped")
+     ## Necessary imports
+     from deep_translator import GoogleTranslator
+     from selenium import webdriver
+     from selenium.webdriver import chrome
+     from selenium.webdriver import ChromeOptions
+     from datetime import datetime, timedelta
+     import re
+     #PROXY = "45.251.231.113:5678"
+     options = ChromeOptions()
+     #options.add_argument('--proxy-server=%s' % PROXY)
+     #options.add_argument("--headless=new")
+     driver = webdriver.Chrome(options=options)
+     ## Finding elements by XPath
+     from selenium.webdriver.common.by import By
+     import time, math
+     driver.get("https://www.prothomalo.com/topic/%E0%A6%B8%E0%A7%9C%E0%A6%95-%E0%A6%A6%E0%A7%81%E0%A6%B0%E0%A7%8D%E0%A6%98%E0%A6%9F%E0%A6%A8%E0%A6%BE")
+     time.sleep(15)
+     news_list=[]
+     news_link=[]
+     publish_date=[]
+     if number<=15:
+         txt=driver.find_elements(By.CLASS_NAME, "title-link")
+         date=driver.find_elements(By.TAG_NAME, "time")
+         for i in range(number):
+             news_list.append(txt[i].text)
+             news_link.append(txt[i].get_attribute("href"))
+             publish_date.append(date[i].text)
+
+     else:
+         clck=int((number-25)/15 + 2)
+         for i in range(clck):
+             print(i)
+             time.sleep(10)
+             last_height = driver.execute_script("return document.body.scrollHeight")
+             driver.execute_script(f"window.scrollTo(0, {last_height-1050})")
+             button=driver.find_elements(By.CLASS_NAME, "tNj8k")
+             button[0].click()
+             time.sleep(5)
+         txt=driver.find_elements(By.CLASS_NAME, "title-link")
+         date=driver.find_elements(By.TAG_NAME, "time")
+         for i in range(number):
+             news_list.append(txt[i].text)
+             news_link.append(txt[i].get_attribute("href"))
+             publish_date.append(date[i].text)
+
+     ###### Scraping description, modified for translation ######
+     from deep_translator import GoogleTranslator
+     from goose3 import Goose
+     from datetime import datetime
+     g = Goose()
+     description=[]
+     News_title=[]
+     publish_date=[]
+     for i in range(len(news_link)):
+         print(i)
+         article = g.extract(url=news_link[i])
+         ### Only for Prothom Alo ###
+         # Access the articleBody field
+         data = article.schema
+         article_body = data.get('articleBody')
+         print("Para length", len(article_body))
+         if(len(article_body)>=2200):
+             article_body=article_body[0:2200]
+         bangla_title=article.title
+         english_title = GoogleTranslator(source='auto', target='en').translate(text=bangla_title)
+         News_title.append(english_title)
+         text = GoogleTranslator(source='auto', target='en').translate(text=article_body)
+         description.append(text)
+         publish_date.append(article.publish_date)
+     # Convert the dates to "day-month-year" format
+     formatted_dates = [datetime.fromisoformat(date).strftime('%d-%m-%Y') for date in publish_date]
+
+     #### Converting the lists to a pandas DataFrame via a dictionary ####
+     dict={'News Title':News_title,'News Link':news_link,'Publish Date':formatted_dates, 'Description':description}
+     import pandas as pd
+     df=pd.DataFrame(dict)
+     return df
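
The new pipeline reads articleBody out of the page's JSON-LD via article.schema and truncates it to 2,200 characters before translating, which keeps each GoogleTranslator call inside the service's request-size limit but silently drops the tail of long articles. A chunked alternative, sketched on the assumption that requests of roughly 2,000 characters are safe (the exact cap, and the helper name translate_long, are assumptions here):

from deep_translator import GoogleTranslator

def translate_long(text, chunk_size=2000):
    # Translate fixed-size slices and rejoin them instead of truncating.
    # Slicing at raw character offsets can split a word; a production
    # version would prefer sentence boundaries.
    translator = GoogleTranslator(source='auto', target='en')
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    return " ".join(translator.translate(text=chunk) for chunk in chunks)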