soso-11 committed (verified)
Commit 75fa9d1 · 1 Parent(s): cd06a5d

Upload 9 files

Files changed (9)
  1. .env +9 -0
  2. app.py +106 -0
  3. config.json +54 -0
  4. economy_data_en.csv +23 -0
  5. file_id_mapping.json +1 -0
  6. google_drive_handle.py +64 -0
  7. hespress_ar.py +151 -0
  8. hespress_en.py +124 -0
  9. hespress_fr.py +151 -0
.env ADDED
@@ -0,0 +1,9 @@
CLIENT_ID=340585044850-o53v0f2eam1is8f50jk4v9jvl4fskasi.apps.googleusercontent.com
PROJECT_ID=moroccan-news-file-share
AUTH_URI=https://accounts.google.com/o/oauth2/auth
TOKEN_URI=https://oauth2.googleapis.com/token
AUTH_PROVIDER_X509_CERT_URL=https://www.googleapis.com/oauth2/v1/certs
CLIENT_SECRET=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
REFRESH_TOKEN=1//04FPKbmAI7iv3CgYIARAAGAQSNwF-L9IrFAlZLzJUUf6c3OYtbJhGm_OO9oa3RW4WuNdDWRIZ8QmjdWI083D7HXxmKj8xPe4gA7w
REDIRECT_URIS=https://developers.google.com/oauthplayground,http://localhost:8501,http://localhost:8080
app.py ADDED
@@ -0,0 +1,106 @@
# Web interface

import streamlit as st
import pandas as pd
import json
import importlib
import google_drive_handle as gdrive
from dotenv import load_dotenv
import os

# Load config.json
with open('config.json') as f:
    config = json.load(f)

drive = gdrive.authenticate_google_drive()
processed_files = set()

st.markdown(
    """
    <style>
    .centered {
        display: flex;
        align-items: center;
        justify-content: center;
        text-align: center;
    }
    </style>
    """,
    unsafe_allow_html=True
)

st.markdown("<h1 class='centered'>Moroccan News Aggregator</h1>", unsafe_allow_html=True)

selected_websites = {}
selected_categories = {}
selected_languages = {}

def save_file_id_mapping(file_id_mapping):
    with open("file_id_mapping.json", "w") as file:
        json.dump(file_id_mapping, file)

def load_file_id_mapping():
    try:
        with open("file_id_mapping.json", "r") as file:
            return json.load(file)
    except FileNotFoundError:
        return {}  # Return an empty dictionary if the file doesn't exist

file_id_mapping = load_file_id_mapping()

for website, details in config.items():
    if st.checkbox(website, key=website):
        # Language selection
        languages = details.get("languages", {})
        if languages and len(languages) > 1:
            language = st.selectbox(f'Choose language for {website}', list(languages.keys()), key=f'lang_{website}')
            selected_websites[website] = f"{website}_{language}"  # like: hespress_en
        else:
            language = next(iter(languages), None)  # single-language site: use its only entry
            selected_websites[website] = website  # like: akhbarona
        selected_languages[website] = language

        # Category selection
        available_categories = languages.get(language, {}) if language else {}
        if available_categories:
            selected_categories[website] = st.multiselect(
                f'Select categories for {website}', list(available_categories.keys()), key=f'{website}_categories')

# Number of articles input
num_articles = st.number_input('Number of Articles', min_value=1, max_value=10000, step=1)

# Start scraping button
if st.button('Start Scraping'):
    with st.spinner('Scraping in progress...'):
        progress_bar = st.progress(0)
        total_tasks = sum(len(categories) for categories in selected_categories.values())
        completed_tasks = 0
        for website, module_name in selected_websites.items():
            scraper_module = importlib.import_module(module_name)
            language = selected_languages[website]
            for category in selected_categories.get(website, []):
                category_url = config[website]['languages'][language][category]

                file_path = scraper_module.scrape_category(category_url, num_articles)

                if file_path:
                    if file_path not in file_id_mapping:
                        file_id = gdrive.upload_file_to_drive(drive, file_path)
                        print(f"Uploading file: {file_path}, File ID: {file_id}")
                        file_id_mapping[file_path] = file_id
                        save_file_id_mapping(file_id_mapping)
                    else:
                        file_id = file_id_mapping[file_path]
                        print(f"File already uploaded. Using existing File ID: {file_id}")

                    if file_id:
                        download_link = gdrive.get_drive_download_link(drive, file_id)
                        if download_link:
                            st.markdown(f"[Download {website} - {category} data]({download_link})", unsafe_allow_html=True)
                            df = pd.read_csv(file_path)
                            st.write(f"{website} - {category} Data:")
                            st.dataframe(df)
                        else:
                            st.error(f"Failed to retrieve download link for file ID: {file_id}")
                    else:
                        st.error(f"Failed to upload file for {website} - {category}")
                else:
                    st.error(f"File not created for {website} - {category}")

                # Advance the progress bar after each (website, category) task
                completed_tasks += 1
                if total_tasks:
                    progress_bar.progress(completed_tasks / total_tasks)

        st.success('Scraping Completed!')
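The wiring above is config-driven: the checkbox/selectbox loop records a module name of the form <website>_<language> (e.g. hespress_en), which the scraping loop imports with importlib and drives through its scrape_category(category_url, num_articles) function. A minimal sketch of the same dispatch outside Streamlit, assuming the repository root is the working directory and Chrome is available for the Selenium-based scrapers (the site/language/category values are just an example):

    import importlib
    import json

    with open('config.json') as f:
        config = json.load(f)

    website, language, category = 'hespress', 'en', 'economy'   # example selection
    category_url = config[website]['languages'][language][category]

    # Same naming rule app.py uses for multi-language sites: "<website>_<language>"
    scraper = importlib.import_module(f"{website}_{language}")
    csv_path = scraper.scrape_category(category_url, num_articles=5)
    print("CSV written to:", csv_path)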
config.json ADDED
@@ -0,0 +1,54 @@
{
    "hespress": {
        "languages": {
            "en": {
                "politics": "https://en.hespress.com/politics",
                "economy": "https://en.hespress.com/economy",
                "society": "https://en.hespress.com/society",
                "culture": "https://en.hespress.com/culture",
                "sports": "https://en.hespress.com/sports",
                "mena": "https://en.hespress.com/mena",
                "international": "https://en.hespress.com/international"
            },
            "ar": {
                "Politique": "https://www.hespress.com/politique",
                "Economie": "https://www.hespress.com/economie",
                "Tamazight": "https://www.hespress.com/tamazight",
                "Sport": "https://www.hespress.com/sport",
                "Société": "https://www.hespress.com/societe",
                "Culture": "https://www.hespress.com/art-et-culture",
                "Médias": "https://www.hespress.com/medias",
                "Faits-divers": "https://www.hespress.com/faits-divers",
                "Automoto": "https://www.hespress.com/automoto",
                "Regions": "https://www.hespress.com/regions"
            },
            "fr": {
                "Politique": "https://fr.hespress.com/politique",
                "Economie": "https://fr.hespress.com/economie",
                "Monde": "https://fr.hespress.com/monde",
                "Sport": "https://fr.hespress.com/sport",
                "Société": "https://fr.hespress.com/societe",
                "Culture": "https://fr.hespress.com/culture",
                "Médias": "https://fr.hespress.com/media",
                "High-tech": "https://fr.hespress.com/high-tech",
                "Opinions": "https://fr.hespress.com/opinions",
                "Regions": "https://fr.hespress.com/regions"
            }
        },
        "module": "hespress"
    },
    "akhbarona": {
        "languages": {
            "ar": {},
            "fr": {}
        },
        "module": "akhbarona"
    },
    "le360": {
        "languages": {
            "ar": {},
            "fr": {}
        },
        "module": "le360"
    }
}
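config.json is the single source of truth for the UI: each top-level key becomes a checkbox, each entry under "languages" a selectbox option, and each category maps to the listing URL that gets scraped. The per-site "module" key is currently unused by app.py, which derives the module name from the site and language instead; akhbarona and le360 are placeholders here (empty category maps, no matching scraper modules in this upload), so only the hespress entries are usable. A small, purely illustrative check of which config entries have a scraper module on disk:

    import importlib.util
    import json

    with open('config.json') as f:
        config = json.load(f)

    for site, details in config.items():
        languages = details.get('languages', {})
        for lang, categories in languages.items():
            # app.py imports "<site>_<lang>" when a site offers several languages, else just "<site>"
            module_name = f"{site}_{lang}" if len(languages) > 1 else site
            status = 'found' if importlib.util.find_spec(module_name) else 'missing'
            print(f"{module_name}: {status}, {len(categories)} categories")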
economy_data_en.csv ADDED
@@ -0,0 +1,23 @@
Title,Date,Category,Content,Link,Image
Forum Mines Rabat Entreprises fosters discussions on Morocco's industrial sovereignty,Thursday 8 February 2024 - 10:40,economy,"The 24th edition of “Forum Mines Rabat Entreprises,” hosted by the Forum Committee of Rabat’s National School of Mines on February 6-7 to, serving as an opportunity to discuss the key role of expertise in safeguarding Morocco’s industrial sovereignty.
Held under the theme of “The Kingdom’s New Industrial Strategy: Support for Innovation and Realization of Moroccan Sovereignty,” the forum brought together different experts from the industrial sector to highlight the importance of innovation and explore ways to strengthen Morocco’s industrial sovereignty.
Among them were Ayoub Daoudi, the General Manager of TE Connectivity North Africa, Laurent Figari, the General Manager of Safran Electronics & Defense, and Abderrazak Ben Saga, Head of the Information and Guidance Division and National Coordination of Career Centers.




In this regard, Daoudi highlighted the two trends and essential drivers that shape all technological evolutions: “all green and all connected.”
The notion of “all connected” signifies the rise in the presence of artificial intelligence and electronics, explained the expert, leading to a surge in the adoption of automation.
This connectivity implies that the level of reliability will play an increasingly vital role, he underscored.
This transformative shift holds inevitable implications for both the industrial sector and professionals, namely engineers and technicians of tomorrow.
“For this reason, the demand for a higher level of expertise will continue to grow,” underscored the General Manager of TE Connectivity North Africa.
Speaking of the two essential elements of industrial sovereignty, Daoudi highlighted that “while capital holds its significance, it is the expertise and know-how that play a truly essential role in ensuring industrial sovereignty.”
They are key elements for building and securing industrial sovereignty from one generation to the next, he stressed.
Daoudi emphasized the vital need to align our educational system with technological skills in today’s environment. He believes that expertise and technical proficiency are interdependent and inseparable.
“We no longer need engineers who know everything but master nothing. Instead, a combination of both profiles is necessary,” he stressed.
The General Manager of TE Connectivity North Africa also acknowledged the fact that Morocco places a high priority on developing knowledge and expertise in all sectors that are pivotal for achieving industrial sovereignty.
“This technological aspect will enable us to build our ecosystem and adapt to macroeconomic geopolitical changes worldwide,” noted Daoudi.
On the second day of the forum, the event also featured the participation of the Minister of Industry and Commerce, Ryad Mezzour. His address centered around the Kingdom’s industrial strategy, adding a political and strategic dimension to the forum.
The 24th edition of “Forum Mines Rabat Entreprises,” served as a platform to enhance interaction between students and the job market by building connections with various national and international companies.
This program provided an important opportunity for students from many professions to make a more effective transition into professional life.
It fostered an environment suitable for sharing knowledge and exploring internship and career opportunities, bridging the gap between academia and industry.",https://en.hespress.com/79478-forum-mines-rabat-entreprises-fosters-discussions-on-moroccos-industrial-sovereignty.html,https://e1.hespress.com/wp-content/uploads/2024/02/WhatsApp-Image-2024-02-08-at-20.23.17-scaled-e1707422272796-900x600.jpeg
file_id_mapping.json ADDED
@@ -0,0 +1 @@
{"/home/tajeddine/Desktop/news-scraper/faits-divers_data_ar.csv": "1Bl8U3cDG7dNHD4qmWI6oSwyBHo_6-ZEk", "/home/tajeddine/Desktop/news-scraper/art-et-culture_data_ar.csv": "1a_QbvGU04AQ2jGWTNUJ8140iK6oAtrrk", "C:\\Users\\Lenovo\\Documents\\MDS projects\\news-scraper_updated\\v4\\news-scraper\\economy_data_en.csv": null}
google_drive_handle.py ADDED
@@ -0,0 +1,64 @@
from dotenv import load_dotenv
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from oauth2client.client import OAuth2Credentials
import os

load_dotenv()

CLIENT_ID = os.getenv('CLIENT_ID')
CLIENT_SECRET = os.getenv('CLIENT_SECRET')
REFRESH_TOKEN = os.getenv('REFRESH_TOKEN')
REDIRECT_URI = os.getenv('REDIRECT_URIS', '').split(',')[0]  # Access the first URI

def authenticate_google_drive():
    # Build credentials from the refresh token in .env; no interactive OAuth flow is needed.
    gauth = GoogleAuth()
    gauth.credentials = OAuth2Credentials(None, CLIENT_ID, CLIENT_SECRET, REFRESH_TOKEN, None,
                                          "https://accounts.google.com/o/oauth2/token", None, "web")
    drive = GoogleDrive(gauth)
    return drive

drive = authenticate_google_drive()

def upload_file_to_drive(drive, file_path, folder_id=None):
    if not os.path.exists(file_path):
        print(f"Cannot upload, file does not exist at path: {file_path}")
        return None

    try:
        file_metadata = {'title': os.path.basename(file_path)}
        if folder_id:
            file_metadata['parents'] = [{'id': folder_id}]

        upload_file = drive.CreateFile(file_metadata)

        # Check if the file already exists on Google Drive
        existing_files = drive.ListFile({'q': f"title='{upload_file['title']}'"}).GetList()
        if existing_files:
            # File with the same name already exists, update the existing file
            upload_file = existing_files[0]
            print(f"File already exists on Drive. Updating file with ID: {upload_file['id']}")
        else:
            print("Uploading a new file to Drive.")

        upload_file.SetContentFile(file_path)
        upload_file.Upload()
        print(f"File uploaded successfully. File ID: {upload_file['id']}")
        return upload_file['id']
    except Exception as e:
        print(f"An error occurred during file upload: {e}")
        return None


def get_drive_download_link(drive, file_id):
    try:
        file = drive.CreateFile({'id': file_id})
        file.Upload()  # Make sure the file exists on Drive
        file.InsertPermission({
            'type': 'anyone',
            'value': 'anyone',
            'role': 'reader'})
        return "https://drive.google.com/uc?export=download&id=" + file_id
    except Exception as e:
        print(f"Error fetching download link: {e}")
        return None
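These three functions are the whole Drive surface the app relies on: authenticate once, upload (or update) a CSV by title, then make it world-readable and return a direct download URL. A minimal standalone usage sketch, assuming the .env credentials are valid and the example CSV from this upload sits in the working directory:

    import google_drive_handle as gdrive

    drive = gdrive.authenticate_google_drive()
    file_id = gdrive.upload_file_to_drive(drive, 'economy_data_en.csv')  # example file from this upload
    if file_id:
        print(gdrive.get_drive_download_link(drive, file_id))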
hespress_ar.py ADDED
@@ -0,0 +1,151 @@
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import time
import re
import os
import requests
import csv
from urllib.parse import urljoin
from google_drive_handle import authenticate_google_drive

drive = authenticate_google_drive()


# Set up Chrome WebDriver with options
options = ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Initialize the Chrome WebDriver
wd = webdriver.Chrome(options=options)

def download_image(img_url):
    # Images are not downloaded locally; the URL itself is stored in the CSV.
    return img_url

def scroll_page(expected_article_count):
    # Scroll down until the page stops growing or enough article cards are present.
    scroll_pause_time = 2
    screen_height = wd.execute_script("return window.innerHeight;")
    scrolled_height = 0

    while True:
        scrolled_height += screen_height
        wd.execute_script(f"window.scrollTo(0, {scrolled_height});")
        time.sleep(scroll_pause_time)
        new_height = wd.execute_script("return document.body.scrollHeight")
        if scrolled_height >= new_height:
            break

        soup = BeautifulSoup(wd.page_source, 'html.parser')
        articles = soup.find_all('div', class_='overlay card')
        if len(articles) >= expected_article_count:
            break

def scrape_article_details(article_url):
    try:
        wd.get(article_url)
        WebDriverWait(wd, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'article-content')))
        soup = BeautifulSoup(wd.page_source, 'html.parser')
        content_tag = soup.find('div', class_='article-content')
        content = content_tag.get_text().strip() if content_tag else ""
        date_tag = soup.find('small', class_='text-muted time')
        date = date_tag.get_text().strip() if date_tag else ""
        image_tag = soup.find('img', class_='wp-post-image')
        image_url = image_tag['src'] if image_tag else None
        img_url = download_image(urljoin(article_url, image_url)) if image_url else None
        return content, date, img_url
    except TimeoutException:
        print("Timed out waiting for page elements to load")
        return "", "", None
    except Exception as e:
        print(f"An error occurred while scraping article details: {str(e)}")
        return "", "", None

def sanitize_filename(filename):
    return re.sub(r'[^\w\s-]', '', filename).strip().lower().replace(' ', '_')

def scrape_category(category_url, num_articles):
    print("Attempting to scrape:", category_url)
    articles_data = []
    wd.get(category_url)
    scroll_page(num_articles)

    soup = BeautifulSoup(wd.page_source, 'html.parser')
    articles = soup.find_all('div', class_='overlay card')
    for article in articles[:num_articles]:
        link_tag = article.find('a', class_='stretched-link')
        link = link_tag['href'] if link_tag else ""
        title_tag = article.find('h3', class_='card-title')
        title = title_tag.get_text().strip() if title_tag else ""
        content, date, img_url = scrape_article_details(link)
        article_data = {
            "Title": title,
            "Date": date,
            "Category": category_url.split('/')[-1],
            "Content": content,
            "Link": link,
            "Image": img_url
        }
        print(f"Scraping article: {title}, Link: {link}")
        articles_data.append(article_data)

    # Save scraped data to a CSV file
    category_name = sanitize_filename(category_url.split("/")[-1])
    csv_file_path = os.path.join(os.getcwd(), f'{category_name}_data_ar.csv')
    file_mode = 'a' if os.path.exists(csv_file_path) else 'w'

    try:
        with open(csv_file_path, file_mode, newline='', encoding='utf-8') as file:
            fieldnames = ["Title", "Date", "Category", "Content", "Link", "Image"]
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            if file_mode == 'w':
                writer.writeheader()
            for article in articles_data:
                writer.writerow(article)
        print(f"CSV file saved successfully at {csv_file_path}")
    except IOError as e:
        print(f"Failed to save file at {csv_file_path}: {e}")
        return None  # Return None to indicate failure

    # Check if the file exists before uploading
    if os.path.exists(csv_file_path):
        print(f"File successfully created at {csv_file_path}")
        return csv_file_path
    else:
        print(f"Failed to create file for {category_url}")
        return None
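scrape_category writes (or appends to) <category>_data_ar.csv in the current working directory with the columns Title, Date, Category, Content, Link, Image, and returns the file path. A quick way to inspect a run's output with pandas, which is already a dependency of app.py (the filename below assumes the Economie category was scraped, whose URL slug is economie):

    import pandas as pd

    # Assumed output filename for the Economie category
    df = pd.read_csv('economie_data_ar.csv')
    print(len(df), 'articles')
    print(df[['Title', 'Date', 'Link']].head())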
hespress_en.py ADDED
@@ -0,0 +1,124 @@
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import time
import re
import os
import requests
import csv
from urllib.parse import urljoin
from google_drive_handle import authenticate_google_drive

drive = authenticate_google_drive()


# Set up Chrome WebDriver with options
options = ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

wd = webdriver.Chrome(options=options)


def download_image(img_url):
    return img_url

def scroll_page(expected_article_count):
    scroll_pause_time = 2
    screen_height = wd.execute_script("return window.innerHeight;")
    scrolled_height = 0

    while True:
        scrolled_height += screen_height
        wd.execute_script(f"window.scrollTo(0, {scrolled_height});")
        time.sleep(scroll_pause_time)
        new_height = wd.execute_script("return document.body.scrollHeight")
        if scrolled_height >= new_height:
            break

        soup = BeautifulSoup(wd.page_source, 'html.parser')
        articles = soup.find_all('div', class_='overlay card')
        if len(articles) >= expected_article_count:
            break

def scrape_article_details(article_url):
    try:
        wd.get(article_url)
        WebDriverWait(wd, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'article-content')))
        soup = BeautifulSoup(wd.page_source, 'html.parser')
        content_tag = soup.find('div', class_='article-content')
        content = content_tag.get_text().strip() if content_tag else ""
        date_tag = soup.find('small', class_='text-muted time')
        date = date_tag.get_text().strip() if date_tag else ""
        image_tag = soup.find('img', class_='wp-post-image')
        image_url = image_tag['src'] if image_tag else None
        img_url = download_image(urljoin(article_url, image_url)) if image_url else None
        return content, date, img_url
    except TimeoutException:
        print("Timed out waiting for page elements to load")
        return "", "", None
    except Exception as e:
        print(f"An error occurred while scraping article details: {str(e)}")
        return "", "", None

def sanitize_filename(filename):
    return re.sub(r'[^\w\s-]', '', filename).strip().lower().replace(' ', '_')

def scrape_category(category_url, num_articles):
    print("Attempting to scrape:", category_url)
    articles_data = []
    wd.get(category_url)
    scroll_page(num_articles)

    soup = BeautifulSoup(wd.page_source, 'html.parser')
    articles = soup.find_all('div', class_='overlay card')
    for article in articles[:num_articles]:
        link_tag = article.find('a', class_='stretched-link')
        link = link_tag['href'] if link_tag else ""
        title_tag = article.find('h3', class_='card-title')
        title = title_tag.get_text().strip() if title_tag else ""
        content, date, img_url = scrape_article_details(link)
        article_data = {
            "Title": title,
            "Date": date,
            "Category": category_url.split('/')[-1],
            "Content": content,
            "Link": link,
            "Image": img_url
        }
        print(f"Scraping article: {title}, Link: {link}")
        articles_data.append(article_data)

    # Save scraped data to a CSV file
    category_name = sanitize_filename(category_url.split("/")[-1])
    csv_file_path = os.path.join(os.getcwd(), f'{category_name}_data_en.csv')
    file_mode = 'a' if os.path.exists(csv_file_path) else 'w'

    try:
        with open(csv_file_path, file_mode, newline='', encoding='utf-8') as file:
            fieldnames = ["Title", "Date", "Category", "Content", "Link", "Image"]
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            if file_mode == 'w':
                writer.writeheader()
            for article in articles_data:
                writer.writerow(article)
        print(f"CSV file saved successfully at {csv_file_path}")
    except IOError as e:
        print(f"Failed to save file at {csv_file_path}: {e}")
        return None  # Return None to indicate failure

    # Check if the file exists before uploading
    if os.path.exists(csv_file_path):
        print(f"File successfully created at {csv_file_path}")
        return csv_file_path
    else:
        print(f"Failed to create file for {category_url}")
        return None
hespress_fr.py ADDED
@@ -0,0 +1,151 @@
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import time
import re
import os
import requests
import csv
from urllib.parse import urljoin
from google_drive_handle import authenticate_google_drive

drive = authenticate_google_drive()


# Set up Chrome WebDriver with options
options = ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Initialize the Chrome WebDriver
wd = webdriver.Chrome(options=options)

def download_image(img_url):
    # Images are not downloaded locally; the URL itself is stored in the CSV.
    return img_url

def scroll_page(expected_article_count):
    # Scroll down until the page stops growing or enough article cards are present.
    scroll_pause_time = 2
    screen_height = wd.execute_script("return window.innerHeight;")
    scrolled_height = 0

    while True:
        scrolled_height += screen_height
        wd.execute_script(f"window.scrollTo(0, {scrolled_height});")
        time.sleep(scroll_pause_time)
        new_height = wd.execute_script("return document.body.scrollHeight")
        if scrolled_height >= new_height:
            break

        soup = BeautifulSoup(wd.page_source, 'html.parser')
        articles = soup.find_all('div', class_='overlay card')
        if len(articles) >= expected_article_count:
            break

def scrape_article_details(article_url):
    try:
        wd.get(article_url)
        WebDriverWait(wd, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'article-content')))
        soup = BeautifulSoup(wd.page_source, 'html.parser')
        content_tag = soup.find('div', class_='article-content')
        content = content_tag.get_text().strip() if content_tag else ""
        date_tag = soup.find('small', class_='text-muted time')
        date = date_tag.get_text().strip() if date_tag else ""
        image_tag = soup.find('img', class_='wp-post-image')
        image_url = image_tag['src'] if image_tag else None
        img_url = download_image(urljoin(article_url, image_url)) if image_url else None
        return content, date, img_url
    except TimeoutException:
        print("Timed out waiting for page elements to load")
        return "", "", None
    except Exception as e:
        print(f"An error occurred while scraping article details: {str(e)}")
        return "", "", None

def sanitize_filename(filename):
    return re.sub(r'[^\w\s-]', '', filename).strip().lower().replace(' ', '_')

def scrape_category(category_url, num_articles):
    print("Attempting to scrape:", category_url)
    articles_data = []
    wd.get(category_url)
    scroll_page(num_articles)

    soup = BeautifulSoup(wd.page_source, 'html.parser')
    articles = soup.find_all('div', class_='overlay card')
    for article in articles[:num_articles]:
        link_tag = article.find('a', class_='stretched-link')
        link = link_tag['href'] if link_tag else ""
        title_tag = article.find('h3', class_='card-title')
        title = title_tag.get_text().strip() if title_tag else ""
        content, date, img_url = scrape_article_details(link)
        article_data = {
            "Title": title,
            "Date": date,
            "Category": category_url.split('/')[-1],
            "Content": content,
            "Link": link,
            "Image": img_url
        }
        print(f"Scraping article: {title}, Link: {link}")
        articles_data.append(article_data)

    # Save scraped data to a CSV file
    category_name = sanitize_filename(category_url.split("/")[-1])
    csv_file_path = os.path.join(os.getcwd(), f'{category_name}_data_fr.csv')
    file_mode = 'a' if os.path.exists(csv_file_path) else 'w'

    try:
        with open(csv_file_path, file_mode, newline='', encoding='utf-8') as file:
            fieldnames = ["Title", "Date", "Category", "Content", "Link", "Image"]
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            if file_mode == 'w':
                writer.writeheader()
            for article in articles_data:
                writer.writerow(article)
        print(f"CSV file saved successfully at {csv_file_path}")
    except IOError as e:
        print(f"Failed to save file at {csv_file_path}: {e}")
        return None  # Return None to indicate failure

    # Check if the file exists before uploading
    if os.path.exists(csv_file_path):
        print(f"File successfully created at {csv_file_path}")
        return csv_file_path
    else:
        print(f"Failed to create file for {category_url}")
        return None