Upload 9 files
- .env +9 -0
- app.py +106 -0
- config.json +54 -0
- economy_data_en.csv +23 -0
- file_id_mapping.json +1 -0
- google_drive_handle.py +64 -0
- hespress_ar.py +151 -0
- hespress_en.py +124 -0
- hespress_fr.py +151 -0
.env
ADDED
@@ -0,0 +1,9 @@
CLIENT_ID=340585044850-o53v0f2eam1is8f50jk4v9jvl4fskasi.apps.googleusercontent.com
PROJECT_ID=moroccan-news-file-share
AUTH_URI=https://accounts.google.com/o/oauth2/auth
TOKEN_URI=https://oauth2.googleapis.com/token
AUTH_PROVIDER_X509_CERT_URL=https://www.googleapis.com/oauth2/v1/certs
CLIENT_SECRET=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
REFRESH_TOKEN=1//04FPKbmAI7iv3CgYIARAAGAQSNwF-L9IrFAlZLzJUUf6c3OYtbJhGm_OO9oa3RW4WuNdDWRIZ8QmjdWI083D7HXxmKj8xPe4gA7w
REDIRECT_URIS=https://developers.google.com/oauthplayground,http://localhost:8501,http://localhost:8080
app.py
ADDED
@@ -0,0 +1,106 @@
# web interface

import streamlit as st
import pandas as pd
import json
import importlib
import google_drive_handle as gdrive
from dotenv import load_dotenv
import os

# Load config.json
with open('config.json') as f:
    config = json.load(f)

drive = gdrive.authenticate_google_drive()
processed_files = set()
st.markdown(
    """
    <style>
    .centered {
        display: flex;
        align-items: center;
        justify-content: center;
        text-align: center;
    }
    </style>
    """,
    unsafe_allow_html=True
)

st.markdown("<h1 class='centered'>Moroccan News Aggregator</h1>", unsafe_allow_html=True)

selected_websites = {}
selected_categories = {}
selected_languages = {}

def save_file_id_mapping(file_id_mapping):
    with open("file_id_mapping.json", "w") as file:
        json.dump(file_id_mapping, file)

def load_file_id_mapping():
    try:
        with open("file_id_mapping.json", "r") as file:
            return json.load(file)
    except FileNotFoundError:
        return {}  # Return an empty dictionary if the file doesn't exist

file_id_mapping = load_file_id_mapping()

for website, details in config.items():
    if st.checkbox(website, key=website):
        # Language selection
        languages = details.get("languages", {})
        if languages and len(languages) > 1:
            language = st.selectbox(f'Choose language for {website}', list(languages.keys()), key=f'lang_{website}')
            selected_websites[website] = f"{website}_{language}"  # like: hespress_en
        else:
            language = next(iter(languages), None)  # single available language, if any
            selected_websites[website] = website  # like: akhbarona
        selected_languages[website] = language

        # Category selection
        categories = languages.get(language, {})
        if categories:
            categories = st.multiselect(f'Select categories for {website}', list(categories.keys()), key=f'{website}_categories')
            selected_categories[website] = categories

# Number of articles input
num_articles = st.number_input('Number of Articles', min_value=1, max_value=10000, step=1)

# Start scraping button
if st.button('Start Scraping'):
    with st.spinner('Scraping in progress...'):
        progress_bar = st.progress(0)
        total_tasks = sum(len(categories) for categories in selected_categories.values())
        completed_tasks = 0
        for website, module_name in selected_websites.items():
            scraper_module = importlib.import_module(module_name)
            language = selected_languages[website]
            for category in selected_categories.get(website, []):
                category_url = config[website]['languages'][language][category]

                file_path = scraper_module.scrape_category(category_url, num_articles)

                if file_path:
                    if file_path not in file_id_mapping:
                        file_id = gdrive.upload_file_to_drive(drive, file_path)
                        print(f"Uploading file: {file_path}, File ID: {file_id}")
                        file_id_mapping[file_path] = file_id
                        save_file_id_mapping(file_id_mapping)
                    else:
                        file_id = file_id_mapping[file_path]
                        print(f"File already uploaded. Using existing File ID: {file_id}")

                    if file_id:
                        download_link = gdrive.get_drive_download_link(drive, file_id)
                        if download_link:
                            #st.markdown(f"[Download {website} - {category} data]({download_link})", unsafe_allow_html=True)

                            df = pd.read_csv(file_path)
                            st.write(f"{website} - {category} Data:")
                            st.dataframe(df)
                        else:
                            st.error(f"Failed to retrieve download link for file ID: {file_id}")
                    else:
                        st.error(f"Failed to upload file for {website} - {category}")
                else:
                    st.error(f"File not created for {website} - {category}")

                # Advance the progress bar after each category
                completed_tasks += 1
                if total_tasks:
                    progress_bar.progress(completed_tasks / total_tasks)

        st.success('Scraping Completed!')
config.json
ADDED
@@ -0,0 +1,54 @@
{
    "hespress": {
        "languages": {
            "en": {
                "politics": "https://en.hespress.com/politics",
                "economy": "https://en.hespress.com/economy",
                "society": "https://en.hespress.com/society",
                "culture": "https://en.hespress.com/culture",
                "sports": "https://en.hespress.com/sports",
                "mena": "https://en.hespress.com/mena",
                "international": "https://en.hespress.com/international"
            },
            "ar": {
                "Politique": "https://www.hespress.com/politique",
                "Economie": "https://www.hespress.com/economie",
                "Tamazight": "https://www.hespress.com/tamazight",
                "Sport": "https://www.hespress.com/sport",
                "Société": "https://www.hespress.com/societe",
                "Culture": "https://www.hespress.com/art-et-culture",
                "Médias": "https://www.hespress.com/medias",
                "faits-divers": "https://www.hespress.com/faits-divers",
                "Automoto": "https://www.hespress.com/automoto",
                "Regions": "https://www.hespress.com/regions"
            },
            "fr": {
                "Politique": "https://fr.hespress.com/politique",
                "Economie": "https://fr.hespress.com/economie",
                "Monde": "https://fr.hespress.com/monde",
                "Sport": "https://fr.hespress.com/sport",
                "Société": "https://fr.hespress.com/societe",
                "Culture": "https://fr.hespress.com/culture",
                "Médias": "https://fr.hespress.com/media",
                "High-tech": "https://fr.hespress.com/high-tech",
                "Opinions": "https://fr.hespress.com/opinions",
                "Regions": "https://fr.hespress.com/regions"
            }
        },
        "module": "hespress"
    },
    "akhbarona": {
        "languages": {
            "ar": {},
            "fr": {}
        },
        "module": "akhbarona"
    },
    "le360": {
        "languages": {
            "ar": {},
            "fr": {}
        },
        "module": "le360"
    }
}
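For orientation, here is a minimal sketch of how this config is meant to be consumed, mirroring the lookup in app.py above. The example values ("hespress", "en", "economy") are assumptions chosen for illustration; this snippet is not part of the commit.

# Illustrative sketch: the website key plus the chosen language name a scraper
# module such as hespress_en, and the nested mapping supplies the category URL.
import json

with open("config.json") as f:
    config = json.load(f)

website, language, category = "hespress", "en", "economy"   # example choices (assumed)
module_name = f"{website}_{language}"                        # -> "hespress_en" (hespress_en.py)
category_url = config[website]["languages"][language][category]
print(module_name, category_url)

# app.py then does, roughly (requires Chrome and Drive credentials at import time):
#   scraper = importlib.import_module(module_name)
#   csv_path = scraper.scrape_category(category_url, num_articles)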
economy_data_en.csv
ADDED
@@ -0,0 +1,23 @@
Title,Date,Category,Content,Link,Image
Forum Mines Rabat Entreprises fosters discussions on Morocco's industrial sovereignty,Thursday 8 February 2024 - 10:40,economy,"The 24th edition of “Forum Mines Rabat Entreprises,” hosted by the Forum Committee of Rabat’s National School of Mines on February 6-7 to, serving as an opportunity to discuss the key role of expertise in safeguarding Morocco’s industrial sovereignty.
Held under the theme of “The Kingdom’s New Industrial Strategy: Support for Innovation and Realization of Moroccan Sovereignty,” the forum brought together different experts from the industrial sector to highlight the importance of innovation and explore ways to strengthen Morocco’s industrial sovereignty.
Among them were Ayoub Daoudi, the General Manager of TE Connectivity North Africa, Laurent Figari, the General Manager of Safran Electronics & Defense, and Abderrazak Ben Saga, Head of the Information and Guidance Division and National Coordination of Career Centers.




In this regard, Daoudi highlighted the two trends and essential drivers that shape all technological evolutions: “all green and all connected.”
The notion of “all connected” signifies the rise in the presence of artificial intelligence and electronics, explained the expert, leading to a surge in the adoption of automation.
This connectivity implies that the level of reliability will play an increasingly vital role, he underscored.
This transformative shift holds inevitable implications for both the industrial sector and professionals, namely engineers and technicians of tomorrow.
“For this reason, the demand for a higher level of expertise will continue to grow,” underscored the General Manager of TE Connectivity North Africa.
Speaking of the two essential elements of industrial sovereignty, Daoudi highlighted that “while capital holds its significance, it is the expertise and know-how that play a truly essential role in ensuring industrial sovereignty.”
They are key elements for building and securing industrial sovereignty from one generation to the next, he stressed.
Daoudi emphasized the vital need to align our educational system with technological skills in today’s environment. He believes that expertise and technical proficiency are interdependent and inseparable.
“We no longer need engineers who know everything but master nothing. Instead, a combination of both profiles is necessary,” he stressed.
The General Manager of TE Connectivity North Africa also acknowledged the fact that Morocco places a high priority on developing knowledge and expertise in all sectors that are pivotal for achieving industrial sovereignty.
“This technological aspect will enable us to build our ecosystem and adapt to macroeconomic geopolitical changes worldwide,” noted Daoudi.
On the second day of the forum, the event also featured the participation of the Minister of Industry and Commerce, Ryad Mezzour. His address centered around the Kingdom’s industrial strategy, adding a political and strategic dimension to the forum.
The 24th edition of “Forum Mines Rabat Entreprises,” served as a platform to enhance interaction between students and the job market by building connections with various national and international companies.
This program provided an important opportunity for students from many professions to make a more effective transition into professional life.
It fostered an environment suitable for sharing knowledge and exploring internship and career opportunities, bridging the gap between academia and industry.",https://en.hespress.com/79478-forum-mines-rabat-entreprises-fosters-discussions-on-moroccos-industrial-sovereignty.html,https://e1.hespress.com/wp-content/uploads/2024/02/WhatsApp-Image-2024-02-08-at-20.23.17-scaled-e1707422272796-900x600.jpeg
file_id_mapping.json
ADDED
@@ -0,0 +1 @@
{"/home/tajeddine/Desktop/news-scraper/faits-divers_data_ar.csv": "1Bl8U3cDG7dNHD4qmWI6oSwyBHo_6-ZEk", "/home/tajeddine/Desktop/news-scraper/art-et-culture_data_ar.csv": "1a_QbvGU04AQ2jGWTNUJ8140iK6oAtrrk", "C:\\Users\\Lenovo\\Documents\\MDS projects\\news-scraper_updated\\v4\\news-scraper\\economy_data_en.csv": null}
google_drive_handle.py
ADDED
@@ -0,0 +1,64 @@
from dotenv import load_dotenv
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from oauth2client.client import OAuth2Credentials
import os

load_dotenv()

CLIENT_ID = os.getenv('CLIENT_ID')
CLIENT_SECRET = os.getenv('CLIENT_SECRET')
REFRESH_TOKEN = os.getenv('REFRESH_TOKEN')
REDIRECT_URI = os.getenv('REDIRECT_URIS').split(',')[0]  # Access the first URI

def authenticate_google_drive():
    gauth = GoogleAuth()
    gauth.credentials = OAuth2Credentials(None, CLIENT_ID, CLIENT_SECRET, REFRESH_TOKEN, None,
                                          "https://accounts.google.com/o/oauth2/token", None, "web")
    drive = GoogleDrive(gauth)
    return drive

drive = authenticate_google_drive()

def upload_file_to_drive(drive, file_path, folder_id=None):
    if not os.path.exists(file_path):
        print(f"Cannot upload, file does not exist at path: {file_path}")
        return None

    try:
        file_metadata = {'title': os.path.basename(file_path)}
        if folder_id:
            file_metadata['parents'] = [{'id': folder_id}]

        upload_file = drive.CreateFile(file_metadata)

        # Check if the file already exists on Google Drive
        existing_files = drive.ListFile({'q': f"title='{upload_file['title']}'"}).GetList()
        if existing_files:
            # File with the same name already exists, update the existing file
            upload_file = existing_files[0]
            print(f"File already exists on Drive. Updating file with ID: {upload_file['id']}")
        else:
            print("Uploading a new file to Drive.")

        upload_file.SetContentFile(file_path)
        upload_file.Upload()
        print(f"File uploaded successfully. File ID: {upload_file['id']}")
        return upload_file['id']
    except Exception as e:
        print(f"An error occurred during file upload: {e}")
        return None


def get_drive_download_link(drive, file_id):
    try:
        file = drive.CreateFile({'id': file_id})
        file.Upload()  # Make sure the file exists on Drive
        file.InsertPermission({
            'type': 'anyone',
            'value': 'anyone',
            'role': 'reader'})
        return "https://drive.google.com/uc?export=download&id=" + file_id
    except Exception as e:
        print(f"Error fetching download link: {e}")
        return None
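A rough standalone check of the two helpers above could look like the following. The CSV filename and the presence of a valid .env are assumptions for illustration; this snippet is not part of the commit.

# Hypothetical smoke test for google_drive_handle.py, run outside the Streamlit app.
import google_drive_handle as gdrive

drive = gdrive.authenticate_google_drive()
file_id = gdrive.upload_file_to_drive(drive, "economy_data_en.csv")  # assumes the CSV sits next to the script
if file_id:
    print(gdrive.get_drive_download_link(drive, file_id))
else:
    print("Upload failed")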
hespress_ar.py
ADDED
@@ -0,0 +1,151 @@
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import time
import re
import os
import requests
import csv
from urllib.parse import urljoin
from google_drive_handle import authenticate_google_drive

drive = authenticate_google_drive()


# Set up Chrome WebDriver with options
options = ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Initialize the Chrome WebDriver
wd = webdriver.Chrome(options=options)

def download_image(img_url):
    return img_url

def scroll_page(expected_article_count):
    scroll_pause_time = 2
    screen_height = wd.execute_script("return window.innerHeight;")
    scrolled_height = 0

    while True:
        scrolled_height += screen_height
        wd.execute_script(f"window.scrollTo(0, {scrolled_height});")
        time.sleep(scroll_pause_time)
        new_height = wd.execute_script("return document.body.scrollHeight")
        if scrolled_height >= new_height:
            break

        soup = BeautifulSoup(wd.page_source, 'html.parser')
        articles = soup.find_all('div', class_='overlay card')
        if len(articles) >= expected_article_count:
            break

def scrape_article_details(article_url):
    try:
        wd.get(article_url)
        WebDriverWait(wd, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'article-content')))
        soup = BeautifulSoup(wd.page_source, 'html.parser')
        content_tag = soup.find('div', class_='article-content')
        content = content_tag.get_text().strip() if content_tag else ""
        date_tag = soup.find('small', class_='text-muted time')
        date = date_tag.get_text().strip() if date_tag else ""
        image_tag = soup.find('img', class_='wp-post-image')
        image_url = image_tag['src'] if image_tag else None
        img_url = download_image(urljoin(article_url, image_url)) if image_url else None
        return content, date, img_url
    except TimeoutException:
        print("Timed out waiting for page elements to load")
        return "", "", None
    except Exception as e:
        print(f"An error occurred while scraping article details: {str(e)}")
        return "", "", None

def sanitize_filename(filename):
    return re.sub(r'[^\w\s-]', '', filename).strip().lower().replace(' ', '_')

def scrape_category(category_url, num_articles):
    print("Attempting to scrape:", category_url)
    articles_data = []
    wd.get(category_url)
    scroll_page(num_articles)

    soup = BeautifulSoup(wd.page_source, 'html.parser')
    articles = soup.find_all('div', class_='overlay card')
    for article in articles[:num_articles]:
        link_tag = article.find('a', class_='stretched-link')
        link = link_tag['href'] if link_tag else ""
        title_tag = article.find('h3', class_='card-title')
        title = title_tag.get_text().strip() if title_tag else ""
        content, date, img_url = scrape_article_details(link)
        article_data = {
            "Title": title,
            "Date": date,
            "Category": category_url.split('/')[-1],
            "Content": content,
            "Link": link,
            "Image": img_url
        }
        print(f"Scraping article: {title}, Link: {link}")
        articles_data.append(article_data)

    # Save scraped data to a CSV file
    category_name = sanitize_filename(category_url.split("/")[-1])
    csv_file_path = os.path.join(os.getcwd(), f'{category_name}_data_ar.csv')
    file_mode = 'a' if os.path.exists(csv_file_path) else 'w'

    try:
        with open(csv_file_path, file_mode, newline='', encoding='utf-8') as file:
            fieldnames = ["Title", "Date", "Category", "Content", "Link", "Image"]
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            if file_mode == 'w':
                writer.writeheader()
            for article in articles_data:
                writer.writerow(article)
        print(f"CSV file saved successfully at {csv_file_path}")
    except IOError as e:
        print(f"Failed to save file at {csv_file_path}: {e}")
        return None  # Return None to indicate failure

    # Check if the file exists before uploading
    if os.path.exists(csv_file_path):
        print(f"File successfully created at {csv_file_path}")
        return csv_file_path
    else:
        print(f"Failed to create file for {category_url}")
        return None
hespress_en.py
ADDED
@@ -0,0 +1,124 @@
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import time
import re
import os
import requests
import csv
from urllib.parse import urljoin
from google_drive_handle import authenticate_google_drive

drive = authenticate_google_drive()


# Set up Chrome WebDriver with options
options = ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

wd = webdriver.Chrome(options=options)


def download_image(img_url):
    return img_url

def scroll_page(expected_article_count):
    scroll_pause_time = 2
    screen_height = wd.execute_script("return window.innerHeight;")
    scrolled_height = 0

    while True:
        scrolled_height += screen_height
        wd.execute_script(f"window.scrollTo(0, {scrolled_height});")
        time.sleep(scroll_pause_time)
        new_height = wd.execute_script("return document.body.scrollHeight")
        if scrolled_height >= new_height:
            break

        soup = BeautifulSoup(wd.page_source, 'html.parser')
        articles = soup.find_all('div', class_='overlay card')
        if len(articles) >= expected_article_count:
            break

def scrape_article_details(article_url):
    try:
        wd.get(article_url)
        WebDriverWait(wd, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'article-content')))
        soup = BeautifulSoup(wd.page_source, 'html.parser')
        content_tag = soup.find('div', class_='article-content')
        content = content_tag.get_text().strip() if content_tag else ""
        date_tag = soup.find('small', class_='text-muted time')
        date = date_tag.get_text().strip() if date_tag else ""
        image_tag = soup.find('img', class_='wp-post-image')
        image_url = image_tag['src'] if image_tag else None
        img_url = download_image(urljoin(article_url, image_url)) if image_url else None
        return content, date, img_url
    except TimeoutException:
        print("Timed out waiting for page elements to load")
        return "", "", None
    except Exception as e:
        print(f"An error occurred while scraping article details: {str(e)}")
        return "", "", None

def sanitize_filename(filename):
    return re.sub(r'[^\w\s-]', '', filename).strip().lower().replace(' ', '_')

def scrape_category(category_url, num_articles):
    print("Attempting to scrape:", category_url)
    articles_data = []
    wd.get(category_url)
    scroll_page(num_articles)

    soup = BeautifulSoup(wd.page_source, 'html.parser')
    articles = soup.find_all('div', class_='overlay card')
    for article in articles[:num_articles]:
        link_tag = article.find('a', class_='stretched-link')
        link = link_tag['href'] if link_tag else ""
        title_tag = article.find('h3', class_='card-title')
        title = title_tag.get_text().strip() if title_tag else ""
        content, date, img_url = scrape_article_details(link)
        article_data = {
            "Title": title,
            "Date": date,
            "Category": category_url.split('/')[-1],
            "Content": content,
            "Link": link,
            "Image": img_url
        }
        print(f"Scraping article: {title}, Link: {link}")
        articles_data.append(article_data)

    # Save scraped data to a CSV file
    category_name = sanitize_filename(category_url.split("/")[-1])
    csv_file_path = os.path.join(os.getcwd(), f'{category_name}_data_en.csv')
    file_mode = 'a' if os.path.exists(csv_file_path) else 'w'

    try:
        with open(csv_file_path, file_mode, newline='', encoding='utf-8') as file:
            fieldnames = ["Title", "Date", "Category", "Content", "Link", "Image"]
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            if file_mode == 'w':
                writer.writeheader()
            for article in articles_data:
                writer.writerow(article)
        print(f"CSV file saved successfully at {csv_file_path}")
    except IOError as e:
        print(f"Failed to save file at {csv_file_path}: {e}")
        return None  # Return None to indicate failure

    # Check if the file exists before uploading
    if os.path.exists(csv_file_path):
        print(f"File successfully created at {csv_file_path}")
        return csv_file_path
    else:
        print(f"Failed to create file for {category_url}")
        return None
hespress_fr.py
ADDED
@@ -0,0 +1,151 @@
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import time
import re
import os
import requests
import csv
from urllib.parse import urljoin
from google_drive_handle import authenticate_google_drive

drive = authenticate_google_drive()


# Set up Chrome WebDriver with options
options = ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Initialize the Chrome WebDriver
wd = webdriver.Chrome(options=options)

def download_image(img_url):
    return img_url

def scroll_page(expected_article_count):
    scroll_pause_time = 2
    screen_height = wd.execute_script("return window.innerHeight;")
    scrolled_height = 0

    while True:
        scrolled_height += screen_height
        wd.execute_script(f"window.scrollTo(0, {scrolled_height});")
        time.sleep(scroll_pause_time)
        new_height = wd.execute_script("return document.body.scrollHeight")
        if scrolled_height >= new_height:
            break

        soup = BeautifulSoup(wd.page_source, 'html.parser')
        articles = soup.find_all('div', class_='overlay card')
        if len(articles) >= expected_article_count:
            break

def scrape_article_details(article_url):
    try:
        wd.get(article_url)
        WebDriverWait(wd, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'article-content')))
        soup = BeautifulSoup(wd.page_source, 'html.parser')
        content_tag = soup.find('div', class_='article-content')
        content = content_tag.get_text().strip() if content_tag else ""
        date_tag = soup.find('small', class_='text-muted time')
        date = date_tag.get_text().strip() if date_tag else ""
        image_tag = soup.find('img', class_='wp-post-image')
        image_url = image_tag['src'] if image_tag else None
        img_url = download_image(urljoin(article_url, image_url)) if image_url else None
        return content, date, img_url
    except TimeoutException:
        print("Timed out waiting for page elements to load")
        return "", "", None
    except Exception as e:
        print(f"An error occurred while scraping article details: {str(e)}")
        return "", "", None

def sanitize_filename(filename):
    return re.sub(r'[^\w\s-]', '', filename).strip().lower().replace(' ', '_')

def scrape_category(category_url, num_articles):
    print("Attempting to scrape:", category_url)
    articles_data = []
    wd.get(category_url)
    scroll_page(num_articles)

    soup = BeautifulSoup(wd.page_source, 'html.parser')
    articles = soup.find_all('div', class_='overlay card')
    for article in articles[:num_articles]:
        link_tag = article.find('a', class_='stretched-link')
        link = link_tag['href'] if link_tag else ""
        title_tag = article.find('h3', class_='card-title')
        title = title_tag.get_text().strip() if title_tag else ""
        content, date, img_url = scrape_article_details(link)
        article_data = {
            "Title": title,
            "Date": date,
            "Category": category_url.split('/')[-1],
            "Content": content,
            "Link": link,
            "Image": img_url
        }
        print(f"Scraping article: {title}, Link: {link}")
        articles_data.append(article_data)

    # Save scraped data to a CSV file
    category_name = sanitize_filename(category_url.split("/")[-1])
    csv_file_path = os.path.join(os.getcwd(), f'{category_name}_data_fr.csv')
    file_mode = 'a' if os.path.exists(csv_file_path) else 'w'

    try:
        with open(csv_file_path, file_mode, newline='', encoding='utf-8') as file:
            fieldnames = ["Title", "Date", "Category", "Content", "Link", "Image"]
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            if file_mode == 'w':
                writer.writeheader()
            for article in articles_data:
                writer.writerow(article)
        print(f"CSV file saved successfully at {csv_file_path}")
    except IOError as e:
        print(f"Failed to save file at {csv_file_path}: {e}")
        return None  # Return None to indicate failure

    # Check if the file exists before uploading
    if os.path.exists(csv_file_path):
        print(f"File successfully created at {csv_file_path}")
        return csv_file_path
    else:
        print(f"Failed to create file for {category_url}")
        return None