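"""Scraper for the Moroccan daily Libération (https://www.libe.ma).

Collects article links from a paginated category page, extracts each
article's title, description, and date, and writes the result to a CSV
file for use in the news aggregator.
"""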
# Import required libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
# Headers for simulating a browser request
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}
def faire_requete(url):
    """
    Perform an HTTP GET request with error handling.

    Args:
        url (str): URL to request.

    Returns:
        bytes or None: The response body if the request succeeds, otherwise None.
    """
    try:
        with requests.get(url, headers=headers, timeout=10) as reponse:
            reponse.raise_for_status()
            return reponse.content
    except requests.RequestException as e:
        print(f"HTTP request error: {e}")
        return None
def extract_articles(category_url, num_articles):
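    """
    Collect up to num_articles article links from the paginated category
    page at category_url, then fetch each article and extract its fields.

    Returns:
        list[list]: [titre, description, date] rows, with None for any
        field that could not be found on the page.
    """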
    liens_articles = []
    current_count = 0
    # Walk the category pagination until enough article links are collected
    while current_count < num_articles:
        time.sleep(2)  # throttle requests to be polite to the server
        contenu = faire_requete(category_url + f"?start={current_count}&order=")
        if not contenu:
            break  # request failed; stop rather than loop forever
        soup = BeautifulSoup(contenu, "html.parser")
        liens = soup.find_all("h3", {"class": "titre_article"})
        if not liens:
            break  # no more results on this page
        for lien in liens:
            if current_count >= num_articles:
                break
            liens_articles.append("https://www.libe.ma" + lien.a["href"])
            current_count += 1
lignes = []
for lien in liens_articles:
time.sleep(2)
contenu = faire_requete(lien)
if contenu:
soup = BeautifulSoup(contenu, "html.parser")
try:
titre = soup.find("h1", {"class":"access"}).text.replace("\n", "").strip()
            except AttributeError:  # element not found on the page
titre = None
try:
description = soup.find("div", {"class":"access firstletter"}).text.replace("\n", "").strip()
            except AttributeError:
description = None
try:
date = soup.find("div", {"class":"date"}).text.replace("\n", "").strip()
            except AttributeError:
date = None
lignes.append([titre, description, date])
return lignes
def scrape_category(category_url, num_articles):
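    """
    Scrape num_articles articles from category_url and write them to a CSV.

    Returns:
        str: Path of the CSV file ("liberation_art.csv").
    """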
article_data = extract_articles(category_url, num_articles)
colonnes = ["titre", "content", "date"]
articles_df = pd.DataFrame(article_data, columns=colonnes)
csv_file_path = "liberation_art.csv"
articles_df.to_csv(csv_file_path, index=False)
return csv_file_path
# Example usage (disabled):
# if __name__ == "__main__":
#     category_url = "https://www.libe.ma/Economie_r10.html"
#     num_articles = 10  # Number of articles to scrape
#     csv_file_path = scrape_category(category_url, num_articles)
#     # csv_file_path can then be used in Streamlit for uploading