|
|
|
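"""Scraper for articles from www.libe.ma.

Collects article links from a category page, extracts each article's title,
description and date, and writes the results to a CSV file (intended to be
loaded elsewhere, e.g. in a Streamlit app, as noted at the bottom of this file).
"""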
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}
|
|
|
def faire_requete(url):
    """
    Perform an HTTP request with error handling.

    Args:
        url (str): the URL to request.

    Returns:
        bytes or None: the response content if the request succeeds, otherwise None.
    """
    try:
        with requests.get(url, headers=headers) as reponse:
            reponse.raise_for_status()
            return reponse.content
    except requests.RequestException as e:
        print(f"HTTP request error: {e}")
        return None
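
# A minimal usage sketch for faire_requete (hedged: the category URL below is the
# example from the commented-out block at the bottom of this file, and printing the
# page <title> is only for illustration):
#
#     contenu = faire_requete("https://www.libe.ma/Economie_r10.html")
#     if contenu is not None:
#         soup = BeautifulSoup(contenu, "html.parser")
#         print(soup.title)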
|
|
|
def extract_articles(category_url, num_articles):
    """Collect up to num_articles article links from a paginated category page, then scrape each article."""
    liens_articles = []
    current_count = 0

    # Walk through the paginated category listing until enough links are collected.
    while current_count < num_articles:
        time.sleep(2)  # be polite to the server between requests
        contenu = faire_requete(category_url + f"?start={current_count}&order=")

        if not contenu:
            break  # stop instead of looping forever on a failed request

        soup = BeautifulSoup(contenu, "html.parser")
        liens = soup.find_all("h3", {"class": "titre_article"})
        if not liens:
            break  # no more articles on this page

        for lien in liens:
            if current_count >= num_articles:
                break
            liens_articles.append("https://www.libe.ma" + lien.a["href"])
            current_count += 1

    # Scrape title, description and date from each collected article page.
    lignes = []
    for lien in liens_articles:
        time.sleep(2)
        contenu = faire_requete(lien)
        if contenu:
            soup = BeautifulSoup(contenu, "html.parser")
            try:
                titre = soup.find("h1", {"class": "access"}).text.replace("\n", "").strip()
            except AttributeError:
                titre = None
            try:
                description = soup.find("div", {"class": "access firstletter"}).text.replace("\n", "").strip()
            except AttributeError:
                description = None
            try:
                date = soup.find("div", {"class": "date"}).text.replace("\n", "").strip()
            except AttributeError:
                date = None
            lignes.append([titre, description, date])

    return lignes
|
|
|
def scrape_category(category_url, num_articles):
    """Scrape a category and write the articles to a CSV file, returning its path."""
    article_data = extract_articles(category_url, num_articles)

    colonnes = ["titre", "content", "date"]
    articles_df = pd.DataFrame(article_data, columns=colonnes)

    csv_file_path = "liberation_art.csv"
    articles_df.to_csv(csv_file_path, index=False)

    return csv_file_path
|
'''
if __name__ == "__main__":
    category_url = "https://www.libe.ma/Economie_r10.html"
    num_articles = 10  # Number of articles to scrape
    csv_file_path = scrape_category(category_url, num_articles)
    # Now, csv_file_path can be used in Streamlit for uploading
'''