Ribot committed on
Commit
6ca2249
·
verified ·
1 Parent(s): eeae908

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -68
app.py CHANGED
@@ -3,82 +3,70 @@ import requests
3
  from bs4 import BeautifulSoup
4
  import re
5
  import os
6
- from urllib.parse import urlparse
7
- from zipfile import ZipFile
 
8
 
9
def extract_podcast_episodes(url):
    """Scrape a podcast series page and list its episodes that expose an MP3 link.

    An ``<article>`` is kept only when its ``<h3>`` title shares at least one
    word with the page's ``<h1>`` heading and the article contains an anchor
    whose ``href`` ends in ``.mp3``.

    Args:
        url: Address of the podcast series page.

    Returns:
        A list of ``{"title": ..., "url": ...}`` dicts in page order. The list
        is empty when the page has no ``<h1>`` heading, because the title
        filter cannot be applied without it.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        requests.RequestException: On connection failure or timeout.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    # Without a timeout, requests can block forever on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # Main podcast title, used to filter the episode titles.
    heading = soup.find("h1")
    if heading is None:
        return []
    title_words = heading.get_text(strip=True).lower().split()

    # Episodes that are explicitly listed on the page.
    episodes = []
    for article in soup.select("article"):
        title_tag = article.find("h3")
        if not title_tag:
            continue
        title = title_tag.get_text(strip=True)
        lowered_title = title.lower()
        if not any(word in lowered_title for word in title_words):
            continue

        # Take the first direct MP3 link found inside the article, if any.
        mp3_url = next(
            (
                a["href"]
                for a in article.find_all("a", href=True)
                if a["href"].endswith(".mp3")
            ),
            None,
        )
        if mp3_url:
            episodes.append({"title": title, "url": mp3_url})

    return episodes
40
 
41
def download_and_zip_episodes(url):
    """Download every episode found on a podcast page and bundle them in a ZIP.

    Each MP3 is streamed to a temporary file in ``downloads/``, added to the
    archive, then deleted. A failed download is logged and skipped so one bad
    link does not abort the whole batch.

    Args:
        url: Address of the podcast series page.

    Returns:
        A ``(status_message, zip_path)`` tuple. ``zip_path`` is ``None`` when
        no episode was found or none could be downloaded.
    """
    episodes = extract_podcast_episodes(url)

    if not episodes:
        return "Aucun épisode valide trouvé", None

    os.makedirs("downloads", exist_ok=True)
    zip_filename = "episodes_radiofrance.zip"
    zip_path = os.path.join("downloads", zip_filename)

    downloaded = 0
    with ZipFile(zip_path, "w") as zipf:
        for i, episode in enumerate(episodes, start=1):
            mp3_url = episode["url"]
            title = episode["title"]
            ext = os.path.splitext(urlparse(mp3_url).path)[1]
            filename = f"{i:02d} - {title}{ext}".replace("/", "_")
            local_path = os.path.join("downloads", filename)

            try:
                # The context manager releases the connection even on failure.
                with requests.get(mp3_url, stream=True, timeout=60) as mp3_response:
                    mp3_response.raise_for_status()
                    with open(local_path, "wb") as f:
                        for chunk in mp3_response.iter_content(64 * 1024):
                            f.write(chunk)
                zipf.write(local_path, arcname=filename)
                downloaded += 1
            except Exception as e:
                # Best effort: report the failure and move on to the next episode.
                print(f"Erreur lors du téléchargement de {mp3_url}: {e}")
            finally:
                # Also removes the partial file left behind by a failed transfer.
                if os.path.exists(local_path):
                    os.remove(local_path)

    if not downloaded:
        # Do not hand an empty archive back to the user.
        os.remove(zip_path)
        return "Aucun épisode n'a pu être téléchargé", None

    # Report what was actually archived, not what was merely listed.
    return f"{downloaded} épisode(s) téléchargé(s)", zip_path
 
 
 
 
 
 
71
 
72
# Gradio front end: a single URL textbox feeds download_and_zip_episodes, whose
# two return values fill a status text and a downloadable ZIP file.
iface = gr.Interface(
    title="Téléchargeur de Podcast France Culture",
    description="Saisissez l’URL d’une série sur France Culture (ex: https://www.radiofrance.fr/franceculture/podcasts/...) pour télécharger uniquement les bons épisodes listés sur la page.",
    fn=download_and_zip_episodes,
    inputs=gr.Textbox(label="URL de la page podcast de France Culture"),
    outputs=[gr.Text(label="Résultat"), gr.File(label="Fichier ZIP")],
)

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()
 
3
  from bs4 import BeautifulSoup
4
  import re
5
  import os
6
+ import zipfile
7
+ from urllib.parse import urljoin
8
+ from pathlib import Path
9
 
10
def extract_mp3_links(url):
    """Collect the unique Radio France MP3 links found on a podcast page.

    Every ``<a href>`` on the page is resolved to an absolute URL first and
    only then filtered, so relative links are matched against the same rule as
    absolute ones.

    Args:
        url: Address of the podcast page to scan.

    Returns:
        A list of ``(mp3_url, title)`` tuples in page order, without duplicate
        URLs. ``title`` is the anchor's ``title`` attribute, else its text,
        else ``"episode"``. The list is empty when the page request returns an
        error status.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    # Without a timeout, requests can block forever on an unresponsive server.
    response = requests.get(url, timeout=30)
    if not response.ok:
        # An error page is not a podcast page; do not scrape it.
        return []
    soup = BeautifulSoup(response.content, "html.parser")

    seen = set()
    links = []

    for a in soup.find_all("a", href=True):
        href = a["href"]
        full_url = href if href.startswith("http") else urljoin(url, href)
        # Filter on the resolved URL: a relative href carries no host name.
        if not full_url.endswith(".mp3") or "radiofrance" not in full_url:
            continue
        if full_url in seen:
            continue
        seen.add(full_url)
        title = a.get("title") or a.text.strip() or "episode"
        links.append((full_url, title))

    return links
30
 
31
def download_episodes(podcast_url):
    """Download the MP3 episodes linked from a podcast page and zip them.

    Each episode is streamed to ``downloads/`` in chunks, so a whole file is
    never held in memory. A failed download is logged and skipped. Once a file
    has been added to the archive it is deleted from ``downloads/``.

    Args:
        podcast_url: Address of the podcast series page.

    Returns:
        A ``(zip_path, status_message)`` tuple. ``zip_path`` is ``None`` when
        no link was found or no file could be downloaded.
    """
    os.makedirs("downloads", exist_ok=True)
    mp3_links = extract_mp3_links(podcast_url)

    if not mp3_links:
        return None, "Aucun épisode valide trouvé."

    valid_episodes = []
    for idx, (mp3_url, title) in enumerate(mp3_links, 1):
        # Keep only filesystem-safe characters and cap the name length.
        safe_title = re.sub(r'[^\w\d-]', '_', title)[:80]
        filename = f"{idx:02d}_{safe_title}.mp3"
        filepath = os.path.join("downloads", filename)
        try:
            # The context manager releases the connection even on failure.
            with requests.get(mp3_url, stream=True, timeout=60) as response:
                # Turn HTTP errors into exceptions so they are logged below.
                response.raise_for_status()
                with open(filepath, "wb") as f:
                    for chunk in response.iter_content(chunk_size=64 * 1024):
                        f.write(chunk)
        except Exception as e:
            # Best effort: report the failure and move on to the next episode.
            print(f"Erreur avec {mp3_url} : {e}")
            # Drop the partial file a broken transfer may have left behind.
            if os.path.exists(filepath):
                os.remove(filepath)
            continue
        valid_episodes.append(filepath)

    if not valid_episodes:
        return None, "Aucun fichier mp3 téléchargé."

    zip_path = "/tmp/episodes_radiofrance.zip"
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for file in valid_episodes:
            zipf.write(file, arcname=os.path.basename(file))
            # The archive now holds the data; free the disk space.
            os.remove(file)

    return zip_path, f"{len(valid_episodes)} épisode(s) téléchargé(s) avec succès."
 
 
 
 
 
 
 
 
 
 
61
 
62
# Gradio UI: a URL field and a button trigger download_episodes, whose two
# return values fill the ZIP file widget and the status textbox.
# NOTE(review): the source lost its indentation; the output widgets are assumed
# to sit below the input row rather than inside it - confirm against app.py.
with gr.Blocks() as app:
    gr.Markdown("# 🎧 Téléchargeur de Podcasts Radio France")

    with gr.Row():
        url_input = gr.Text(label="URL de la série du podcast")
        launch_btn = gr.Button("Télécharger les épisodes")

    output_file = gr.File(label="Fichier ZIP")
    output_message = gr.Textbox(label="Statut")

    # Output order matches the (zip_path, message) tuple the callback returns.
    launch_btn.click(
        fn=download_episodes,
        inputs=url_input,
        outputs=[output_file, output_message],
    )

app.launch()