Ribot committed on
Commit
eeae908
·
verified ·
1 Parent(s): 60821e9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -70
app.py CHANGED
@@ -1,93 +1,84 @@
 
1
  import requests
 
2
  import re
3
  import os
4
- import zipfile
5
- import tempfile
6
- import gradio as gr
7
- from pathlib import Path
8
- from bs4 import BeautifulSoup
9
 
10
def sanitize_filename(name):
    """Turn an episode title into a string that is safe to use as a file name.

    Characters reserved by common file systems (``\\ / * ? : " < > |``) are
    removed, surrounding whitespace is trimmed, every remaining whitespace
    character becomes an underscore, leftover control characters are dropped
    and the result is capped at 100 characters.

    Args:
        name: Raw title text.

    Returns:
        The sanitized name; may be empty if ``name`` held nothing usable.
    """
    cleaned = re.sub(r'[\\/*?:"<>|]', "", name).strip()
    # ``\s`` rather than a literal space: titles scraped from HTML can contain
    # tabs, newlines or non-breaking spaces, which must not end up in a path.
    cleaned = re.sub(r"\s", "_", cleaned)
    # Anything still in the control range (e.g. NUL) is never valid in a name.
    cleaned = re.sub(r"[\x00-\x1f\x7f]", "", cleaned)
    return cleaned[:100]
 
 
12
 
13
def extract_episode_links(html_text, base_url):
    """Collect the MP3 episodes exposed through ``<audio>`` tags of a page.

    Args:
        html_text: Raw HTML of the podcast page.
        base_url: URL the page was fetched from; relative ``src`` values are
            resolved against it.

    Returns:
        A list of ``(title, url)`` tuples in document order, one per distinct
        absolute MP3 URL. The title falls back to ``"episode"`` when the tags
        carry no label.
    """
    # Function-scope import so the block does not depend on the file header.
    from urllib.parse import urljoin, urlparse

    soup = BeautifulSoup(html_text, "html.parser")
    episodes = []
    seen_urls = set()

    # Inspect every audio player found in the page.
    for audio_tag in soup.find_all("audio"):
        source = audio_tag.find("source")
        if source is None:
            continue

        src = source.get("src", "")
        # Test the URL *path*, not the whole string, so links that carry a
        # query string or fragment ("episode.mp3?token=...") are still kept.
        if not urlparse(src).path.lower().endswith(".mp3"):
            continue

        title = (
            audio_tag.get("aria-label")
            or audio_tag.get("title")
            or source.get("title")
            or "episode"
        )

        # urljoin leaves absolute URLs untouched and resolves relative and
        # protocol-relative ones, so no scheme check is needed beforehand.
        url = urljoin(base_url, src)

        if url not in seen_urls:
            seen_urls.add(url)
            episodes.append((title, url))

    return episodes
37
 
38
def download_podcast_series(url):
    """Download every episode found at ``url`` and bundle them in a ZIP file.

    Args:
        url: Address of the podcast series page.

    Returns:
        A ``(status_message, zip_path)`` tuple. ``zip_path`` is ``None`` when
        the page could not be loaded, no episode was found, or no episode
        could be downloaded.
    """
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
    except Exception as e:
        return f"Erreur lors du chargement de la page : {e}", None

    episodes = extract_episode_links(r.text, url)
    if not episodes:
        return "Aucun épisode audio trouvé sur la page.", None

    # mkdtemp instead of TemporaryDirectory: the archive path is handed back
    # to the caller, so the directory must outlive this function. A
    # TemporaryDirectory would be deleted on exit, returning a dead path.
    out_dir = tempfile.mkdtemp(prefix="podcast_")
    zip_path = os.path.join(out_dir, "podcast.zip")
    used_filenames = set()
    downloaded = 0

    with zipfile.ZipFile(zip_path, "w") as zipf:
        for idx, (title, mp3_url) in enumerate(episodes, 1):
            base_name = f"{idx:02d}-" + sanitize_filename(title)
            filename = base_name + ".mp3"

            # Avoid duplicate names inside the archive.
            count = 1
            while filename in used_filenames:
                filename = f"{base_name}_{count}.mp3"
                count += 1
            used_filenames.add(filename)

            temp_mp3_path = os.path.join(out_dir, filename)
            try:
                # The context manager releases the streamed connection even
                # when the transfer fails part-way through.
                with requests.get(mp3_url, stream=True, timeout=15) as audio:
                    audio.raise_for_status()
                    with open(temp_mp3_path, "wb") as f:
                        for chunk in audio.iter_content(8192):
                            f.write(chunk)
                # Only complete downloads are added to the archive.
                zipf.write(temp_mp3_path, arcname=filename)
                downloaded += 1
            except Exception as e:
                # Best effort: one broken episode must not abort the series.
                print(f"Erreur lors du téléchargement de {mp3_url} : {e}")
            finally:
                # The MP3 is only a staging copy; drop it whether or not the
                # download succeeded so partial files do not pile up.
                if os.path.exists(temp_mp3_path):
                    os.remove(temp_mp3_path)

    if not downloaded:
        # Do not present an empty archive as a successful download.
        os.remove(zip_path)
        os.rmdir(out_dir)
        return "Aucun épisode n'a pu être téléchargé.", None

    return "Téléchargement terminé !", zip_path
79
 
80
# Gradio front end: one text box for the series URL, and two outputs that
# receive the (status message, ZIP path) pair returned by
# download_podcast_series.
interface = gr.Interface(
    fn=download_podcast_series,
    inputs=gr.Textbox(
        label="URL du podcast radio (ex: France Culture)",
        placeholder=(
            "https://www.radiofrance.fr/franceculture/podcasts/"
            "serie-le-capitaine-fracasse-de-theophile-gautier"
        ),
    ),
    outputs=[
        gr.Textbox(label="Statut"),
        gr.File(label="Fichier ZIP des épisodes"),
    ],
    title="Téléchargeur de Podcast Radio (.mp3)",
    description=(
        "Collez un lien vers une série de podcast Radio France "
        "(ex: France Culture). "
        "Seuls les fichiers .mp3 correspondant aux épisodes seront extraits "
        "et regroupés dans un fichier ZIP téléchargeable."
    ),
    allow_flagging="never",
)

# Start the web UI only when the file is executed as a script.
if __name__ == "__main__":
    interface.launch()
 
1
+ import gradio as gr
2
  import requests
3
+ from bs4 import BeautifulSoup
4
  import re
5
  import os
6
+ from urllib.parse import urlparse
7
+ from zipfile import ZipFile
 
 
 
8
 
9
def extract_podcast_episodes(url):
    """Scrape a podcast series page and list the episodes that link to an MP3.

    An episode is an ``<article>`` that has an ``<h3>`` title sharing at least
    one word with the page's ``<h1>`` heading and containing an ``<a>`` whose
    target is an MP3 file. When the page has no usable ``<h1>``, the title
    filter is skipped instead of failing.

    Args:
        url: Address of the podcast series page.

    Returns:
        A list of ``{"title": str, "url": str}`` dicts in document order, with
        absolute URLs.

    Raises:
        requests.RequestException: If the page cannot be fetched (network
            error, timeout or HTTP error status).
    """
    # Function-scope import so the block does not depend on the file header.
    from urllib.parse import urljoin, urlparse

    headers = {"User-Agent": "Mozilla/5.0"}
    # Without a timeout a stalled server would hang the request forever.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # Words of the series title, used to filter episode titles. A missing or
    # empty <h1> yields no words, which disables the filter below.
    heading = soup.find("h1")
    title_words = heading.get_text(strip=True).lower().split() if heading else []

    episodes = []
    for article in soup.select("article"):
        title_tag = article.find("h3")
        if not title_tag:
            continue

        title = title_tag.get_text(strip=True)
        lowered = title.lower()
        if title_words and not any(word in lowered for word in title_words):
            continue

        # Look for an MP3 link directly inside the article.
        mp3_url = None
        for a in article.find_all("a", href=True):
            href = a["href"]
            # Compare the URL path only, so "x.mp3?token=..." still matches.
            if urlparse(href).path.lower().endswith(".mp3"):
                # Resolve against the final page URL (after redirects) so
                # relative links become downloadable.
                mp3_url = urljoin(response.url, href)
                break

        if mp3_url:
            episodes.append({"title": title, "url": mp3_url})

    return episodes
40
 
41
def download_and_zip_episodes(url):
    """Download the episodes listed at ``url`` into a single ZIP archive.

    Args:
        url: Address of the podcast series page.

    Returns:
        A ``(status_message, zip_path)`` tuple. ``zip_path`` is ``None`` when
        the page could not be loaded, no episode was found, or none of the
        episodes could be downloaded. The message reports how many episodes
        actually made it into the archive.
    """
    try:
        episodes = extract_podcast_episodes(url)
    except Exception as e:
        # Report the failure as a status message rather than raising.
        return f"Erreur lors du chargement de la page : {e}", None

    if not episodes:
        return "Aucun épisode valide trouvé", None

    os.makedirs("downloads", exist_ok=True)
    zip_filename = "episodes_radiofrance.zip"
    zip_path = os.path.join("downloads", zip_filename)

    downloaded = 0
    with ZipFile(zip_path, "w") as zipf:
        for i, episode in enumerate(episodes, start=1):
            mp3_url = episode["url"]
            title = episode["title"]
            ext = os.path.splitext(urlparse(mp3_url).path)[1]
            # Replace every character that is unsafe in a file name, not just
            # "/": scraped titles routinely contain ':' '?' or quotes.
            filename = re.sub(
                r'[\\/*?:"<>|\x00-\x1f]', "_", f"{i:02d} - {title}{ext}"
            )
            local_path = os.path.join("downloads", filename)

            try:
                # The context manager releases the streamed connection even
                # when the transfer fails part-way through.
                with requests.get(mp3_url, stream=True, timeout=15) as mp3_response:
                    mp3_response.raise_for_status()
                    with open(local_path, "wb") as f:
                        for chunk in mp3_response.iter_content(1024):
                            f.write(chunk)
                # Only complete downloads are added to the archive.
                zipf.write(local_path, arcname=filename)
                downloaded += 1
            except Exception as e:
                # Best effort: one broken episode must not abort the series.
                print(f"Erreur lors du téléchargement de {mp3_url}: {e}")
            finally:
                # Remove the staging file on success *and* on failure so that
                # partial downloads are not left behind.
                if os.path.exists(local_path):
                    os.remove(local_path)

    # Count what was really archived, and do not offer an empty ZIP.
    return f"{downloaded} épisode(s) téléchargé(s)", (zip_path if downloaded else None)
71
 
72
# Gradio front end: one text box for the series URL, and two outputs that
# receive the (status message, ZIP path) pair returned by
# download_and_zip_episodes.
iface = gr.Interface(
    fn=download_and_zip_episodes,
    inputs=gr.Textbox(label="URL de la page podcast de France Culture"),
    outputs=[
        gr.Text(label="Résultat"),
        gr.File(label="Fichier ZIP"),
    ],
    title="Téléchargeur de Podcast France Culture",
    description=(
        "Saisissez l’URL d’une série sur France Culture "
        "(ex: https://www.radiofrance.fr/franceculture/podcasts/...) "
        "pour télécharger uniquement les bons épisodes listés sur la page."
    ),
)

# Start the web UI only when the file is executed as a script.
if __name__ == "__main__":
    iface.launch()