Ribot committed on
Commit
734bffb
·
verified ·
1 Parent(s): 96682d9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -107
app.py CHANGED
@@ -1,113 +1,93 @@
1
- import subprocess
2
- import sys
 
 
 
 
 
 
3
 
4
- # Installation automatique des dépendances
5
- def install(package):
6
- subprocess.check_call([sys.executable, "-m", "pip", "install", package])
7
-
8
- try:
9
- import gradio as gr
10
- import requests
11
- import re
12
- import os
13
- import zipfile
14
- from pathlib import Path
15
- except ImportError:
16
- install("gradio")
17
- install("requests")
18
- import gradio as gr
19
- import requests
20
- import re
21
- import os
22
- import zipfile
23
- from pathlib import Path
24
-
25
- # Nettoyage du nom de fichier
26
  def sanitize_filename(name):
27
- name = re.sub(r'[\\/*?:"<>|]', "", name)
28
- return name.strip().replace(" ", "_")[:100]
29
-
30
- # Extraction des liens MP3 + titres depuis HTML
31
- def extract_mp3_links_and_titles(html_text):
32
- mp3_regex = re.compile(r'https://[^\s"]+?\.mp3')
33
- title_regex = re.compile(r'title:\\"([^\\"]+)\\"')
34
-
35
- urls = mp3_regex.findall(html_text)
36
- titles = title_regex.findall(html_text)
37
-
38
- # Supprimer les doublons d'URL tout en gardant l'ordre
39
- seen = set()
40
- unique_urls = []
41
- for u in urls:
42
- if u not in seen:
43
- seen.add(u)
44
- unique_urls.append(u)
45
-
46
- # Compléter les titres manquants
47
- titles += [""] * (len(unique_urls) - len(titles))
48
- return list(zip(unique_urls, titles[:len(unique_urls)]))
49
-
50
- # Fonction principale
51
- def download_podcasts(url):
 
 
 
52
  try:
53
- response = requests.get(url)
54
- response.raise_for_status()
55
  except Exception as e:
56
- return f"Erreur de récupération de la page : {e}", None
57
-
58
- html_text = response.text
59
- mp3_entries = extract_mp3_links_and_titles(html_text)
60
-
61
- if not mp3_entries:
62
- return "Aucun fichier MP3 trouvé sur la page.", None
63
-
64
- temp_dir = Path("temp_episodes")
65
- temp_dir.mkdir(exist_ok=True)
66
- zip_path = temp_dir / "episodes_radiofrance.zip"
67
-
68
- used_filenames = set()
69
-
70
- with zipfile.ZipFile(zip_path, "w") as zipf:
71
- for idx, (mp3_url, title) in enumerate(mp3_entries, 1):
72
- base_name = f"{idx:02d}-" + (sanitize_filename(title) if title else "episode")
73
- filename = base_name + ".mp3"
74
-
75
- # Assurer l'unicité du nom de fichier
76
- counter = 1
77
- while filename in used_filenames:
78
- filename = f"{base_name}_{counter}.mp3"
79
- counter += 1
80
- used_filenames.add(filename)
81
-
82
- try:
83
- print(f"Téléchargement : {mp3_url}")
84
- r = requests.get(mp3_url, stream=True)
85
- r.raise_for_status()
86
-
87
- mp3_path = temp_dir / filename
88
- with open(mp3_path, "wb") as f:
89
- for chunk in r.iter_content(chunk_size=8192):
90
- f.write(chunk)
91
-
92
- zipf.write(mp3_path, arcname=filename)
93
- mp3_path.unlink() # Supprime le fichier après ajout au ZIP
94
-
95
- except Exception as e:
96
- print(f"Erreur lors du téléchargement de {mp3_url} : {e}")
97
-
98
- return "Téléchargement terminé !", str(zip_path)
99
-
100
- # Interface Gradio
101
- with gr.Blocks() as demo:
102
- gr.Markdown("## 🎧 Téléchargeur de podcasts Radio France")
103
- with gr.Row():
104
- url_input = gr.Textbox(label="URL de la page", placeholder="Collez ici une URL d'une page de podcast")
105
- download_btn = gr.Button("Télécharger les MP3 et générer un .zip")
106
- status = gr.Textbox(label="Statut")
107
- file_output = gr.File(label="Fichier ZIP à télécharger")
108
-
109
- download_btn.click(download_podcasts, inputs=url_input, outputs=[status, file_output])
110
 
111
- # Lancement (utile pour Hugging Face)
112
  if __name__ == "__main__":
113
- demo.launch()
 
1
+ import requests
2
+ import re
3
+ import os
4
+ import zipfile
5
+ import tempfile
6
+ import gradio as gr
7
+ from pathlib import Path
8
+ from bs4 import BeautifulSoup
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
def sanitize_filename(name):
    """Return *name* made safe for use as a filename.

    Strips the characters \\ / * ? : " < > |, trims surrounding
    whitespace, replaces inner spaces with underscores, and truncates
    the result to 100 characters.
    """
    cleaned = re.sub(r'[\\/*?:"<>|]', "", name)
    cleaned = cleaned.strip().replace(" ", "_")
    return cleaned[:100]
12
+
13
def extract_episode_links(html_text, base_url):
    """Collect (title, mp3_url) pairs for every episode on the page.

    Scans each <audio> element for a <source> child whose src ends in
    ".mp3", resolves page-relative URLs against *base_url*, and
    de-duplicates by URL while preserving document order. The title
    falls back to "episode" when no label attribute is present.
    """
    soup = BeautifulSoup(html_text, "html.parser")
    found = []
    known_urls = set()

    for player in soup.find_all("audio"):
        src_tag = player.find("source")
        if not src_tag:
            continue
        if not src_tag.get("src", "").endswith(".mp3"):
            continue

        # Prefer the most specific label available.
        label = (
            player.get("aria-label")
            or player.get("title")
            or src_tag.get("title")
            or "episode"
        )

        href = src_tag["src"]
        # Resolve relative links to absolute URLs.
        if not href.startswith("http"):
            href = requests.compat.urljoin(base_url, href)

        if href in known_urls:
            continue
        known_urls.add(href)
        found.append((label, href))

    return found
37
+
38
def download_podcast_series(url):
    """Download every MP3 episode found at *url* and bundle them in a ZIP.

    Fetches the page, extracts episode links via extract_episode_links(),
    downloads each MP3 (failures are logged and skipped), and returns a
    (status message, zip file path) tuple matching the Gradio outputs.
    The second element is None when the page fetch fails or no episode
    was found.
    """
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
    except Exception as e:
        return f"Erreur lors du chargement de la page : {e}", None

    episodes = extract_episode_links(r.text, url)

    if not episodes:
        return "Aucun épisode audio trouvé sur la page.", None

    # BUG FIX: the previous version built the ZIP inside a
    # tempfile.TemporaryDirectory() context, which deletes the directory
    # (and the ZIP) as soon as the function returns — before Gradio can
    # serve the file. mkdtemp() performs no automatic cleanup, so the
    # archive survives long enough to be downloaded.
    temp_dir = tempfile.mkdtemp(prefix="podcast_")
    zip_path = os.path.join(temp_dir, "podcast.zip")
    used_filenames = set()

    with zipfile.ZipFile(zip_path, "w") as zipf:
        for idx, (title, mp3_url) in enumerate(episodes, 1):
            base_name = f"{idx:02d}-" + sanitize_filename(title)
            filename = base_name + ".mp3"

            # Ensure archive member names are unique.
            count = 1
            while filename in used_filenames:
                filename = f"{base_name}_{count}.mp3"
                count += 1
            used_filenames.add(filename)

            try:
                audio = requests.get(mp3_url, stream=True, timeout=15)
                audio.raise_for_status()
                temp_mp3_path = os.path.join(temp_dir, filename)
                with open(temp_mp3_path, "wb") as f:
                    for chunk in audio.iter_content(8192):
                        f.write(chunk)
                zipf.write(temp_mp3_path, arcname=filename)
                # Free disk space once the episode is inside the archive.
                os.remove(temp_mp3_path)
            except Exception as e:
                # Best-effort: log and continue with the remaining episodes.
                print(f"Erreur lors du téléchargement de {mp3_url} : {e}")

    return "Téléchargement terminé !", zip_path
79
+
80
# Gradio UI: one text input for the podcast-series URL, two outputs
# (a status message plus the generated ZIP offered for download).
interface = gr.Interface(
    fn=download_podcast_series,
    inputs=gr.Textbox(label="URL du podcast radio (ex: France Culture)", placeholder="https://www.radiofrance.fr/franceculture/podcasts/serie-le-capitaine-fracasse-de-theophile-gautier"),
    outputs=[
        gr.Textbox(label="Statut"),
        gr.File(label="Fichier ZIP des épisodes")
    ],
    title="Téléchargeur de Podcast Radio (.mp3)",
    description="Collez un lien vers une série de podcast Radio France (ex: France Culture). Seuls les fichiers .mp3 correspondant aux épisodes seront extraits et regroupés dans un fichier ZIP téléchargeable.",
    allow_flagging="never"
)
 
 
 
 
 
 
91
 
 
92
# Launch the web app when run directly (Hugging Face Spaces entry point).
if __name__ == "__main__":
    interface.launch()