Ribot committed on
Commit
31f35d4
·
verified ·
1 Parent(s): 52a320c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -33
app.py CHANGED
@@ -5,20 +5,28 @@ import os
5
  import zipfile
6
  import tempfile
7
  from urllib.parse import urljoin
 
8
  from mutagen.mp3 import MP3
9
- from mutagen.id3 import ID3, TIT2
10
 
11
  def get_clean_title(filepath):
12
  try:
13
  audio = MP3(filepath, ID3=ID3)
14
- if 'TIT2' in audio:
 
 
 
15
  title = audio['TIT2'].text[0]
16
- # Nettoyage des caractères spéciaux
17
- title = re.sub(r'[\\/*?:"<>|]', "", title).strip()
18
- return title
19
- except Exception as e:
20
- print(f"Erreur lecture métadonnées : {str(e)}")
21
- return os.path.basename(filepath).split('.')[0]
 
 
 
 
22
 
23
  def process_url(url):
24
  try:
@@ -26,15 +34,15 @@ def process_url(url):
26
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
27
  'Referer': 'https://www.radiofrance.fr/'
28
  }
29
- response = requests.get(url, headers=headers)
30
  response.raise_for_status()
31
  except Exception as e:
32
  return None, f"Erreur de connexion : {str(e)}"
33
 
34
- # Extraction ciblée des épisodes
35
  soup = BeautifulSoup(response.text, 'html.parser')
36
  main_content = soup.find('main') or soup
37
- episodes = main_content.find_all('article', class_=re.compile(r'episode|podcast'))
38
 
39
  mp3_links = []
40
  for episode in episodes:
@@ -42,31 +50,34 @@ def process_url(url):
42
  if script_tag:
43
  match = re.search(r'"contentUrl"\s*:\s*"([^"]+?\.mp3)', script_tag.string)
44
  if match:
45
- mp3_url = urljoin(url, match.group(1).split('?')[0])
46
- mp3_links.append(mp3_url)
 
47
 
48
- # Filtrage des doublons
49
- mp3_links = list(dict.fromkeys(mp3_links))[:4] # Limite aux 4 premiers épisodes
 
 
50
 
51
  if not mp3_links:
52
- return None, "Aucun épisode principal trouvé"
53
 
54
  temp_dir = tempfile.mkdtemp()
55
  filenames = []
56
 
57
  for idx, mp3_url in enumerate(mp3_links, 1):
58
  try:
59
- # Téléchargement original
60
- original_name = os.path.basename(mp3_url).split('?')[0]
61
- temp_path = os.path.join(temp_dir, f"temp_{idx}_{original_name}")
62
 
63
- with requests.get(mp3_url, headers=headers, stream=True, timeout=10) as r:
64
  r.raise_for_status()
65
  with open(temp_path, 'wb') as f:
66
  for chunk in r.iter_content(chunk_size=8192):
67
  f.write(chunk)
68
 
69
- # Renommage avec métadonnées
70
  clean_title = get_clean_title(temp_path)
71
  final_name = f"{idx:02d} - {clean_title}.mp3"
72
  final_path = os.path.join(temp_dir, final_name)
@@ -77,10 +88,10 @@ def process_url(url):
77
  continue
78
 
79
  if not filenames:
80
- return None, "Échec du téléchargement des épisodes"
81
 
82
- # Création du ZIP
83
- zip_path = os.path.join(temp_dir, 'podcast_episodes.zip')
84
  with zipfile.ZipFile(zip_path, 'w') as zipf:
85
  for file in filenames:
86
  zipf.write(file, arcname=os.path.basename(file))
@@ -93,23 +104,22 @@ def download_podcast(url):
93
  raise gr.Error(error)
94
  return zip_path
95
 
96
- with gr.Blocks(title="Podcast Clean Downloader") as app:
97
- gr.Markdown("## 🎙️ Téléchargeur Intelligent de Podcasts")
98
  with gr.Row():
99
  url_input = gr.Textbox(
100
- label="URL Radio France",
101
- placeholder="Collez ici l'URL de la série podcast...",
102
  max_lines=1
103
  )
104
- btn = gr.Button("Générer le ZIP des épisodes", variant="primary")
105
- output = gr.File(label="Télécharger les épisodes")
106
 
107
  examples = gr.Examples(
108
  examples=[[
109
  "https://www.radiofrance.fr/franceculture/podcasts/serie-le-secret-de-la-licorne-les-aventures-de-tintin"
110
  ]],
111
- inputs=[url_input],
112
- label="Exemple fonctionnel"
113
  )
114
 
115
  btn.click(
@@ -119,4 +129,4 @@ with gr.Blocks(title="Podcast Clean Downloader") as app:
119
  api_name="download"
120
  )
121
 
122
- app.launch(show_error=True, share=False)
 
5
  import zipfile
6
  import tempfile
7
  from urllib.parse import urljoin
8
+ from bs4 import BeautifulSoup
9
  from mutagen.mp3 import MP3
10
+ from mutagen.id3 import ID3, TIT3
11
 
12
  def get_clean_title(filepath):
13
  try:
14
  audio = MP3(filepath, ID3=ID3)
15
+ # Utilisation de TIT3 (Subtitle) qui contient souvent le numéro d'épisode
16
+ if 'TIT3' in audio:
17
+ title = audio['TIT3'].text[0]
18
+ elif 'TIT2' in audio:
19
  title = audio['TIT2'].text[0]
20
+ else:
21
+ return os.path.basename(filepath)
22
+
23
+ # Nettoyage avancé du titre
24
+ title = re.sub(r'\s*-\s*Radio France$', '', title, flags=re.IGNORECASE)
25
+ title = re.sub(r'[\\/*?:"<>|]', '', title).strip()
26
+ return title[:100] # Limite la longueur du nom de fichier
27
+
28
+ except Exception:
29
+ return os.path.basename(filepath).split('.')[0]
30
 
31
  def process_url(url):
32
  try:
 
34
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
35
  'Referer': 'https://www.radiofrance.fr/'
36
  }
37
+ response = requests.get(url, headers=headers, timeout=15)
38
  response.raise_for_status()
39
  except Exception as e:
40
  return None, f"Erreur de connexion : {str(e)}"
41
 
42
+ # Détection des épisodes avec BeautifulSoup
43
  soup = BeautifulSoup(response.text, 'html.parser')
44
  main_content = soup.find('main') or soup
45
+ episodes = main_content.find_all(['article', 'div'], class_=re.compile(r'episode|podcast|card', re.IGNORECASE))
46
 
47
  mp3_links = []
48
  for episode in episodes:
 
50
  if script_tag:
51
  match = re.search(r'"contentUrl"\s*:\s*"([^"]+?\.mp3)', script_tag.string)
52
  if match:
53
+ mp3_url = urljoin(url, match.group(1).split('?')[0])
54
+ if mp3_url not in mp3_links:
55
+ mp3_links.append(mp3_url)
56
 
57
+ # Alternative si détection échoue
58
+ if not mp3_links:
59
+ matches = re.findall(r'(https://media\.radiofrance-podcast\.net[^"\']+?\.mp3)', response.text)
60
+ mp3_links = list(dict.fromkeys(matches))
61
 
62
  if not mp3_links:
63
+ return None, "Aucun épisode trouvé - Vérifiez l'URL"
64
 
65
  temp_dir = tempfile.mkdtemp()
66
  filenames = []
67
 
68
  for idx, mp3_url in enumerate(mp3_links, 1):
69
  try:
70
+ # Téléchargement temporaire
71
+ temp_name = f"temp_{idx}.mp3"
72
+ temp_path = os.path.join(temp_dir, temp_name)
73
 
74
+ with requests.get(mp3_url, headers=headers, stream=True, timeout=20) as r:
75
  r.raise_for_status()
76
  with open(temp_path, 'wb') as f:
77
  for chunk in r.iter_content(chunk_size=8192):
78
  f.write(chunk)
79
 
80
+ # Renommage final
81
  clean_title = get_clean_title(temp_path)
82
  final_name = f"{idx:02d} - {clean_title}.mp3"
83
  final_path = os.path.join(temp_dir, final_name)
 
88
  continue
89
 
90
  if not filenames:
91
+ return None, "Échec du téléchargement"
92
 
93
+ # Création ZIP
94
+ zip_path = os.path.join(temp_dir, 'podcast.zip')
95
  with zipfile.ZipFile(zip_path, 'w') as zipf:
96
  for file in filenames:
97
  zipf.write(file, arcname=os.path.basename(file))
 
104
  raise gr.Error(error)
105
  return zip_path
106
 
107
+ with gr.Blocks(title="RadioFrance Podcaster Pro") as app:
108
+ gr.Markdown("## 🎧 Téléchargeur Intelligent Radio France")
109
  with gr.Row():
110
  url_input = gr.Textbox(
111
+ label="URL de la série podcast",
112
+ placeholder="Ex: https://www.radiofrance.fr/...",
113
  max_lines=1
114
  )
115
+ btn = gr.Button("Générer le ZIP", variant="primary")
116
+ output = gr.File(label="Épisodes téléchargés")
117
 
118
  examples = gr.Examples(
119
  examples=[[
120
  "https://www.radiofrance.fr/franceculture/podcasts/serie-le-secret-de-la-licorne-les-aventures-de-tintin"
121
  ]],
122
+ inputs=[url_input]
 
123
  )
124
 
125
  btn.click(
 
129
  api_name="download"
130
  )
131
 
132
+ app.launch(show_error=True)