Ribot committed on
Commit
fea2cb5
·
verified ·
1 Parent(s): 31f35d4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -63
app.py CHANGED
@@ -7,90 +7,76 @@ import tempfile
7
  from urllib.parse import urljoin
8
  from bs4 import BeautifulSoup
9
  from mutagen.mp3 import MP3
10
- from mutagen.id3 import ID3, TIT3
11
 
12
  def get_clean_title(filepath):
13
  try:
14
  audio = MP3(filepath, ID3=ID3)
15
- # Utilisation de TIT3 (Subtitle) qui contient souvent le numéro d'épisode
16
- if 'TIT3' in audio:
17
- title = audio['TIT3'].text[0]
18
- elif 'TIT2' in audio:
19
- title = audio['TIT2'].text[0]
20
- else:
21
- return os.path.basename(filepath)
22
-
23
- # Nettoyage avancé du titre
24
- title = re.sub(r'\s*-\s*Radio France$', '', title, flags=re.IGNORECASE)
25
- title = re.sub(r'[\\/*?:"<>|]', '', title).strip()
26
- return title[:100] # Limite la longueur du nom de fichier
27
-
28
  except Exception:
29
  return os.path.basename(filepath).split('.')[0]
30
 
31
  def process_url(url):
32
  try:
33
- headers = {
34
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
35
- 'Referer': 'https://www.radiofrance.fr/'
36
- }
37
  response = requests.get(url, headers=headers, timeout=15)
38
  response.raise_for_status()
39
  except Exception as e:
40
- return None, f"Erreur de connexion : {str(e)}"
41
 
42
- # Détection des épisodes avec BeautifulSoup
43
  soup = BeautifulSoup(response.text, 'html.parser')
44
- main_content = soup.find('main') or soup
45
- episodes = main_content.find_all(['article', 'div'], class_=re.compile(r'episode|podcast|card', re.IGNORECASE))
46
-
47
  mp3_links = []
48
- for episode in episodes:
49
- script_tag = episode.find('script', type='application/ld+json')
50
- if script_tag:
51
- match = re.search(r'"contentUrl"\s*:\s*"([^"]+?\.mp3)', script_tag.string)
52
- if match:
53
- mp3_url = urljoin(url, match.group(1).split('?')[0]
54
- if mp3_url not in mp3_links:
55
- mp3_links.append(mp3_url)
 
56
 
57
- # Alternative si détection échoue
58
  if not mp3_links:
59
- matches = re.findall(r'(https://media\.radiofrance-podcast\.net[^"\']+?\.mp3)', response.text)
60
- mp3_links = list(dict.fromkeys(matches))
 
 
61
 
62
  if not mp3_links:
63
- return None, "Aucun épisode trouvé - Vérifiez l'URL"
64
 
65
  temp_dir = tempfile.mkdtemp()
66
  filenames = []
67
 
68
  for idx, mp3_url in enumerate(mp3_links, 1):
69
  try:
70
- # Téléchargement temporaire
71
- temp_name = f"temp_{idx}.mp3"
72
- temp_path = os.path.join(temp_dir, temp_name)
73
 
 
74
  with requests.get(mp3_url, headers=headers, stream=True, timeout=20) as r:
75
  r.raise_for_status()
76
  with open(temp_path, 'wb') as f:
77
  for chunk in r.iter_content(chunk_size=8192):
78
  f.write(chunk)
79
-
80
- # Renommage final
81
- clean_title = get_clean_title(temp_path)
82
- final_name = f"{idx:02d} - {clean_title}.mp3"
83
  final_path = os.path.join(temp_dir, final_name)
84
  os.rename(temp_path, final_path)
85
-
86
  filenames.append(final_path)
87
- except Exception as e:
88
  continue
89
 
90
  if not filenames:
91
- return None, "Échec du téléchargement"
92
 
93
- # Création ZIP
94
  zip_path = os.path.join(temp_dir, 'podcast.zip')
95
  with zipfile.ZipFile(zip_path, 'w') as zipf:
96
  for file in filenames:
@@ -104,29 +90,20 @@ def download_podcast(url):
104
  raise gr.Error(error)
105
  return zip_path
106
 
107
- with gr.Blocks(title="RadioFrance Podcaster Pro") as app:
108
- gr.Markdown("## 🎧 Téléchargeur Intelligent Radio France")
109
  with gr.Row():
110
- url_input = gr.Textbox(
111
- label="URL de la série podcast",
112
- placeholder="Ex: https://www.radiofrance.fr/...",
113
- max_lines=1
114
- )
115
- btn = gr.Button("Générer le ZIP", variant="primary")
116
- output = gr.File(label="Épisodes téléchargés")
117
 
118
  examples = gr.Examples(
119
  examples=[[
120
  "https://www.radiofrance.fr/franceculture/podcasts/serie-le-secret-de-la-licorne-les-aventures-de-tintin"
121
  ]],
122
- inputs=[url_input]
123
  )
124
 
125
- btn.click(
126
- fn=download_podcast,
127
- inputs=url_input,
128
- outputs=output,
129
- api_name="download"
130
- )
131
 
132
- app.launch(show_error=True)
 
7
  from urllib.parse import urljoin
8
  from bs4 import BeautifulSoup
9
  from mutagen.mp3 import MP3
10
+ from mutagen.id3 import ID3
11
 
12
  def get_clean_title(filepath):
13
  try:
14
  audio = MP3(filepath, ID3=ID3)
15
+ for tag in ['TIT2', 'TIT3', 'TALB']:
16
+ if tag in audio:
17
+ title = audio[tag].text[0]
18
+ title = re.sub(r'[\\/*?:"<>|]', '', title).strip()
19
+ return title[:100]
20
+ return os.path.basename(filepath).split('.')[0]
 
 
 
 
 
 
 
21
  except Exception:
22
  return os.path.basename(filepath).split('.')[0]
23
 
24
  def process_url(url):
25
  try:
26
+ headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}
 
 
 
27
  response = requests.get(url, headers=headers, timeout=15)
28
  response.raise_for_status()
29
  except Exception as e:
30
+ return None, f"Erreur : {str(e)}"
31
 
 
32
  soup = BeautifulSoup(response.text, 'html.parser')
 
 
 
33
  mp3_links = []
34
+
35
+ # Méthode 1 : Balises JSON-LD
36
+ for script in soup.find_all('script', type='application/ld+json'):
37
+ if script.string:
38
+ matches = re.findall(r'"contentUrl"\s*:\s*"([^"]+?\.mp3)', script.string)
39
+ for match in matches:
40
+ clean_url = urljoin(url, match.split('?')[0])
41
+ if clean_url not in mp3_links:
42
+ mp3_links.append(clean_url)
43
 
44
+ # Méthode 2 : Fallback HTML
45
  if not mp3_links:
46
+ for a in soup.find_all('a', href=re.compile(r'\.mp3')):
47
+ mp3_url = urljoin(url, a['href'].split('?')[0])
48
+ if mp3_url not in mp3_links:
49
+ mp3_links.append(mp3_url)
50
 
51
  if not mp3_links:
52
+ return None, "Aucun épisode trouvé"
53
 
54
  temp_dir = tempfile.mkdtemp()
55
  filenames = []
56
 
57
  for idx, mp3_url in enumerate(mp3_links, 1):
58
  try:
59
+ temp_path = os.path.join(temp_dir, f"temp_{idx}.mp3")
 
 
60
 
61
+ # Téléchargement
62
  with requests.get(mp3_url, headers=headers, stream=True, timeout=20) as r:
63
  r.raise_for_status()
64
  with open(temp_path, 'wb') as f:
65
  for chunk in r.iter_content(chunk_size=8192):
66
  f.write(chunk)
67
+
68
+ # Renommage
69
+ title = get_clean_title(temp_path)
70
+ final_name = f"{idx:02d} - {title}.mp3"
71
  final_path = os.path.join(temp_dir, final_name)
72
  os.rename(temp_path, final_path)
 
73
  filenames.append(final_path)
74
+ except Exception:
75
  continue
76
 
77
  if not filenames:
78
+ return None, "Échec des téléchargements"
79
 
 
80
  zip_path = os.path.join(temp_dir, 'podcast.zip')
81
  with zipfile.ZipFile(zip_path, 'w') as zipf:
82
  for file in filenames:
 
90
  raise gr.Error(error)
91
  return zip_path
92
 
93
+ with gr.Blocks() as app:
94
+ gr.Markdown("## 🎙️ Téléchargeur Radio France")
95
  with gr.Row():
96
+ url = gr.Textbox(label="URL de la série", placeholder="Collez l'URL ici...")
97
+ btn = gr.Button("Télécharger", variant="primary")
98
+ output = gr.File(label="Épisodes")
 
 
 
 
99
 
100
  examples = gr.Examples(
101
  examples=[[
102
  "https://www.radiofrance.fr/franceculture/podcasts/serie-le-secret-de-la-licorne-les-aventures-de-tintin"
103
  ]],
104
+ inputs=[url]
105
  )
106
 
107
+ btn.click(download_podcast, inputs=url, outputs=output)
 
 
 
 
 
108
 
109
+ app.launch()