Ribot committed on
Commit
64b2d1d
·
verified ·
1 Parent(s): fea2cb5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -47
app.py CHANGED
@@ -6,76 +6,71 @@ import zipfile
6
  import tempfile
7
  from urllib.parse import urljoin
8
  from bs4 import BeautifulSoup
9
- from mutagen.mp3 import MP3
10
- from mutagen.id3 import ID3
11
-
12
- def get_clean_title(filepath):
13
- try:
14
- audio = MP3(filepath, ID3=ID3)
15
- for tag in ['TIT2', 'TIT3', 'TALB']:
16
- if tag in audio:
17
- title = audio[tag].text[0]
18
- title = re.sub(r'[\\/*?:"<>|]', '', title).strip()
19
- return title[:100]
20
- return os.path.basename(filepath).split('.')[0]
21
- except Exception:
22
- return os.path.basename(filepath).split('.')[0]
23
 
24
  def process_url(url):
25
  try:
26
- headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}
27
- response = requests.get(url, headers=headers, timeout=15)
 
 
 
28
  response.raise_for_status()
29
  except Exception as e:
30
- return None, f"Erreur : {str(e)}"
31
 
32
  soup = BeautifulSoup(response.text, 'html.parser')
33
  mp3_links = []
34
-
35
- # Méthode 1 : Balises JSON-LD
36
- for script in soup.find_all('script', type='application/ld+json'):
 
37
  if script.string:
38
  matches = re.findall(r'"contentUrl"\s*:\s*"([^"]+?\.mp3)', script.string)
39
  for match in matches:
40
- clean_url = urljoin(url, match.split('?')[0])
41
- if clean_url not in mp3_links:
42
- mp3_links.append(clean_url)
43
 
44
- # Méthode 2 : Fallback HTML
45
  if not mp3_links:
46
- for a in soup.find_all('a', href=re.compile(r'\.mp3')):
47
- mp3_url = urljoin(url, a['href'].split('?')[0])
48
- if mp3_url not in mp3_links:
49
- mp3_links.append(mp3_url)
50
 
 
51
  if not mp3_links:
52
- return None, "Aucun épisode trouvé"
 
 
 
 
 
 
 
53
 
 
 
 
 
54
  temp_dir = tempfile.mkdtemp()
55
  filenames = []
56
 
57
  for idx, mp3_url in enumerate(mp3_links, 1):
58
  try:
59
- temp_path = os.path.join(temp_dir, f"temp_{idx}.mp3")
 
60
 
61
- # Téléchargement
62
- with requests.get(mp3_url, headers=headers, stream=True, timeout=20) as r:
63
  r.raise_for_status()
64
- with open(temp_path, 'wb') as f:
65
  for chunk in r.iter_content(chunk_size=8192):
66
  f.write(chunk)
67
-
68
- # Renommage
69
- title = get_clean_title(temp_path)
70
- final_name = f"{idx:02d} - {title}.mp3"
71
- final_path = os.path.join(temp_dir, final_name)
72
- os.rename(temp_path, final_path)
73
- filenames.append(final_path)
74
  except Exception:
75
  continue
76
 
77
  if not filenames:
78
- return None, "Échec des téléchargements"
79
 
80
  zip_path = os.path.join(temp_dir, 'podcast.zip')
81
  with zipfile.ZipFile(zip_path, 'w') as zipf:
@@ -91,19 +86,28 @@ def download_podcast(url):
91
  return zip_path
92
 
93
  with gr.Blocks() as app:
94
- gr.Markdown("## 🎙️ Téléchargeur Radio France")
95
  with gr.Row():
96
- url = gr.Textbox(label="URL de la série", placeholder="Collez l'URL ici...")
97
- btn = gr.Button("Télécharger", variant="primary")
98
- output = gr.File(label="Épisodes")
 
 
 
 
99
 
100
  examples = gr.Examples(
101
  examples=[[
102
  "https://www.radiofrance.fr/franceculture/podcasts/serie-le-secret-de-la-licorne-les-aventures-de-tintin"
103
  ]],
104
- inputs=[url]
105
  )
106
 
107
- btn.click(download_podcast, inputs=url, outputs=output)
 
 
 
 
 
108
 
109
  app.launch()
 
6
  import tempfile
7
  from urllib.parse import urljoin
8
  from bs4 import BeautifulSoup
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  def process_url(url):
11
  try:
12
+ headers = {
13
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
14
+ 'Referer': 'https://www.radiofrance.fr/'
15
+ }
16
+ response = requests.get(url, headers=headers)
17
  response.raise_for_status()
18
  except Exception as e:
19
+ return None, f"Erreur de connexion : {str(e)}"
20
 
21
  soup = BeautifulSoup(response.text, 'html.parser')
22
  mp3_links = []
23
+
24
+ # Nouvelle méthode de détection
25
+ scripts = soup.find_all('script', type='application/ld+json')
26
+ for script in scripts:
27
  if script.string:
28
  matches = re.findall(r'"contentUrl"\s*:\s*"([^"]+?\.mp3)', script.string)
29
  for match in matches:
30
+ full_url = urljoin(url, match.split('?')[0])
31
+ if full_url not in mp3_links:
32
+ mp3_links.append(full_url)
33
 
34
+ # Fallback pour les URLs dans les attributs data
35
  if not mp3_links:
36
+ for tag in soup.find_all(attrs={"data-url": re.compile(r"\.mp3")}):
37
+ mp3_url = urljoin(url, tag['data-url'].split('?')[0])
38
+ mp3_links.append(mp3_url)
 
39
 
40
+ # Dernier recours : recherche globale
41
  if not mp3_links:
42
+ matches = re.findall(r'(https?://[^\s"\']+?\.mp3)', response.text)
43
+ for match in matches:
44
+ clean_url = urljoin(url, match.split('?')[0])
45
+ if clean_url not in mp3_links and 'podcast' in clean_url:
46
+ mp3_links.append(clean_url)
47
+
48
+ # Filtrage final
49
+ mp3_links = list(dict.fromkeys(mp3_links)) # Supprime les doublons
50
 
51
+ if not mp3_links:
52
+ return None, "Aucun épisode trouvé - Structure de page inconnue"
53
+
54
+ # Téléchargement et création ZIP
55
  temp_dir = tempfile.mkdtemp()
56
  filenames = []
57
 
58
  for idx, mp3_url in enumerate(mp3_links, 1):
59
  try:
60
+ filename = f"{idx:02d}_{os.path.basename(mp3_url)}"
61
+ filepath = os.path.join(temp_dir, filename)
62
 
63
+ with requests.get(mp3_url, headers=headers, stream=True) as r:
 
64
  r.raise_for_status()
65
+ with open(filepath, 'wb') as f:
66
  for chunk in r.iter_content(chunk_size=8192):
67
  f.write(chunk)
68
+ filenames.append(filepath)
 
 
 
 
 
 
69
  except Exception:
70
  continue
71
 
72
  if not filenames:
73
+ return None, "Échec du téléchargement"
74
 
75
  zip_path = os.path.join(temp_dir, 'podcast.zip')
76
  with zipfile.ZipFile(zip_path, 'w') as zipf:
 
86
  return zip_path
87
 
88
  with gr.Blocks() as app:
89
+ gr.Markdown("## 🎧 Téléchargeur Radio France")
90
  with gr.Row():
91
+ url_input = gr.Textbox(
92
+ label="URL du podcast",
93
+ placeholder="Ex: https://www.radiofrance.fr/...",
94
+ max_lines=1
95
+ )
96
+ btn = gr.Button("Télécharger les épisodes", variant="primary")
97
+ output = gr.File(label="Fichier ZIP")
98
 
99
  examples = gr.Examples(
100
  examples=[[
101
  "https://www.radiofrance.fr/franceculture/podcasts/serie-le-secret-de-la-licorne-les-aventures-de-tintin"
102
  ]],
103
+ inputs=[url_input]
104
  )
105
 
106
+ btn.click(
107
+ fn=download_podcast,
108
+ inputs=url_input,
109
+ outputs=output,
110
+ api_name="download"
111
+ )
112
 
113
  app.launch()