wuhp committed on
Commit
373adae
·
verified ·
1 Parent(s): 8f43a39

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -15
app.py CHANGED
@@ -6,6 +6,7 @@ import subprocess
6
  import json
7
  import re
8
  from bs4 import BeautifulSoup
 
9
 
10
  # --- News-station filter ---
11
  NEWS_FILTER = [
@@ -45,7 +46,6 @@ def extract_ffprobe_metadata(url_or_path):
45
  ]
46
  out = subprocess.check_output(cmd)
47
  md = json.loads(out)
48
- # compute a human-readable FPS for the first video stream
49
  for stream in md.get("streams", []):
50
  if stream.get("codec_type") == "video":
51
  avg_fr = stream.get("avg_frame_rate", "")
@@ -56,7 +56,7 @@ def extract_ffprobe_metadata(url_or_path):
56
  break
57
  return md
58
 
59
- # --- Scrape basic page metadata (title + og: tags) ---
60
  def fetch_page_metadata(url):
61
  try:
62
  resp = requests.get(url, timeout=5)
@@ -76,7 +76,17 @@ def fetch_clean_videos(keywords, api_key, scan_enabled):
76
  # build IA query
77
  query = " OR ".join(kw.strip().replace(" ", "+") for kw in keywords.split(","))
78
  ia_query = f"mediatype:(movies) AND ({query})"
79
- results = list(search_items(ia_query))[:50]
 
 
 
 
 
 
 
 
 
 
80
 
81
  clean_urls = []
82
  for res in results:
@@ -86,7 +96,11 @@ def fetch_clean_videos(keywords, api_key, scan_enabled):
86
  continue
87
 
88
  identifier = res["identifier"]
89
- item = get_item(identifier)
 
 
 
 
90
  for f in item.files:
91
  name = f.get("name", "").lower()
92
  # include common video file extensions
@@ -134,13 +148,8 @@ with gr.Blocks() as demo:
134
  item = get_item(identifier)
135
  raw_ia["metadata"] = item.metadata
136
  raw_ia["files"] = [
137
- {
138
- "name": f.get("name"),
139
- "format": f.get("format"),
140
- "size": f.get("size"),
141
- "md5": f.get("md5"),
142
- **{k: v for k, v in f.items() if k not in ("name", "format", "size", "md5")}
143
- }
144
  for f in item.files
145
  ]
146
  except Exception:
@@ -159,13 +168,13 @@ with gr.Blocks() as demo:
159
  source_url = None
160
  meta = raw_ia.get("metadata", {})
161
 
162
- # check explicit metadata fields
163
  for key, val in meta.items():
164
  if key.lower() in ("source", "originalurl"):
165
  source_url = val[0] if isinstance(val, list) else val
166
  break
167
 
168
- # fallback: external-identifier
169
  if not source_url:
170
  for key, val in meta.items():
171
  if key.lower().startswith("external-identifier"):
@@ -178,14 +187,13 @@ with gr.Blocks() as demo:
178
  source_url = f"https://vimeo.com/{vid}"
179
  break
180
 
181
- # last resort: first URL in description
182
  if not source_url:
183
  desc = meta.get("description", "")
184
  found = re.findall(r"https?://[^\s\"<]+", desc)
185
  if found:
186
  source_url = found[0]
187
 
188
- # fetch page metadata for the source
189
  if source_url:
190
  origins.append(fetch_page_metadata(source_url))
191
 
 
6
  import json
7
  import re
8
  from bs4 import BeautifulSoup
9
+ from requests.exceptions import ReadTimeout
10
 
11
  # --- News-station filter ---
12
  NEWS_FILTER = [
 
46
  ]
47
  out = subprocess.check_output(cmd)
48
  md = json.loads(out)
 
49
  for stream in md.get("streams", []):
50
  if stream.get("codec_type") == "video":
51
  avg_fr = stream.get("avg_frame_rate", "")
 
56
  break
57
  return md
58
 
59
+ # --- Scrape basic page metadata ---
60
  def fetch_page_metadata(url):
61
  try:
62
  resp = requests.get(url, timeout=5)
 
76
  # build IA query
77
  query = " OR ".join(kw.strip().replace(" ", "+") for kw in keywords.split(","))
78
  ia_query = f"mediatype:(movies) AND ({query})"
79
+ # robust search with retries
80
+ max_attempts = 3
81
+ for attempt in range(max_attempts):
82
+ try:
83
+ results = list(search_items(ia_query, rows=50))[:50]
84
+ break
85
+ except ReadTimeout:
86
+ if attempt < max_attempts - 1:
87
+ time.sleep(2 ** attempt)
88
+ else:
89
+ results = []
90
 
91
  clean_urls = []
92
  for res in results:
 
96
  continue
97
 
98
  identifier = res["identifier"]
99
+ try:
100
+ item = get_item(identifier)
101
+ except Exception:
102
+ continue
103
+
104
  for f in item.files:
105
  name = f.get("name", "").lower()
106
  # include common video file extensions
 
148
  item = get_item(identifier)
149
  raw_ia["metadata"] = item.metadata
150
  raw_ia["files"] = [
151
+ {"name": f.get("name"), "format": f.get("format"), "size": f.get("size"), "md5": f.get("md5"),
152
+ **{k: v for k, v in f.items() if k not in ("name", "format", "size", "md5")}}
 
 
 
 
 
153
  for f in item.files
154
  ]
155
  except Exception:
 
168
  source_url = None
169
  meta = raw_ia.get("metadata", {})
170
 
171
+ # explicit fields
172
  for key, val in meta.items():
173
  if key.lower() in ("source", "originalurl"):
174
  source_url = val[0] if isinstance(val, list) else val
175
  break
176
 
177
+ # fallback identifiers
178
  if not source_url:
179
  for key, val in meta.items():
180
  if key.lower().startswith("external-identifier"):
 
187
  source_url = f"https://vimeo.com/{vid}"
188
  break
189
 
190
+ # description fallback
191
  if not source_url:
192
  desc = meta.get("description", "")
193
  found = re.findall(r"https?://[^\s\"<]+", desc)
194
  if found:
195
  source_url = found[0]
196
 
 
197
  if source_url:
198
  origins.append(fetch_page_metadata(source_url))
199