wuhp committed on
Commit
c0bc836
·
verified ·
1 Parent(s): c11b091

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -31
app.py CHANGED
@@ -2,6 +2,10 @@ import gradio as gr
2
  from internetarchive import search_items, get_item
3
  import requests
4
  import time
 
 
 
 
5
 
6
  # --- VirusTotal helper functions ---
7
  def scan_url_vt(url, api_key):
@@ -22,9 +26,35 @@ def scan_url_vt(url, api_key):
22
  stats = attr.get("stats", {})
23
  return stats.get("malicious", 0) == 0
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  # --- Core search & scan logic ---
26
  def fetch_clean_videos(keywords, api_key, scan_enabled):
27
- # build IA query from comma-separated keywords
28
  query = " OR ".join([f"{kw.strip().replace(' ', '+')}" for kw in keywords.split(",")])
29
  ia_query = f"mediatype:(movies) AND ({query})"
30
  results = list(search_items(ia_query))[:50]
@@ -35,7 +65,7 @@ def fetch_clean_videos(keywords, api_key, scan_enabled):
35
  item = get_item(identifier)
36
  for f in item.files:
37
  fmt = f.get("format", "").lower()
38
- if fmt.startswith(('mpeg','mp4','avi','mov','webm','m4v')):
39
  url = f"https://archive.org/download/{identifier}/{f['name']}"
40
  if scan_enabled and api_key:
41
  try:
@@ -50,63 +80,79 @@ def fetch_clean_videos(keywords, api_key, scan_enabled):
50
 
51
  # --- Gradio UI setup ---
52
  with gr.Blocks() as demo:
53
- gr.Markdown("# 📼 IA Drone‑Strike Video Browser with VT Scan + Raw Metadata")
54
  with gr.Row():
55
- kw_input = gr.Textbox(label="Search keywords (comma-separated)", value="drone strike, military uav, kamikaze drone")
56
- key_input = gr.Textbox(label="VirusTotal API Key", type="password")
57
- scan_toggle = gr.Checkbox(label="Enable VirusTotal scan", value=True)
58
- run_btn = gr.Button("Search & Scan")
 
59
 
60
- url_dropdown = gr.Dropdown(label="Clean Video URLs", choices=[], interactive=True, allow_custom_value=True)
61
- video_player = gr.Video(label="Video Player")
62
- metadata_json = gr.JSON(label="► Raw IA Metadata")
 
 
63
 
64
  def search_and_populate(keywords, api_key, scan_enabled):
65
  urls = fetch_clean_videos(keywords, api_key, scan_enabled)
66
  return gr.update(choices=urls, value=urls[0] if urls else None)
67
 
68
- def update_player_and_metadata(selected_url):
69
- # extract IA identifier from URL: ".../download/{identifier}/{filename}"
 
 
 
 
70
  parts = selected_url.split("/")
71
  identifier = parts[4] if len(parts) > 4 else None
72
-
73
- raw = {
74
- "identifier": identifier,
75
- "metadata": {},
76
- "files": []
77
- }
78
-
79
  if identifier:
80
  try:
81
  item = get_item(identifier)
82
- # full descriptive metadata
83
- raw["metadata"] = item.metadata
84
- # per-file metadata (name, format, size, md5, + any extra tags)
85
- raw["files"] = [
86
  {
87
  "name": f.get("name"),
88
  "format": f.get("format"),
89
  "size": f.get("size"),
90
  "md5": f.get("md5"),
91
- **{k: v for k, v in f.items() if k not in ("name", "format", "size", "md5")}
92
  }
93
  for f in item.files
94
  ]
95
  except Exception:
96
- raw["error"] = "could not fetch IA metadata"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
- return gr.update(value=selected_url), raw
99
 
100
- # wire up the interactions
101
  run_btn.click(
102
  fn=search_and_populate,
103
- inputs=[kw_input, key_input, scan_toggle],
104
  outputs=[url_dropdown]
105
  )
106
  url_dropdown.change(
107
- fn=update_player_and_metadata,
108
- inputs=[url_dropdown],
109
- outputs=[video_player, metadata_json]
110
  )
111
 
112
  if __name__ == "__main__":
 
2
  from internetarchive import search_items, get_item
3
  import requests
4
  import time
5
+ import subprocess
6
+ import json
7
+ import re
8
+ from bs4 import BeautifulSoup
9
 
10
  # --- VirusTotal helper functions ---
11
  def scan_url_vt(url, api_key):
 
26
  stats = attr.get("stats", {})
27
  return stats.get("malicious", 0) == 0
28
 
29
+ # --- FFprobe metadata extraction ---
30
def extract_ffprobe_metadata(url_or_path, timeout=60):
    """Probe a media URL or local path with ``ffprobe`` and return its report.

    Args:
        url_or_path: Location of the media to inspect; handed to ffprobe as
            its input.
        timeout: Seconds to wait for ffprobe before aborting. Pass ``None``
            to wait indefinitely.

    Returns:
        The object decoded from ffprobe's JSON output (requested with
        ``-show_format`` and ``-show_streams``).

    Raises:
        ValueError: If ``url_or_path`` is empty or ``None``.
        OSError: If the ``ffprobe`` executable cannot be started.
        subprocess.CalledProcessError: If ffprobe exits with a non-zero code.
        subprocess.TimeoutExpired: If ffprobe runs longer than ``timeout``.
        json.JSONDecodeError: If ffprobe's output is not valid JSON.
    """
    if not url_or_path:
        raise ValueError("url_or_path must be a non-empty URL or file path")

    cmd = [
        "ffprobe", "-v", "error", "-print_format", "json",
        "-show_format", "-show_streams",
        # Pass the target through -i so a value that begins with "-" is
        # read as the input and never parsed as an ffprobe option.
        "-i", url_or_path,
    ]
    # Bound the run time: probing a remote URL can stall on the network and
    # would otherwise block the calling worker forever.
    out = subprocess.check_output(cmd, timeout=timeout)
    return json.loads(out)
38
+
39
+ # --- Scrape basic page metadata (title + og: tags) ---
40
def fetch_page_metadata(url):
    """Download a web page and summarise its title and social-card meta tags.

    Args:
        url: Address of the page to fetch.

    Returns:
        On success, a dict with ``"url"``, ``"title"`` (``None`` when the
        page has no ``<title>``) and one entry per ``<meta>`` tag whose
        ``property``/``name`` starts with ``og:`` or ``twitter:``, mapped to
        that tag's ``content``. On any failure, a dict with ``"url"`` and
        ``"error"`` holding the exception text.
    """
    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()
        page = BeautifulSoup(response.text, "html.parser")

        page_title = page.title.string if page.title else None
        summary = {"url": url, "title": page_title}

        # Collect OpenGraph and Twitter-card tags; a later tag with the same
        # key overwrites an earlier one.
        for meta_tag in page.find_all("meta"):
            key = meta_tag.get("property") or meta_tag.get("name")
            if not key or not key.startswith(("og:", "twitter:")):
                continue
            summary[key] = meta_tag.get("content")
        return summary
    except Exception as exc:
        # Best effort: report the failure in the result instead of raising.
        return {"url": url, "error": str(exc)}
55
+
56
  # --- Core search & scan logic ---
57
  def fetch_clean_videos(keywords, api_key, scan_enabled):
 
58
  query = " OR ".join([f"{kw.strip().replace(' ', '+')}" for kw in keywords.split(",")])
59
  ia_query = f"mediatype:(movies) AND ({query})"
60
  results = list(search_items(ia_query))[:50]
 
65
  item = get_item(identifier)
66
  for f in item.files:
67
  fmt = f.get("format", "").lower()
68
+ if fmt.startswith(("mpeg","mp4","avi","mov","webm","m4v")):
69
  url = f"https://archive.org/download/{identifier}/{f['name']}"
70
  if scan_enabled and api_key:
71
  try:
 
80
 
81
  # --- Gradio UI setup ---
82
  with gr.Blocks() as demo:
83
+ gr.Markdown("# 📼 IA Drone‑Strike Explorer \nEnable VT scan, FFprobe & Origin Tracing")
84
  with gr.Row():
85
+ kw_input = gr.Textbox(label="Search keywords", value="drone strike, military uav")
86
+ vt_key_input = gr.Textbox(label="VirusTotal API Key", type="password")
87
+ scan_toggle = gr.Checkbox(label="Enable VT scan", value=True)
88
+ ffprobe_toggle = gr.Checkbox(label="Enable FFprobe metadata", value=False)
89
+ run_btn = gr.Button("Search & Scan")
90
 
91
+ url_dropdown = gr.Dropdown(label="Clean Video URLs", choices=[], interactive=True)
92
+ video_player = gr.Video(label="Video Player")
93
+ ia_meta_json = gr.JSON(label="► Raw IA Metadata")
94
+ ffprobe_json = gr.JSON(label="► FFprobe Metadata")
95
+ origins_json = gr.JSON(label="► Source‑Origin Metadata")
96
 
97
  def search_and_populate(keywords, api_key, scan_enabled):
98
  urls = fetch_clean_videos(keywords, api_key, scan_enabled)
99
  return gr.update(choices=urls, value=urls[0] if urls else None)
100
 
101
def update_all(selected_url, ff_on, api_key):
    """Refresh the player and the three metadata panels for the chosen video.

    Args:
        selected_url: Download URL picked in the dropdown; falsy when nothing
            is selected.
        ff_on: Whether to run ffprobe on the selected URL.
        api_key: VirusTotal API key passed in by the event wiring; not used
            by this function.

    Returns:
        A 4-tuple ``(video, ia_metadata, ffprobe_metadata, origins)``:
        the URL for the player, a dict with the item's ``identifier``,
        ``metadata`` and ``files`` (plus ``error`` if the lookup failed),
        the ffprobe report (``{}`` when disabled, ``{"error": ...}`` on
        failure), and a list holding the scraped metadata of the first URL
        found in the item's description (empty when there is none).
    """
    # No selection guard: blank every output.
    if not selected_url:
        return None, {}, {}, []

    # 1) IA metadata + file list.
    # The identifier is the fifth "/"-separated piece of the download URL
    # (".../download/{identifier}/{filename}").
    parts = selected_url.split("/")
    identifier = parts[4] if len(parts) > 4 else None
    raw_ia = {"identifier": identifier, "metadata": {}, "files": []}
    if identifier:
        try:
            item = get_item(identifier)
            raw_ia["metadata"] = item.metadata
            raw_ia["files"] = [
                {
                    "name": f.get("name"),
                    "format": f.get("format"),
                    "size": f.get("size"),
                    "md5": f.get("md5"),
                    # Keep any extra per-file keys alongside the four above.
                    **{
                        k: v
                        for k, v in f.items()
                        if k not in ("name", "format", "size", "md5")
                    },
                }
                for f in item.files
            ]
        except Exception:
            raw_ia["error"] = "could not fetch IA metadata"

    # 2) FFprobe metadata if toggled.
    ff_md = {}
    if ff_on:
        try:
            ff_md = extract_ffprobe_metadata(selected_url)
        except Exception as e:
            ff_md = {"error": str(e)}

    # 3) Origin tracing: scrape the first URL mentioned in the description.
    origins = []
    metadata = raw_ia["metadata"]
    desc = metadata.get("description", "") if isinstance(metadata, dict) else ""
    # The description is not guaranteed to be a plain string (a repeated
    # field can arrive as a list, and the value can be None); re.findall
    # would raise TypeError on those, so normalise to text first.
    if isinstance(desc, (list, tuple)):
        desc = "\n".join(str(piece) for piece in desc)
    elif desc is None:
        desc = ""
    elif not isinstance(desc, str):
        desc = str(desc)

    # Stop the match at whitespace, quotes and angle brackets so a URL inside
    # an HTML attribute (single- or double-quoted) is not glued to the markup
    # that follows it, then drop trailing sentence punctuation.
    candidates = (
        found.rstrip(".,;:!?")
        for found in re.findall(r'https?://[^\s"\'<>]+', desc)
    )
    first_origin = next(candidates, None)
    if first_origin:
        # Only the first URL is treated as the origin; iterate over all
        # candidates instead to collect every linked page.
        origins.append(fetch_page_metadata(first_origin))

    return selected_url, raw_ia, ff_md, origins
146
 
 
147
  run_btn.click(
148
  fn=search_and_populate,
149
+ inputs=[kw_input, vt_key_input, scan_toggle],
150
  outputs=[url_dropdown]
151
  )
152
  url_dropdown.change(
153
+ fn=update_all,
154
+ inputs=[url_dropdown, ffprobe_toggle, vt_key_input],
155
+ outputs=[video_player, ia_meta_json, ffprobe_json, origins_json]
156
  )
157
 
158
  if __name__ == "__main__":