wuhp committed on
Commit
0804932
·
verified ·
1 Parent(s): 683437d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -94
app.py CHANGED
@@ -5,9 +5,11 @@ import time
5
  import subprocess
6
  import json
7
  import re
 
 
8
  from bs4 import BeautifulSoup
9
 
10
- # --- VirusTotal helper functions ---
11
  def scan_url_vt(url, api_key):
12
  headers = {"x-apikey": api_key}
13
  resp = requests.post(
@@ -23,10 +25,9 @@ def scan_url_vt(url, api_key):
23
  status_resp.raise_for_status()
24
  attr = status_resp.json()["data"]["attributes"]
25
  if attr.get("status") == "completed":
26
- stats = attr.get("stats", {})
27
- return stats.get("malicious", 0) == 0
28
 
29
- # --- FFprobe metadata extraction ---
30
  def extract_ffprobe_metadata(url_or_path):
31
  cmd = [
32
  "ffprobe", "-v", "error", "-print_format", "json",
@@ -36,124 +37,127 @@ def extract_ffprobe_metadata(url_or_path):
36
  out = subprocess.check_output(cmd)
37
  return json.loads(out)
38
 
39
- # --- Scrape basic page metadata (title + og: tags) ---
40
  def fetch_page_metadata(url):
41
  try:
42
  resp = requests.get(url, timeout=5)
43
  resp.raise_for_status()
44
- html = resp.text
45
- soup = BeautifulSoup(html, "html.parser")
46
  meta = {"url": url, "title": soup.title.string if soup.title else None}
47
- # grab OpenGraph tags
48
  for tag in soup.find_all("meta"):
49
  prop = tag.get("property") or tag.get("name")
50
  if prop and prop.startswith(("og:", "twitter:")):
51
  meta[prop] = tag.get("content")
 
 
 
52
  return meta
53
  except Exception as e:
54
  return {"url": url, "error": str(e)}
55
 
56
- # --- Core search & scan logic ---
57
- def fetch_clean_videos(keywords, api_key, scan_enabled):
58
- query = " OR ".join([f"{kw.strip().replace(' ', '+')}" for kw in keywords.split(",")])
 
59
  ia_query = f"mediatype:(movies) AND ({query})"
60
  results = list(search_items(ia_query))[:50]
61
-
62
- clean_urls = []
63
  for res in results:
64
- identifier = res["identifier"]
65
- item = get_item(identifier)
 
 
66
  for f in item.files:
67
- fmt = f.get("format", "").lower()
68
- if fmt.startswith(("mpeg","mp4","avi","mov","webm","m4v")):
69
- url = f"https://archive.org/download/{identifier}/{f['name']}"
70
  if scan_enabled and api_key:
71
  try:
72
- is_clean = scan_url_vt(url, api_key)
73
- except Exception:
 
74
  continue
75
- else:
76
- is_clean = True
77
- if is_clean:
78
- clean_urls.append(url)
79
- return clean_urls
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
- # --- Gradio UI setup ---
82
  with gr.Blocks() as demo:
83
- gr.Markdown("# 📼 IA Drone‑Strike Explorer \nEnable VT scan, FFprobe & Origin Tracing")
84
  with gr.Row():
85
- kw_input = gr.Textbox(label="Search keywords", value="drone strike, military uav")
86
- vt_key_input = gr.Textbox(label="VirusTotal API Key", type="password")
87
- scan_toggle = gr.Checkbox(label="Enable VT scan", value=True)
88
- ffprobe_toggle = gr.Checkbox(label="Enable FFprobe metadata", value=False)
89
- run_btn = gr.Button("Search & Scan")
90
 
91
- url_dropdown = gr.Dropdown(label="Clean Video URLs", choices=[], interactive=True)
92
- video_player = gr.Video(label="Video Player")
93
- ia_meta_json = gr.JSON(label="► Raw IA Metadata")
94
- ffprobe_json = gr.JSON(label="FFprobe Metadata")
95
- origins_json = gr.JSON(label=" Source‑Origin Metadata")
 
96
 
97
- def search_and_populate(keywords, api_key, scan_enabled):
98
- urls = fetch_clean_videos(keywords, api_key, scan_enabled)
99
  return gr.update(choices=urls, value=urls[0] if urls else None)
100
 
101
- def update_all(selected_url, ff_on, api_key):
102
- # no selection guard
103
- if not selected_url:
104
- return None, {}, {}, []
105
-
106
- # 1) IA metadata + file list
107
- parts = selected_url.split("/")
108
- identifier = parts[4] if len(parts) > 4 else None
109
- raw_ia = {"identifier": identifier, "metadata": {}, "files": []}
110
- if identifier:
111
- try:
112
- item = get_item(identifier)
113
- raw_ia["metadata"] = item.metadata
114
- raw_ia["files"] = [
115
- {
116
- "name": f.get("name"),
117
- "format": f.get("format"),
118
- "size": f.get("size"),
119
- "md5": f.get("md5"),
120
- **{k: v for k,v in f.items() if k not in ("name","format","size","md5")}
121
- }
122
- for f in item.files
123
- ]
124
- except Exception:
125
- raw_ia["error"] = "could not fetch IA metadata"
126
-
127
- # 2) FFprobe metadata if toggled
128
- ff_md = {}
129
- if ff_on:
130
- try:
131
- ff_md = extract_ffprobe_metadata(selected_url)
132
- except Exception as e:
133
- ff_md = {"error": str(e)}
134
 
135
- # 3) Origin tracing: scrape each URL in description
136
- origins = []
137
- desc = raw_ia["metadata"].get("description", "")
138
- urls_found = re.findall(r'https?://[^\s"<]+', desc)
139
- for url in urls_found:
140
- meta = fetch_page_metadata(url)
141
- origins.append(meta)
142
- # stop at first “real” origin (you can remove this break to collect all)
143
- break
144
-
145
- return selected_url, raw_ia, ff_md, origins
146
 
147
- run_btn.click(
148
- fn=search_and_populate,
149
- inputs=[kw_input, vt_key_input, scan_toggle],
150
- outputs=[url_dropdown]
151
- )
152
- url_dropdown.change(
153
- fn=update_all,
154
- inputs=[url_dropdown, ffprobe_toggle, vt_key_input],
155
- outputs=[video_player, ia_meta_json, ffprobe_json, origins_json]
156
- )
157
 
158
- if __name__ == "__main__":
159
  demo.launch()
 
5
  import subprocess
6
  import json
7
  import re
8
+ import networkx as nx
9
+ from pyvis.network import Network
10
  from bs4 import BeautifulSoup
11
 
12
+ # --- VirusTotal helper ---
13
  def scan_url_vt(url, api_key):
14
  headers = {"x-apikey": api_key}
15
  resp = requests.post(
 
25
  status_resp.raise_for_status()
26
  attr = status_resp.json()["data"]["attributes"]
27
  if attr.get("status") == "completed":
28
+ return attr.get("stats", {}).get("malicious", 0) == 0
 
29
 
30
+ # --- FFprobe ---
31
  def extract_ffprobe_metadata(url_or_path):
32
  cmd = [
33
  "ffprobe", "-v", "error", "-print_format", "json",
 
37
  out = subprocess.check_output(cmd)
38
  return json.loads(out)
39
 
40
# --- Fetch page metadata + favicon ---
def fetch_page_metadata(url):
    """Fetch *url* and extract lightweight page metadata.

    Returns a dict with the url, the <title> text (or None), every
    OpenGraph/Twitter <meta> tag, and a resolved favicon URL (or None).
    On any failure returns {"url": ..., "error": ...} instead of raising,
    so callers can treat scraping as best-effort.
    """
    try:
        resp = requests.get(url, timeout=5)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        meta = {"url": url, "title": soup.title.string if soup.title else None}
        for tag in soup.find_all("meta"):
            prop = tag.get("property") or tag.get("name")
            if prop and prop.startswith(("og:", "twitter:")):
                meta[prop] = tag.get("content")
        # favicon: resolve relative hrefs against the page URL.  Guard a
        # <link rel="icon"> that carries no href -- urljoin(url, None)
        # raises and previously turned the whole result into an error dict.
        icon = soup.find("link", rel=lambda x: x and "icon" in x)
        href = icon.get("href") if icon else None
        meta["favicon"] = requests.compat.urljoin(url, href) if href else None
        return meta
    except Exception as e:
        # network/HTTP/parse problems degrade to an error record, not a crash
        return {"url": url, "error": str(e)}
57
 
58
# --- IA search & filter raw footage ---
# Items whose title mentions a broadcaster are skipped: we want raw
# uploads, not news-station re-edits.
NEWS_STATIONS = ["cnn", "fox", "bbc", "nbc", "al jazeera", "rt "]

def fetch_raw_footage_urls(keywords, api_key, scan_enabled):
    """Search Internet Archive for downloadable video files.

    keywords     -- comma-separated search terms
    api_key      -- VirusTotal API key (may be empty)
    scan_enabled -- when True and api_key is set, drop URLs that VT flags
                    as malicious, or whose scan errors out

    Returns a list of direct archive.org download URLs.
    """
    query = " OR ".join(kw.strip().replace(" ", "+") for kw in keywords.split(","))
    ia_query = f"mediatype:(movies) AND ({query})"
    results = list(search_items(ia_query))[:50]  # cap result set at 50 items
    urls = []
    for res in results:
        item = get_item(res["identifier"])
        # NOTE(review): IA "title" metadata can occasionally be a list --
        # this assumes a string; confirm against real items.
        title = item.metadata.get("title", "").lower()
        if any(ns in title for ns in NEWS_STATIONS):
            continue
        for f in item.files:
            fmt = f.get("format", "").lower()
            if fmt.startswith(("mpeg", "mp4", "avi", "mov", "webm", "m4v")):
                url = f"https://archive.org/download/{res['identifier']}/{f['name']}"
                if scan_enabled and api_key:
                    try:
                        if not scan_url_vt(url, api_key):
                            continue  # VT reports the file as malicious
                    except Exception:
                        # was a bare `except:` -- that also swallowed
                        # SystemExit/KeyboardInterrupt; skip on scan errors
                        continue
                urls.append(url)
    return urls
82
+
83
+ # --- Recursive origin tracing ---
84
+ def trace_origins(description, depth=0, max_depth=3, visited=None):
85
+ if visited is None: visited = set()
86
+ nodes = []
87
+ links = []
88
+ urls = re.findall(r'https?://[^\s"<]+', description)
89
+ for url in urls:
90
+ if url in visited: continue
91
+ visited.add(url)
92
+ meta = fetch_page_metadata(url)
93
+ nodes.append((url, meta))
94
+ if depth < max_depth and 'description' in meta:
95
+ sub_nodes, sub_links = trace_origins(meta.get('description',''), depth+1, max_depth, visited)
96
+ links.extend(sub_links)
97
+ nodes.extend(sub_nodes)
98
+ # link from origin to IA later
99
+ links.append((url, 'internet_archive'))
100
+ return nodes, links
101
+
102
# --- Build graph HTML via pyvis ---
def build_graph(nodes, links):
    """Render (url, metadata) nodes and (src, dst) edges as an
    interactive pyvis network and return its standalone HTML."""
    sentinel = ("internet_archive", {"title": "Internet Archive"})
    net = Network(height="400px", width="100%", directed=True)
    for node_id, meta in nodes + [sentinel]:
        icon = meta.get("favicon")
        net.add_node(
            node_id,
            label=meta.get("title") or node_id,
            title=json.dumps(meta),  # hover tooltip shows the raw metadata
            shape="image" if icon else "dot",
            image=icon if icon else None,
        )
    for src, dst in links:
        net.add_edge(src, dst)
    net.force_atlas_2based()
    return net.generate_html()
114
 
115
# --- Gradio UI ---
with gr.Blocks() as demo:
    gr.Markdown("# IA Drone‑Strike Chain Explorer")
    with gr.Row():
        kw_input = gr.Textbox(label="Search keywords", value="drone strike, military uav")
        vt_key = gr.Textbox(label="VirusTotal API Key", type="password")
        scan_toggle = gr.Checkbox(label="Enable VT scan", value=True)
        ffprobe_toggle = gr.Checkbox(label="Enable FFprobe", value=False)
        run_btn = gr.Button("Search & Scan")

    url_dd = gr.Dropdown(label="Raw Footage URLs", choices=[], interactive=True)
    video = gr.Video(label="Player")
    ia_meta = gr.JSON(label="IA Metadata")
    ff_meta = gr.JSON(label="FFprobe Metadata")
    graph_html = gr.HTML(label="Reupload Chain Graph")
    origin_meta = gr.JSON(label="Clicked Origin Metadata")

    def search_populate(kw, api_key, scan_on):
        """Run the IA search and load the dropdown with the found URLs."""
        urls = fetch_raw_footage_urls(kw, api_key, scan_on)
        return gr.update(choices=urls, value=urls[0] if urls else None)

    def on_select(url, ff_on, api_key):
        """Populate player, IA metadata, ffprobe metadata and the origin
        graph for the selected download URL."""
        if not url:
            return None, {}, {}, "", {}

        # 1) IA metadata: .../download/<identifier>/<file> -> identifier.
        #    Guard the index and the fetch -- a malformed URL or an IA
        #    outage previously crashed the whole handler.
        parts = url.split("/")
        ident = parts[4] if len(parts) > 4 else None
        raw = {"metadata": {}, "files": []}
        if ident:
            try:
                item = get_item(ident)
                raw = {"metadata": item.metadata,
                       "files": [dict(f) for f in item.files]}
            except Exception:
                raw["error"] = "could not fetch IA metadata"

        # 2) ffprobe only when toggled; report failures instead of raising
        try:
            ff = extract_ffprobe_metadata(url) if ff_on else {}
        except Exception as e:
            ff = {"error": str(e)}

        # 3) Origin tracing + graph.  Keep the edge list produced by
        #    trace_origins (the old code threw it away and rebuilt a flat
        #    star, losing any recursive structure) and add the IA node.
        nodes, links = trace_origins(raw["metadata"].get("description", ""))
        nodes.append(("internet_archive", {"title": "Internet Archive"}))
        html = build_graph(nodes, links)
        return url, raw, ff, html, {}

    def on_click_node(node_id):
        """Metadata for a clicked graph node (simplistic: re-fetch the page)."""
        if node_id == "internet_archive":
            return {"title": "Internet Archive"}
        return fetch_page_metadata(node_id)

    run_btn.click(search_populate, [kw_input, vt_key, scan_toggle], [url_dd])
    url_dd.change(on_select, [url_dd, ffprobe_toggle, vt_key],
                  [video, ia_meta, ff_meta, graph_html, origin_meta])
    # NOTE(review): gr.HTML does not expose a .click event in every Gradio
    # release -- confirm against the pinned gradio version.
    graph_html.click(on_click_node, None, origin_meta)

if __name__ == "__main__":
    demo.launch()