wuhp committed on
Commit
0804932
·
verified ·
1 Parent(s): 683437d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -94
app.py CHANGED
@@ -5,9 +5,11 @@ import time
5
  import subprocess
6
  import json
7
  import re
 
 
8
  from bs4 import BeautifulSoup
9
 
10
- # --- VirusTotal helper functions ---
11
  def scan_url_vt(url, api_key):
12
  headers = {"x-apikey": api_key}
13
  resp = requests.post(
@@ -23,10 +25,9 @@ def scan_url_vt(url, api_key):
23
  status_resp.raise_for_status()
24
  attr = status_resp.json()["data"]["attributes"]
25
  if attr.get("status") == "completed":
26
- stats = attr.get("stats", {})
27
- return stats.get("malicious", 0) == 0
28
 
29
- # --- FFprobe metadata extraction ---
30
  def extract_ffprobe_metadata(url_or_path):
31
  cmd = [
32
  "ffprobe", "-v", "error", "-print_format", "json",
@@ -36,124 +37,127 @@ def extract_ffprobe_metadata(url_or_path):
36
  out = subprocess.check_output(cmd)
37
  return json.loads(out)
38
 
39
- # --- Scrape basic page metadata (title + og: tags) ---
40
  def fetch_page_metadata(url):
41
  try:
42
  resp = requests.get(url, timeout=5)
43
  resp.raise_for_status()
44
- html = resp.text
45
- soup = BeautifulSoup(html, "html.parser")
46
  meta = {"url": url, "title": soup.title.string if soup.title else None}
47
- # grab OpenGraph tags
48
  for tag in soup.find_all("meta"):
49
  prop = tag.get("property") or tag.get("name")
50
  if prop and prop.startswith(("og:", "twitter:")):
51
  meta[prop] = tag.get("content")
 
 
 
52
  return meta
53
  except Exception as e:
54
  return {"url": url, "error": str(e)}
55
 
56
- # --- Core search & scan logic ---
57
- def fetch_clean_videos(keywords, api_key, scan_enabled):
58
- query = " OR ".join([f"{kw.strip().replace(' ', '+')}" for kw in keywords.split(",")])
 
59
  ia_query = f"mediatype:(movies) AND ({query})"
60
  results = list(search_items(ia_query))[:50]
61
-
62
- clean_urls = []
63
  for res in results:
64
- identifier = res["identifier"]
65
- item = get_item(identifier)
 
 
66
  for f in item.files:
67
- fmt = f.get("format", "").lower()
68
- if fmt.startswith(("mpeg","mp4","avi","mov","webm","m4v")):
69
- url = f"https://archive.org/download/{identifier}/{f['name']}"
70
  if scan_enabled and api_key:
71
  try:
72
- is_clean = scan_url_vt(url, api_key)
73
- except Exception:
 
74
  continue
75
- else:
76
- is_clean = True
77
- if is_clean:
78
- clean_urls.append(url)
79
- return clean_urls
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
- # --- Gradio UI setup ---
82
  with gr.Blocks() as demo:
83
- gr.Markdown("# 📼 IA Drone‑Strike Explorer \nEnable VT scan, FFprobe & Origin Tracing")
84
  with gr.Row():
85
- kw_input = gr.Textbox(label="Search keywords", value="drone strike, military uav")
86
- vt_key_input = gr.Textbox(label="VirusTotal API Key", type="password")
87
- scan_toggle = gr.Checkbox(label="Enable VT scan", value=True)
88
- ffprobe_toggle = gr.Checkbox(label="Enable FFprobe metadata", value=False)
89
- run_btn = gr.Button("Search & Scan")
90
 
91
- url_dropdown = gr.Dropdown(label="Clean Video URLs", choices=[], interactive=True)
92
- video_player = gr.Video(label="Video Player")
93
- ia_meta_json = gr.JSON(label="► Raw IA Metadata")
94
- ffprobe_json = gr.JSON(label="FFprobe Metadata")
95
- origins_json = gr.JSON(label=" Source‑Origin Metadata")
 
96
 
97
- def search_and_populate(keywords, api_key, scan_enabled):
98
- urls = fetch_clean_videos(keywords, api_key, scan_enabled)
99
  return gr.update(choices=urls, value=urls[0] if urls else None)
100
 
101
- def update_all(selected_url, ff_on, api_key):
102
- # no selection guard
103
- if not selected_url:
104
- return None, {}, {}, []
105
-
106
- # 1) IA metadata + file list
107
- parts = selected_url.split("/")
108
- identifier = parts[4] if len(parts) > 4 else None
109
- raw_ia = {"identifier": identifier, "metadata": {}, "files": []}
110
- if identifier:
111
- try:
112
- item = get_item(identifier)
113
- raw_ia["metadata"] = item.metadata
114
- raw_ia["files"] = [
115
- {
116
- "name": f.get("name"),
117
- "format": f.get("format"),
118
- "size": f.get("size"),
119
- "md5": f.get("md5"),
120
- **{k: v for k,v in f.items() if k not in ("name","format","size","md5")}
121
- }
122
- for f in item.files
123
- ]
124
- except Exception:
125
- raw_ia["error"] = "could not fetch IA metadata"
126
-
127
- # 2) FFprobe metadata if toggled
128
- ff_md = {}
129
- if ff_on:
130
- try:
131
- ff_md = extract_ffprobe_metadata(selected_url)
132
- except Exception as e:
133
- ff_md = {"error": str(e)}
134
 
135
- # 3) Origin tracing: scrape each URL in description
136
- origins = []
137
- desc = raw_ia["metadata"].get("description", "")
138
- urls_found = re.findall(r'https?://[^\s"<]+', desc)
139
- for url in urls_found:
140
- meta = fetch_page_metadata(url)
141
- origins.append(meta)
142
- # stop at first “real” origin (you can remove this break to collect all)
143
- break
144
-
145
- return selected_url, raw_ia, ff_md, origins
146
 
147
- run_btn.click(
148
- fn=search_and_populate,
149
- inputs=[kw_input, vt_key_input, scan_toggle],
150
- outputs=[url_dropdown]
151
- )
152
- url_dropdown.change(
153
- fn=update_all,
154
- inputs=[url_dropdown, ffprobe_toggle, vt_key_input],
155
- outputs=[video_player, ia_meta_json, ffprobe_json, origins_json]
156
- )
157
 
158
- if __name__ == "__main__":
159
  demo.launch()
 
5
  import subprocess
6
  import json
7
  import re
8
+ import networkx as nx
9
+ from pyvis.network import Network
10
  from bs4 import BeautifulSoup
11
 
12
+ # --- VirusTotal helper ---
13
  def scan_url_vt(url, api_key):
14
  headers = {"x-apikey": api_key}
15
  resp = requests.post(
 
25
  status_resp.raise_for_status()
26
  attr = status_resp.json()["data"]["attributes"]
27
  if attr.get("status") == "completed":
28
+ return attr.get("stats", {}).get("malicious", 0) == 0
 
29
 
30
+ # --- FFprobe ---
31
  def extract_ffprobe_metadata(url_or_path):
32
  cmd = [
33
  "ffprobe", "-v", "error", "-print_format", "json",
 
37
  out = subprocess.check_output(cmd)
38
  return json.loads(out)
39
 
40
# --- Fetch page metadata + favicon ---
def fetch_page_metadata(url):
    """Fetch *url* and extract lightweight page metadata.

    Returns a dict with the url, the <title> text (or None), every
    OpenGraph/Twitter <meta> tag, and a resolved favicon URL (or None).
    On any failure returns {"url": ..., "error": ...} instead of raising,
    so callers can treat scraping as best-effort.
    """
    try:
        resp = requests.get(url, timeout=5)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        meta = {"url": url, "title": soup.title.string if soup.title else None}
        for tag in soup.find_all("meta"):
            prop = tag.get("property") or tag.get("name")
            if prop and prop.startswith(("og:", "twitter:")):
                meta[prop] = tag.get("content")
        # favicon: resolve relative hrefs against the page URL.  Guard a
        # <link rel="icon"> that carries no href -- urljoin(url, None)
        # raises and previously turned the whole result into an error dict.
        icon = soup.find("link", rel=lambda x: x and "icon" in x)
        href = icon.get("href") if icon else None
        meta["favicon"] = requests.compat.urljoin(url, href) if href else None
        return meta
    except Exception as e:
        # network/HTTP/parse problems degrade to an error record, not a crash
        return {"url": url, "error": str(e)}
57
 
58
# --- IA search & filter raw footage ---
# Items whose title mentions a broadcaster are skipped: we want raw
# uploads, not news-station re-edits.
NEWS_STATIONS = ["cnn", "fox", "bbc", "nbc", "al jazeera", "rt "]

def fetch_raw_footage_urls(keywords, api_key, scan_enabled):
    """Search Internet Archive for downloadable video files.

    keywords     -- comma-separated search terms
    api_key      -- VirusTotal API key (may be empty)
    scan_enabled -- when True and api_key is set, drop URLs that VT flags
                    as malicious, or whose scan errors out

    Returns a list of direct archive.org download URLs.
    """
    query = " OR ".join(kw.strip().replace(" ", "+") for kw in keywords.split(","))
    ia_query = f"mediatype:(movies) AND ({query})"
    results = list(search_items(ia_query))[:50]  # cap result set at 50 items
    urls = []
    for res in results:
        item = get_item(res["identifier"])
        # NOTE(review): IA "title" metadata can occasionally be a list --
        # this assumes a string; confirm against real items.
        title = item.metadata.get("title", "").lower()
        if any(ns in title for ns in NEWS_STATIONS):
            continue
        for f in item.files:
            fmt = f.get("format", "").lower()
            if fmt.startswith(("mpeg", "mp4", "avi", "mov", "webm", "m4v")):
                url = f"https://archive.org/download/{res['identifier']}/{f['name']}"
                if scan_enabled and api_key:
                    try:
                        if not scan_url_vt(url, api_key):
                            continue  # VT reports the file as malicious
                    except Exception:
                        # was a bare `except:` -- that also swallowed
                        # SystemExit/KeyboardInterrupt; skip on scan errors
                        continue
                urls.append(url)
    return urls
82
+
83
+ # --- Recursive origin tracing ---
84
+ def trace_origins(description, depth=0, max_depth=3, visited=None):
85
+ if visited is None: visited = set()
86
+ nodes = []
87
+ links = []
88
+ urls = re.findall(r'https?://[^\s"<]+', description)
89
+ for url in urls:
90
+ if url in visited: continue
91
+ visited.add(url)
92
+ meta = fetch_page_metadata(url)
93
+ nodes.append((url, meta))
94
+ if depth < max_depth and 'description' in meta:
95
+ sub_nodes, sub_links = trace_origins(meta.get('description',''), depth+1, max_depth, visited)
96
+ links.extend(sub_links)
97
+ nodes.extend(sub_nodes)
98
+ # link from origin to IA later
99
+ links.append((url, 'internet_archive'))
100
+ return nodes, links
101
+
102
# --- Build graph HTML via pyvis ---
def build_graph(nodes, links):
    """Render (url, metadata) nodes and (src, dst) edges as an
    interactive pyvis network and return its standalone HTML."""
    sentinel = ("internet_archive", {"title": "Internet Archive"})
    net = Network(height="400px", width="100%", directed=True)
    for node_id, meta in nodes + [sentinel]:
        icon = meta.get("favicon")
        net.add_node(
            node_id,
            label=meta.get("title") or node_id,
            title=json.dumps(meta),  # hover tooltip shows the raw metadata
            shape="image" if icon else "dot",
            image=icon if icon else None,
        )
    for src, dst in links:
        net.add_edge(src, dst)
    net.force_atlas_2based()
    return net.generate_html()
114
 
115
# --- Gradio UI ---
with gr.Blocks() as demo:
    gr.Markdown("# IA Drone‑Strike Chain Explorer")
    with gr.Row():
        kw_input = gr.Textbox(label="Search keywords", value="drone strike, military uav")
        vt_key = gr.Textbox(label="VirusTotal API Key", type="password")
        scan_toggle = gr.Checkbox(label="Enable VT scan", value=True)
        ffprobe_toggle = gr.Checkbox(label="Enable FFprobe", value=False)
        run_btn = gr.Button("Search & Scan")

    url_dd = gr.Dropdown(label="Raw Footage URLs", choices=[], interactive=True)
    video = gr.Video(label="Player")
    ia_meta = gr.JSON(label="IA Metadata")
    ff_meta = gr.JSON(label="FFprobe Metadata")
    graph_html = gr.HTML(label="Reupload Chain Graph")
    origin_meta = gr.JSON(label="Clicked Origin Metadata")

    def search_populate(kw, api_key, scan_on):
        """Run the IA search and load the dropdown with the found URLs."""
        urls = fetch_raw_footage_urls(kw, api_key, scan_on)
        return gr.update(choices=urls, value=urls[0] if urls else None)

    def on_select(url, ff_on, api_key):
        """Populate player, IA metadata, ffprobe metadata and the origin
        graph for the selected download URL."""
        if not url:
            return None, {}, {}, "", {}

        # 1) IA metadata: .../download/<identifier>/<file> -> identifier.
        #    Guard the index and the fetch -- a malformed URL or an IA
        #    outage previously crashed the whole handler.
        parts = url.split("/")
        ident = parts[4] if len(parts) > 4 else None
        raw = {"metadata": {}, "files": []}
        if ident:
            try:
                item = get_item(ident)
                raw = {"metadata": item.metadata,
                       "files": [dict(f) for f in item.files]}
            except Exception:
                raw["error"] = "could not fetch IA metadata"

        # 2) ffprobe only when toggled; report failures instead of raising
        try:
            ff = extract_ffprobe_metadata(url) if ff_on else {}
        except Exception as e:
            ff = {"error": str(e)}

        # 3) Origin tracing + graph.  Keep the edge list produced by
        #    trace_origins (the old code threw it away and rebuilt a flat
        #    star, losing any recursive structure) and add the IA node.
        nodes, links = trace_origins(raw["metadata"].get("description", ""))
        nodes.append(("internet_archive", {"title": "Internet Archive"}))
        html = build_graph(nodes, links)
        return url, raw, ff, html, {}

    def on_click_node(node_id):
        """Metadata for a clicked graph node (simplistic: re-fetch the page)."""
        if node_id == "internet_archive":
            return {"title": "Internet Archive"}
        return fetch_page_metadata(node_id)

    run_btn.click(search_populate, [kw_input, vt_key, scan_toggle], [url_dd])
    url_dd.change(on_select, [url_dd, ffprobe_toggle, vt_key],
                  [video, ia_meta, ff_meta, graph_html, origin_meta])
    # NOTE(review): gr.HTML does not expose a .click event in every Gradio
    # release -- confirm against the pinned gradio version.
    graph_html.click(on_click_node, None, origin_meta)

if __name__ == "__main__":
    demo.launch()