amezi committed
Commit 58ac08a
1 Parent(s): 5c52a79

Adding skeleton highlights creator

README.md CHANGED
@@ -10,4 +10,13 @@ pinned: false
  short_description: Soccer Word-Based-Search Football Highlights Generator
  ---

+ This is the main app module for a word-search-based football highlights extractor.
+
+ It starts with a Gradio web app for uploading a video.
+ The video is then labelled using a combination of inputs and a labeller LLM hosted on Together AI.
+ The app then calls a model Space hosted on Hugging Face and embeds the video and the labels with InternVL.
+ Then, through the Gradio web app's search UI, we can query the highlights.
+
+ I get by with a little help from my friends
+
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
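Setup note: the modules added below read their credentials from environment variables. A minimal startup check, as a sketch; the variable names are the ones used in this commit, and SPACE_URL is optional (src/utils.py falls back to http://localhost:8000):

```python
import os

# Required by src/labeler.py, src/pinecone_store.py and src/segmenter.py
REQUIRED = ["TOGETHER_API_KEY", "PINECONE_API_KEY", "PINECONE_ENV", "ROBOFLOW_API_KEY"]

missing = [name for name in REQUIRED if not os.getenv(name)]
if missing:
    raise RuntimeError(f"Missing environment variables: {', '.join(missing)}")

# Optional: base URL used to build frame links in src/utils.py
print("SPACE_URL =", os.getenv("SPACE_URL", "http://localhost:8000"))
```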
app.py ADDED
@@ -0,0 +1,29 @@
+ import gradio as gr
+ import json
+ import shutil
+ from src.pipeline import run_pipeline, search_highlights
+
+ def extract(video_file, game_card_str):
+     # gr.File hands the upload over as a temporary file path (Gradio's default type="filepath"); copy it to a stable location
+     shutil.copy(video_file, "uploaded.mp4")
+     result = run_pipeline("uploaded.mp4", game_card_str)
+     return json.dumps(result, indent=2)
+
+ def search(query):
+     return "\n".join(search_highlights(query))
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# Soccer Highlight Extractor")
+
+     with gr.Tab("Extract Highlights"):
+         video = gr.File(label="Upload Video")
+         game_card = gr.Textbox(label="Paste Game Card (JSON)", lines=10)
+         result = gr.Textbox(label="Pipeline Output")
+         gr.Button("Run Extraction").click(extract, [video, game_card], result)
+
+     with gr.Tab("Search Highlights"):
+         query = gr.Textbox(label="Search Query")
+         output = gr.Textbox(label="Search Results")
+         gr.Button("Search").click(search, query, output)
+
+ demo.launch()
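For reference, the Pipeline Output textbox holds the JSON returned by run_pipeline, serialised by extract above. A sketch of the shape; the label text and timestamps are made up:

```python
# Illustrative only: one entry per detected event, mirroring the metadata dict built in src/pipeline.py
example_output = {
    "events": [
        {
            "start_sec": 312.4,  # hypothetical values
            "end_sec": 327.9,
            "label": "Shot on target saved by the goalkeeper, keeping the score level."
        }
    ]
}
```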
data/README.md ADDED
@@ -0,0 +1 @@
+ Placeholder so the save folder is kept in the repo
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ gradio
+ openai-whisper
+ roboflow
+ requests
+ pinecone-client<3.0
+ ffmpeg-python
+ transformers
+ torch
+ decord
+ numpy
+ python-dotenv
+ together
src/embedder.py ADDED
@@ -0,0 +1,33 @@
+ import torch
+ import numpy as np
+ from transformers import AutoProcessor, AutoModel
+ import decord
+
+ class InternVLEmbedder:
+     def __init__(self):
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         # InternVL checkpoints ship custom modelling code, so trust_remote_code is required
+         self.model = AutoModel.from_pretrained("OpenGVLab/InternVL2_5-8B-MPO", trust_remote_code=True).to(self.device)
+         self.processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL2_5-8B-MPO", trust_remote_code=True)
+
+     # NOTE: skeleton assumption: the checkpoint is expected to expose CLIP-style
+     # get_video_features / get_text_features helpers for joint video-text embeddings.
+     def embed_video(self, video_path):
+         # Sample 8 evenly spaced frames across the clip
+         vr = decord.VideoReader(video_path)
+         frames = np.stack([vr[i].asnumpy() for i in np.linspace(0, len(vr) - 1, 8).astype(int)])
+         tensor = torch.tensor(frames).permute(0, 3, 1, 2).unsqueeze(0).to(self.device)
+
+         with torch.no_grad():
+             video_vector = self.model.get_video_features(tensor).squeeze(0).cpu().numpy()
+
+         # L2-normalise so cosine similarity reduces to a dot product
+         return video_vector / np.linalg.norm(video_vector)
+
+     def embed_text(self, text):
+         inputs = self.processor(text=[text], return_tensors="pt").to(self.device)
+
+         with torch.no_grad():
+             text_vector = self.model.get_text_features(**inputs).squeeze(0).cpu().numpy()
+
+         return text_vector / np.linalg.norm(text_vector)
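A usage sketch for the embedder above, assuming the checkpoint loads and exposes the feature helpers; the clip path mirrors the naming used in src/utils.py and the query text is made up. Because both vectors are L2-normalised, cosine similarity is just a dot product:

```python
import numpy as np
from src.embedder import InternVLEmbedder

embedder = InternVLEmbedder()

video_vec = embedder.embed_video("/data/clip_event-0.mp4")   # path format from clip_video_segment
text_vec = embedder.embed_text("late header from a corner")  # hypothetical query

# Both vectors have unit length, so the dot product equals cosine similarity
print(float(np.dot(video_vec, text_vec)))
```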
src/event_card.py ADDED
@@ -0,0 +1,7 @@
+ import json
+
+ def parse_game_card(game_card_str):
+     try:
+         return json.loads(game_card_str)
+     except json.JSONDecodeError:
+         return {"description": game_card_str}
src/labeler.py ADDED
@@ -0,0 +1,35 @@
+ import os
+ from together import Together
+
+ class TogetherLLMLabeler:
+     def __init__(self):
+         self.client = Together(api_key=os.getenv("TOGETHER_API_KEY"))
+
+     def generate_label(self, game_card, transcript, spatial_context, frame_urls):
+         prompt = f"""
+         Game Information:
+         {game_card}
+
+         Commentary:
+         {transcript}
+
+         Spatial Context (object detections per frame):
+         {spatial_context}
+
+         Instructions:
+         - Summarize this event in factual soccer terminology.
+         - Focus on the play's significance to the score.
+         - Avoid exaggeration.
+         """
+
+         images = [{"type": "image_url", "image_url": {"url": url}} for url in frame_urls]
+
+         content = [{"type": "text", "text": prompt}] + images
+
+         response = self.client.chat.completions.create(
+             model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
+             messages=[{"role": "user", "content": content}],
+             max_tokens=200
+         )
+
+         return response.choices[0].message.content.strip()
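A minimal call sketch for the labeler, assuming TOGETHER_API_KEY is set; all argument values here are made up, and in the pipeline they come from the segmenter, transcriber and utils modules:

```python
from src.labeler import TogetherLLMLabeler

labeler = TogetherLLMLabeler()

label = labeler.generate_label(
    game_card={"home": "Arsenal", "away": "Chelsea"},  # hypothetical game card
    transcript="And it's in! A brilliant strike from the edge of the box!",
    spatial_context=[{"frame": 4512, "objects": [{"class": "ball"}, {"class": "goal"}]}],
    frame_urls=["https://example.com/data/frame_event-0_0.jpg"]  # placeholder URL
)
print(label)
```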
src/pinecone_store.py ADDED
@@ -0,0 +1,32 @@
+ import pinecone
+ import os
+
+ class PineconeStore:
+     # Uses the legacy pinecone-client v2 API (pinecone.init / pinecone.Index);
+     # requirements.txt pins pinecone-client<3.0 to match.
+     def __init__(self):
+         api_key = os.getenv("PINECONE_API_KEY")
+         environment = os.getenv("PINECONE_ENV")
+         pinecone.init(api_key=api_key, environment=environment)
+
+         self.index_name = "soccer-highlights"
+         if self.index_name not in pinecone.list_indexes():
+             pinecone.create_index(
+                 name=self.index_name,
+                 dimension=1024,  # must match the embedding model's output dimension
+                 metric="cosine"  # cosine pairs well with the L2-normalised embeddings
+             )
+         self.index = pinecone.Index(self.index_name)
+
+     def upsert(self, id, vector, metadata):
+         self.index.upsert([(id, vector.tolist(), metadata)])
+
+     def query(self, vector, filter_key, top_k):
+         # Keep only matches whose id contains filter_key ("text" or "video")
+         return [
+             m["metadata"] for m in self.index.query(
+                 vector=vector.tolist(),
+                 top_k=top_k,
+                 include_metadata=True
+             )["matches"] if filter_key in m["id"]
+         ]
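A sketch of how the store is used elsewhere in this commit, assuming the index exists and the API keys are set; ids are suffixed "-video" or "-text" so filter_key can select one modality, and the random vector stands in for an embedder output:

```python
import numpy as np
from src.pinecone_store import PineconeStore

store = PineconeStore()

vec = np.random.rand(1024).astype("float32")  # stand-in for an embedder output (dimension matches the index)
vec /= np.linalg.norm(vec)

store.upsert("event-0-text", vec, {"start_sec": 10.0, "end_sec": 25.0, "label": "Corner cleared"})

# Only matches whose id contains "text" are returned, mirroring search_highlights in src/pipeline.py
print(store.query(vec, filter_key="text", top_k=5))
```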
src/pipeline.py ADDED
@@ -0,0 +1,63 @@
+ from src.segmenter import detect_event_segments
+ from src.transcriber import transcribe_video
+ from src.event_card import parse_game_card
+ from src.labeler import TogetherLLMLabeler
+ from src.embedder import InternVLEmbedder
+ from src.pinecone_store import PineconeStore
+ from src.utils import (
+     extract_key_frames, save_frames_locally,
+     generate_frame_urls, match_transcript_to_events,
+     clip_video_segment
+ )
+
+ labeler = TogetherLLMLabeler()
+ embedder = InternVLEmbedder()
+ store = PineconeStore()
+
+ def run_pipeline(video_path, game_card_str):
+     game_card = parse_game_card(game_card_str)
+     transcript = transcribe_video(video_path)
+     events = detect_event_segments(video_path)
+
+     matched_events = match_transcript_to_events(events, transcript)
+
+     results = []
+
+     for idx, event in enumerate(matched_events):
+         event_id = f"event-{idx}"
+
+         frames = extract_key_frames(video_path, event['start_sec'], event['end_sec'])
+         frame_paths = save_frames_locally(frames, event_id)
+         frame_urls = generate_frame_urls(frame_paths)
+
+         label = labeler.generate_label(
+             game_card=game_card,
+             transcript=event['transcript'],
+             spatial_context=event['frames'],
+             frame_urls=frame_urls
+         )
+
+         clip_path = clip_video_segment(video_path, event['start_sec'], event['end_sec'], event_id)
+
+         video_vector = embedder.embed_video(clip_path)
+         text_vector = embedder.embed_text(label)
+
+         metadata = {
+             "start_sec": event['start_sec'],
+             "end_sec": event['end_sec'],
+             "label": label
+         }
+
+         store.upsert(f"{event_id}-video", video_vector, metadata)
+         store.upsert(f"{event_id}-text", text_vector, metadata)
+
+         results.append(metadata)
+
+     return {"events": results}
+
+ def search_highlights(query, top_k=5):
+     query_vector = embedder.embed_text(query)
+     results = store.query(query_vector, filter_key="text", top_k=top_k)
+     return [
+         f"{r['label']} ({r['start_sec']}s - {r['end_sec']}s)" for r in results
+     ]
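A hedged local smoke test of the two entry points above, assuming the required API keys are set; the clip filename and game card are placeholders:

```python
from src.pipeline import run_pipeline, search_highlights

# Index the highlights from a short test clip (placeholder filename and game card)
result = run_pipeline("sample.mp4", '{"home": "Arsenal", "away": "Chelsea"}')
print(result["events"])

# Query the text-side vectors that were just upserted
for line in search_highlights("goal from a corner", top_k=3):
    print(line)
```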
src/segmenter.py ADDED
@@ -0,0 +1,50 @@
+ import cv2
+ import os
+ from roboflow import Roboflow
+
+ # When the ball is no longer detected, the active segment is closed
+
+ def detect_event_segments(video_path, confidence=40):  # Roboflow confidence is a percentage (0-100)
+     rf = Roboflow(api_key=os.getenv("ROBOFLOW_API_KEY"))
+     project = rf.workspace().project("soccer-event-detection")
+     model = project.version(1).model
+
+     cap = cv2.VideoCapture(video_path)
+     fps = cap.get(cv2.CAP_PROP_FPS)
+
+     events = []
+     active_event = None
+     frame_data = []
+
+     while cap.isOpened():
+         ret, frame = cap.read()
+         if not ret:
+             break
+
+         frame_number = int(cap.get(cv2.CAP_PROP_POS_FRAMES))
+         # Hosted inference runs once per frame, so this is slow on full-length matches
+         detections = model.predict(frame, confidence=confidence).json().get('predictions', [])
+         frame_data.append({"frame": frame_number, "objects": detections})
+
+         ball_detected = any(obj['class'] == 'ball' for obj in detections)
+         goal_area_activity = any(obj['class'] == 'goal' for obj in detections) and ball_detected
+
+         if goal_area_activity and active_event is None:
+             active_event = {"start_frame": frame_number, "frames": []}
+
+         if active_event:
+             active_event["frames"].append(frame_data[-1])
+
+         if active_event and not ball_detected:
+             active_event["end_frame"] = frame_number
+             events.append(active_event)
+             active_event = None
+
+     cap.release()
+
+     # Convert frame indices to timestamps
+     for event in events:
+         event['start_sec'] = event['start_frame'] / fps
+         event['end_sec'] = event['end_frame'] / fps
+
+     return events
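For reference, each event returned by detect_event_segments has the following shape; the numbers below are illustrative:

```python
# Illustrative event structure produced by detect_event_segments
example_event = {
    "start_frame": 4500,
    "end_frame": 4620,
    "start_sec": 180.0,  # start_frame / fps
    "end_sec": 184.8,    # end_frame / fps
    "frames": [          # per-frame Roboflow detections collected while the event was active
        {"frame": 4500, "objects": [{"class": "ball"}, {"class": "goal"}]},
    ],
}
```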
src/transcriber.py ADDED
@@ -0,0 +1,6 @@
+ import whisper
+
+ def transcribe_video(video_path, model_size="large"):
+     model = whisper.load_model(model_size)
+     result = model.transcribe(video_path)
+     return [{"start_sec": seg["start"], "end_sec": seg["end"], "text": seg["text"]} for seg in result["segments"]]
src/utils.py ADDED
@@ -0,0 +1,59 @@
+ import cv2
+ import os
+ import subprocess
+
+ def extract_key_frames(video_path, start_sec, end_sec):
+     cap = cv2.VideoCapture(video_path)
+     fps = cap.get(cv2.CAP_PROP_FPS)
+
+     start_frame = int(start_sec * fps)
+     end_frame = int(end_sec * fps)
+     mid_frame = (start_frame + end_frame) // 2
+
+     frames = []
+     for frame_number in [start_frame, mid_frame, end_frame]:
+         cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
+         ret, frame = cap.read()
+         if ret:
+             frames.append(frame)
+
+     cap.release()
+     return frames
+
+ def save_frames_locally(frames, event_id):
+     os.makedirs("/data", exist_ok=True)
+     frame_paths = []
+     for idx, frame in enumerate(frames):
+         path = f"/data/frame_{event_id}_{idx}.jpg"
+         cv2.imwrite(path, frame)
+         frame_paths.append(path)
+     return frame_paths
+
+ def generate_frame_urls(frame_paths):
+     # Assumes the files under /data are publicly reachable at SPACE_URL/data/...
+     base_url = os.getenv("SPACE_URL", "http://localhost:8000")
+     return [f"{base_url}/data/{os.path.basename(path)}" for path in frame_paths]
+
+ def match_transcript_to_events(events, transcript):
+     for event in events:
+         matched_lines = [
+             line["text"] for line in transcript
+             if line["start_sec"] <= event["end_sec"] and line["end_sec"] >= event["start_sec"]
+         ]
+         event["transcript"] = "\n".join(matched_lines) or "(No matching commentary)"
+     return events
+
+ def clip_video_segment(video_path, start_sec, end_sec, event_id):
+     output_path = f"/data/clip_{event_id}.mp4"
+     duration = end_sec - start_sec
+
+     command = [
+         "ffmpeg", "-y",
+         "-ss", str(start_sec),
+         "-i", video_path,
+         "-t", str(duration),
+         "-c", "copy", output_path
+     ]
+     # subprocess avoids shell quoting issues with paths containing spaces
+     subprocess.run(command, check=True)
+     return output_path