Adding skeleton highlights creator
- README.md +9 -0
- app.py +27 -0
- data/README.md +1 -0
- requirements.txt +12 -0
- src/embedder.py +28 -0
- src/event_card.py +7 -0
- src/labeler.py +35 -0
- src/pinecone_store.py +29 -0
- src/pipeline.py +63 -0
- src/segmenter.py +49 -0
- src/transcriber.py +6 -0
- src/utils.py +56 -0
README.md
CHANGED
@@ -10,4 +10,13 @@ pinned: false
 short_description: Soccer Word-Based-Search Football Highlights Generator
 ---
 
+This is the main app module for a word-search-based football highlights extractor.
+
+The flow starts with a Gradio web app for uploading a video.
+The video is then labelled using a variety of inputs plus a labeller LLM hosted on Together AI.
+The app then calls a model Space hosted on Hugging Face and embeds the video and the labels with InternVL.
+The indexed highlights can then be queried through the Gradio web app's search UI.
+
+I get by with a little help from my friends
+
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
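For reference, the game card pasted into the UI can be any JSON the labeller might find useful; parse_game_card (src/event_card.py below) falls back to wrapping plain text in a description field. A hypothetical example, with illustrative field names rather than a required schema:

{
  "home_team": "Arsenal",
  "away_team": "Chelsea",
  "competition": "Premier League",
  "kickoff": "2024-04-23T20:00:00Z"
}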
app.py
ADDED
@@ -0,0 +1,27 @@
import gradio as gr
import json
import shutil
from src.pipeline import run_pipeline, search_highlights

def extract(video_file, game_card_str):
    # gr.File yields a temp file (a path string or an object with .name,
    # depending on the Gradio version); copy it to a stable location.
    src_path = video_file if isinstance(video_file, str) else video_file.name
    shutil.copy(src_path, "uploaded.mp4")
    result = run_pipeline("uploaded.mp4", game_card_str)
    return json.dumps(result, indent=2)

def search(query):
    return "\n".join(search_highlights(query))

with gr.Blocks() as demo:
    gr.Markdown("# Soccer Highlight Extractor")

    with gr.Tab("Extract Highlights"):
        video = gr.File(label="Upload Video")
        game_card = gr.Textbox(label="Paste Game Card (JSON)", lines=10)
        result = gr.Textbox(label="Pipeline Output")
        gr.Button("Run Extraction").click(extract, [video, game_card], result)

    with gr.Tab("Search Highlights"):
        query = gr.Textbox(label="Search Query")
        output = gr.Textbox(label="Search Results")
        gr.Button("Search").click(search, query, output)

demo.launch()
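Running the app locally requires the same secrets the Space injects as environment variables. A minimal bootstrap sketch, assuming a local .env file (python-dotenv is already in requirements.txt):

# Hypothetical local bootstrap: load secrets before the pipeline modules read them.
from dotenv import load_dotenv
load_dotenv()  # expects TOGETHER_API_KEY, PINECONE_API_KEY, PINECONE_ENV, ROBOFLOW_API_KEY, SPACE_URL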
data/README.md
ADDED
@@ -0,0 +1 @@
Placeholder to keep the data/ folder in version control.
requirements.txt
ADDED
@@ -0,0 +1,12 @@
gradio
openai-whisper   # the PyPI package "whisper" is an unrelated project; the code imports OpenAI Whisper
roboflow
opencv-python    # cv2 is imported by src/segmenter.py and src/utils.py
requests
pinecone-client
ffmpeg-python
transformers
torch
decord
numpy
python-dotenv
together
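pip alone does not provide the ffmpeg binary, which both Whisper and clip_video_segment shell out to. On a Hugging Face Space the usual way to get it (an assumption about the deployment target) is an apt dependency list in packages.txt:

ffmpeg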
src/embedder.py
ADDED
@@ -0,0 +1,28 @@
import torch
import numpy as np
from transformers import AutoProcessor, AutoModel
import decord

class InternVLEmbedder:
    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # InternVL checkpoints ship custom modeling code, so trust_remote_code is required.
        self.model = AutoModel.from_pretrained(
            "OpenGVLab/InternVL2_5-8B-MPO", trust_remote_code=True
        ).to(self.device)
        self.processor = AutoProcessor.from_pretrained(
            "OpenGVLab/InternVL2_5-8B-MPO", trust_remote_code=True
        )

    def embed_video(self, video_path):
        # Sample 8 frames evenly across the clip.
        vr = decord.VideoReader(video_path)
        indices = np.linspace(0, len(vr) - 1, 8).astype(int)
        frames = np.stack([vr[i].asnumpy() for i in indices])
        # (frames, H, W, C) -> (1, frames, C, H, W), scaled to [0, 1].
        # Note: the real InternVL preprocessing may differ; this is a skeleton.
        tensor = torch.tensor(frames).permute(0, 3, 1, 2).unsqueeze(0).float().div(255).to(self.device)

        # Assumes the checkpoint exposes a CLIP-style video feature head.
        with torch.no_grad():
            video_vector = self.model.get_video_features(tensor).squeeze(0).cpu().numpy()

        return video_vector / np.linalg.norm(video_vector)

    def embed_text(self, text):
        inputs = self.processor(text=[text], return_tensors="pt").to(self.device)

        # Assumes a matching CLIP-style text feature head.
        with torch.no_grad():
            text_vector = self.model.get_text_features(**inputs).squeeze(0).cpu().numpy()

        return text_vector / np.linalg.norm(text_vector)
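A minimal usage sketch for the embedder, assuming the checkpoint really does expose those CLIP-style feature heads (the clip path and query text are hypothetical):

from src.embedder import InternVLEmbedder

# Both vectors are unit-normalized, so a dot product gives cosine similarity.
embedder = InternVLEmbedder()
clip_vec = embedder.embed_video("/data/clip_event-0.mp4")
text_vec = embedder.embed_text("late winner from a corner kick")
similarity = float(clip_vec @ text_vec)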
src/event_card.py
ADDED
@@ -0,0 +1,7 @@
import json

def parse_game_card(game_card_str):
    # Accept structured JSON, or fall back to treating the input as a plain description.
    try:
        return json.loads(game_card_str)
    except json.JSONDecodeError:
        return {"description": game_card_str}
src/labeler.py
ADDED
@@ -0,0 +1,35 @@
import os
from together import Together

class TogetherLLMLabeler:
    def __init__(self):
        self.client = Together(api_key=os.getenv("TOGETHER_API_KEY"))

    def generate_label(self, game_card, transcript, spatial_context, frame_urls):
        prompt = f"""
Game Information:
{game_card}

Commentary:
{transcript}

Spatial Context (object detections per frame):
{spatial_context}

Instructions:
- Summarize this event in factual soccer terminology.
- Focus on the play's significance to the score.
- Avoid exaggeration.
"""

        images = [{"type": "image_url", "image_url": {"url": url}} for url in frame_urls]

        content = [{"type": "text", "text": prompt}] + images

        # The Together SDK nests this as client.chat.completions.create.
        response = self.client.chat.completions.create(
            model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
            messages=[{"role": "user", "content": content}],
            max_tokens=200
        )

        return response.choices[0].message.content.strip()
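An illustrative call, with stand-in values of the shapes the pipeline produces (teams, commentary, detections, and URL are all hypothetical):

from src.labeler import TogetherLLMLabeler

labeler = TogetherLLMLabeler()
label = labeler.generate_label(
    game_card={"home_team": "Arsenal", "away_team": "Chelsea"},
    transcript="And it's in! A brilliant strike from the edge of the box!",
    spatial_context=[{"frame": 1042, "objects": [{"class": "ball"}, {"class": "goal"}]}],
    frame_urls=["https://example.com/frame_event-0_0.jpg"],
)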
src/pinecone_store.py
ADDED
@@ -0,0 +1,29 @@
import pinecone
import os

class PineconeStore:
    def __init__(self):
        # Uses the v2 pinecone-client API (pinecone.init); v3+ replaced this with a Pinecone class.
        api_key = os.getenv("PINECONE_API_KEY")
        environment = os.getenv("PINECONE_ENV")
        pinecone.init(api_key=api_key, environment=environment)

        self.index_name = "soccer-highlights"
        if self.index_name not in pinecone.list_indexes():
            pinecone.create_index(
                name=self.index_name,
                dimension=1024,  # must match the embedding model's output dimension
                metric="cosine"  # choose the appropriate metric (e.g., cosine, euclidean)
            )
        self.index = pinecone.Index(self.index_name)

    def upsert(self, id, vector, metadata):
        self.index.upsert([(id, vector.tolist(), metadata)])

    def query(self, vector, filter_key, top_k):
        # Filtering by id substring is a stopgap; a metadata filter would be cleaner.
        return [
            m["metadata"] for m in self.index.query(
                vector=vector.tolist(),
                top_k=top_k,
                include_metadata=True
            )["matches"] if filter_key in m["id"]
        ]
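A round-trip sketch against the store; the random vector stands in for a real embedding:

import numpy as np
from src.pinecone_store import PineconeStore

store = PineconeStore()
vec = np.random.rand(1024).astype("float32")
vec /= np.linalg.norm(vec)  # unit-normalize, matching the embedder's output
store.upsert("event-0-text", vec, {"start_sec": 12.0, "end_sec": 19.5, "label": "goal"})
print(store.query(vec, filter_key="text", top_k=5))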
src/pipeline.py
ADDED
@@ -0,0 +1,63 @@
from src.segmenter import detect_event_segments
from src.transcriber import transcribe_video
from src.event_card import parse_game_card
from src.labeler import TogetherLLMLabeler
from src.embedder import InternVLEmbedder
from src.pinecone_store import PineconeStore
from src.utils import (
    extract_key_frames, save_frames_locally,
    generate_frame_urls, match_transcript_to_events,
    clip_video_segment
)

labeler = TogetherLLMLabeler()
embedder = InternVLEmbedder()
pinecone = PineconeStore()

def run_pipeline(video_path, game_card_str):
    game_card = parse_game_card(game_card_str)
    transcript = transcribe_video(video_path)
    events = detect_event_segments(video_path)

    matched_events = match_transcript_to_events(events, transcript)

    results = []

    for idx, event in enumerate(matched_events):
        event_id = f"event-{idx}"

        # Grab representative frames for the vision labeler.
        frames = extract_key_frames(video_path, event['start_sec'], event['end_sec'])
        frame_paths = save_frames_locally(frames, event_id)
        frame_urls = generate_frame_urls(frame_paths)

        label = labeler.generate_label(
            game_card=game_card,
            transcript=event['transcript'],
            spatial_context=event['frames'],
            frame_urls=frame_urls
        )

        clip_path = clip_video_segment(video_path, event['start_sec'], event['end_sec'], event_id)

        # Index the clip twice: once by video embedding, once by label embedding.
        video_vector = embedder.embed_video(clip_path)
        text_vector = embedder.embed_text(label)

        metadata = {
            "start_sec": event['start_sec'],
            "end_sec": event['end_sec'],
            "label": label
        }

        pinecone.upsert(f"{event_id}-video", video_vector, metadata)
        pinecone.upsert(f"{event_id}-text", text_vector, metadata)

        results.append(metadata)

    return {"events": results}

def search_highlights(query, top_k=5):
    query_vector = embedder.embed_text(query)
    results = pinecone.query(query_vector, filter_key="text", top_k=top_k)
    return [
        f"{r['label']} ({r['start_sec']}s - {r['end_sec']}s)" for r in results
    ]
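run_pipeline returns one metadata dict per indexed event; an illustrative result, with all values hypothetical:

{
  "events": [
    {"start_sec": 754.3, "end_sec": 761.8, "label": "Header from a corner levels the score at 1-1."}
  ]
}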
src/segmenter.py
ADDED
@@ -0,0 +1,49 @@
import cv2
import os
from roboflow import Roboflow

# When the ball is no longer detected, we close the active segment.

def detect_event_segments(video_path, confidence=0.4):
    rf = Roboflow(api_key=os.getenv("ROBOFLOW_API_KEY"))
    project = rf.workspace().project("soccer-event-detection")
    model = project.version(1).model

    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)

    events = []
    active_event = None
    frame_data = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame_number = int(cap.get(cv2.CAP_PROP_POS_FRAMES))
        # Hosted inference on every frame is slow; see the sampling sketch below.
        detections = model.predict(frame, confidence=confidence).json().get('predictions', [])
        frame_data.append({"frame": frame_number, "objects": detections})

        ball_detected = any(obj['class'] == 'ball' for obj in detections)
        goal_area_activity = any(obj['class'] == 'goal' for obj in detections) and ball_detected

        # Open a segment when the ball appears in the goal area...
        if goal_area_activity and active_event is None:
            active_event = {"start_frame": frame_number, "frames": []}

        if active_event:
            active_event["frames"].append(frame_data[-1])

        # ...and close it as soon as the ball drops out of view.
        if active_event and not ball_detected:
            active_event["end_frame"] = frame_number
            events.append(active_event)
            active_event = None

    # Close a segment still open at end of video so the timestamp pass below doesn't fail.
    if active_event:
        active_event["end_frame"] = frame_number
        events.append(active_event)

    cap.release()

    # Convert frame indices to timestamps.
    for event in events:
        event['start_sec'] = event['start_frame'] / fps
        event['end_sec'] = event['end_frame'] / fps

    return events
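Running hosted detection on every frame is the slowest part of this skeleton. A common mitigation (not implemented here) is to detect only on every Nth frame; a sketch:

import cv2

def sample_frames(cap, stride=15):
    # Yield (frame_number, frame) for every `stride`-th frame,
    # e.g. twice per second on 30 fps footage.
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frame_number = int(cap.get(cv2.CAP_PROP_POS_FRAMES))
        if frame_number % stride == 0:
            yield frame_number, frame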
src/transcriber.py
ADDED
@@ -0,0 +1,6 @@
import whisper

def transcribe_video(video_path, model_size="large"):
    # Whisper extracts the audio track itself (via ffmpeg), so a video path works directly.
    model = whisper.load_model(model_size)
    result = model.transcribe(video_path)
    return [
        {"start_sec": seg["start"], "end_sec": seg["end"], "text": seg["text"]}
        for seg in result["segments"]
    ]
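The "large" checkpoint needs roughly 10 GB of VRAM; smaller checkpoints trade accuracy for speed on CPU-only hardware (an assumption about the deployment, not a measurement from this repo):

from src.transcriber import transcribe_video

segments = transcribe_video("uploaded.mp4", model_size="base")
print(segments[0])  # {"start_sec": ..., "end_sec": ..., "text": ...}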
src/utils.py
ADDED
@@ -0,0 +1,56 @@
import cv2
import os
import subprocess

def extract_key_frames(video_path, start_sec, end_sec):
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)

    start_frame = int(start_sec * fps)
    end_frame = int(end_sec * fps)
    mid_frame = (start_frame + end_frame) // 2

    frames = []
    for frame_number in [start_frame, mid_frame, end_frame]:
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
        ret, frame = cap.read()
        if ret:
            frames.append(frame)

    cap.release()
    return frames

def save_frames_locally(frames, event_id):
    os.makedirs("/data", exist_ok=True)
    frame_paths = []
    for idx, frame in enumerate(frames):
        path = f"/data/frame_{event_id}_{idx}.jpg"
        cv2.imwrite(path, frame)
        frame_paths.append(path)
    return frame_paths

def generate_frame_urls(frame_paths):
    base_url = os.getenv("SPACE_URL", "http://localhost:8000")
    return [f"{base_url}/data/{os.path.basename(path)}" for path in frame_paths]

def match_transcript_to_events(events, transcript):
    for event in events:
        # Keep any transcript line whose time span overlaps the event's span.
        matched_lines = [
            line["text"] for line in transcript
            if line["start_sec"] <= event["end_sec"] and line["end_sec"] >= event["start_sec"]
        ]
        event["transcript"] = "\n".join(matched_lines) or "(No matching commentary)"
    return events

def clip_video_segment(video_path, start_sec, end_sec, event_id):
    output_path = f"/data/clip_{event_id}.mp4"
    duration = end_sec - start_sec

    command = [
        "ffmpeg", "-y",
        "-ss", str(start_sec),
        "-i", video_path,
        "-t", str(duration),
        "-c", "copy", output_path
    ]
    # subprocess.run avoids the shell-quoting pitfalls of os.system(" ".join(...)).
    subprocess.run(command, check=True)
    return output_path
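One caveat on clip_video_segment: with -c copy, ffmpeg can only cut on keyframes, so clip boundaries may drift by a second or two. A frame-accurate (but slower) variant re-encodes instead:

command = [
    "ffmpeg", "-y",
    "-ss", str(start_sec),
    "-i", video_path,
    "-t", str(duration),
    "-c:v", "libx264", "-c:a", "aac",  # re-encode for frame-accurate cuts
    output_path
]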