Added support for video processing via link
- README.md +76 -123
- analyzer.py +9 -0
- detector.py +129 -0
- downloader_manager.py +50 -0
- flie_processor.py +23 -0
- handler.py +23 -11
- imgcomparison.py +106 -0
- mediaoutput.py +196 -0
- requirements.txt +0 -0
- slides.py +177 -0
- sorter.py +109 -0
- sources.py +32 -0
- timeline.py +166 -0
README.md
CHANGED
@@ -1,140 +1,93 @@
The old model card's YAML front matter and body were removed. The front matter listed the supported languages:

    es, ru, ko, fr, ja, pt, tr, pl, ca, nl, ar, sv, it, id, hi, fi, vi, he,
    uk, el, ms, cs, ro, da, hu, ta, 'no', th, ur, hr, bg, lt, la, mi, ml, cy,
    sk, te, fa, lv, bn, sr, az, sl, kn, et, mk, br, eu, is, hy, ne, mn, bs,
    kk, sq, sw, gl, mr, pa, si, km, sn, yo, so, af, oc, ka, be, tg, sd, gu,
    am, yi, lo, uz, fo, ht, ps, tk, nn, mt, sa, lb, my, bo, tl, mg, as, tt,
    haw, ln, ha, ba, jw, su

along with:

    tags:
    - audio
    - automatic-speech-recognition
    license: mit
    library_name: ctranslate2
    ---

The removed body ended with:

**For more information about the original model, see its [model card](https://huggingface.co/openai/whisper-large-v2).**
# Advanced Speech Processing with faster-whisper

Welcome to the advanced speech processing utility leveraging the powerful Whisper large-v2 model on the CTranslate2 framework. This tool is designed for high-performance speech recognition and processing, supporting a wide array of languages and capable of handling video inputs for slide detection and audio transcription.

## Features

- **Language Support**: Extensive language support covering major global languages for speech recognition tasks.
- **Video Processing**: Download MP4 files from links and extract audio content for transcription.
- **Slide Detection**: Detect and sort presentation slides from video lectures or meetings.
- **Audio Transcription**: Leverage the Whisper large-v2 model to transcribe audio content with high accuracy.

## Getting Started

To begin using this utility, set up the `WhisperModel` from the `faster_whisper` package with the provided language configurations. The `EndpointHandler` class is your main interface for processing the data.

### Example Usage

```python
import requests
import os

# Sample data dict with the link to the video file and the desired language for transcription
DATA = {
    "inputs": "<base64_encoded_audio_string>",
    "link": "<your_mp4_video_link>",
    "language": "en",  # Choose from the supported languages
    "task": "transcribe",
    "type": "audio"  # Use "link" for video files
}

HF_ACCESS_TOKEN = os.environ.get("HF_TRANSCRIPTION_ACCESS_TOKEN")
API_URL = os.environ.get("HF_TRANSCRIPTION_ENDPOINT")

HEADERS = {
    "Authorization": HF_ACCESS_TOKEN,
    "Content-Type": "application/json"
}

response = requests.post(API_URL, headers=HEADERS, json=DATA)
print(response)

# The response will contain the transcribed audio, and the detected slides if a video link was provided
```
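When a video link was processed, each entry in the returned `slides` list is the dictionary produced by `Slide.to_dict()` (see `slides.py` below), with the slide image base64-encoded as JPEG under the `img` key. A minimal sketch for saving the returned slides to disk, assuming the request above succeeded and returned JSON:

```python
import base64

result = response.json()

# Each slide dict carries the timestamp of its first appearance and the encoded image
for slide in result.get("slides", []):
    filename = f"slide_{slide['page_number']}.jpg"
    with open(filename, "wb") as f:
        f.write(base64.b64decode(slide["img"]))  # decode the base64 JPEG payload
    print(slide["time"], "->", filename)
```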
### Processing Video Files

To process video files, the `process_video` function downloads the MP4 file, extracts the audio, and passes it to the Whisper model for transcription. It also uses the `Detector` and `SlideSorter` classes to identify and sort presentation slides within the video, as sketched below.
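A minimal sketch of that flow when run locally from the repository root (the URL is a placeholder):

```python
from flie_processor import process_video

slides, audio_bytes = process_video("https://example.com/lecture.mp4")

# `slides` are deduplicated Slide objects; `audio_bytes` is the extracted AAC audio
for slide in slides:
    print(slide.page_number, slide.time)
```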
### Error Handling

Comprehensive logging and error handling are in place to keep you informed of each step's success or failure.

## Installation

Ensure that you have the following dependencies installed:

```plaintext
opencv-python~=4.8.1.78
numpy~=1.26.1
Pillow~=10.0.1
tqdm~=4.66.1
requests~=2.31.0
moviepy~=1.0.3
scipy~=1.11.3
```

Install them using pip with the provided `requirements.txt` file:

```bash
pip install -r requirements.txt
```

## Languages Supported

This tool supports a wide range of languages, making it highly versatile for global applications. The full list of supported languages can be found in the `language` section of the previous README revision.

## License

This project is available under the MIT license.

## More Information

For more information about the original Whisper large-v2 model, please refer to its [model card on Hugging Face](https://huggingface.co/openai/whisper-large-v2).

---
analyzer.py
ADDED
@@ -0,0 +1,9 @@

```python
from abc import ABCMeta, abstractmethod


class Analyzer(metaclass=ABCMeta):
    """Abstract base class for pipeline stages that produce content."""

    @abstractmethod
    def analyze(self):
        pass
```
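For orientation, a minimal hypothetical subclass (`ListAnalyzer` is not part of this commit) showing the generator contract that the concrete analyzers `Detector` and `SlideSorter` follow:

```python
from analyzer import Analyzer


class ListAnalyzer(Analyzer):
    """Hypothetical example: streams items from a list, mimicking how
    Detector.analyze() and SlideSorter.analyze() yield their results."""

    def __init__(self, items):
        self.items = items

    def analyze(self):
        for item in self.items:
            yield item


print(list(ListAnalyzer([1, 2, 3]).analyze()))  # [1, 2, 3]
```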
detector.py
ADDED
@@ -0,0 +1,129 @@

```python
# -*- coding: utf-8 -*-

import argparse
import cProfile
import pstats

import cv2
from tqdm import tqdm

import imgcomparison
import mediaoutput
import timeline
from analyzer import Analyzer
from slides import Slide


class InfiniteCounter(object):
    """
    InfiniteCounter represents a counter that returns the next number
    indefinitely: each call to count() yields the current number and
    then increments it by the specified step.
    """

    def __init__(self, start=0, step=1):
        """
        Default initializer
        :param start: the starting value of the counter
        :param step: the amount that should be added at each step
        """
        self.current = start
        self.step = step

    def increment(self):
        self.current += self.step

    def count(self):
        """
        Yields the current number, then increments it by the step
        specified in the initializer.
        :return: the successor of the previous number
        """
        while True:
            yield self.current
            self.current += self.step


class Detector(Analyzer):

    def __init__(self, device, outpath=None, fileformat=".png"):
        cap = cv2.VideoCapture(sanitize_device(device))
        self.sequence = timeline.Timeline(cap)
        self.writer = mediaoutput.NullWriter()
        if outpath is not None:
            self.writer = mediaoutput.TimestampImageWriter(self.sequence.fps, outpath, fileformat)
        self.comparator = imgcomparison.AbsDiffHistComparator(0.97)

    def detect_slides(self):
        frames = []
        name_getter = mediaoutput.TimestampImageWriter(self.sequence.fps)
        with tqdm(total=self.sequence.len, desc='Detecting Slides: ') as pbar:
            for i, frame in self.check_transition():
                if frame is not None:
                    frames.append(Slide(name_getter.next_name([i]), frame))
                pbar.update(1)

        self.sequence.release_stream()
        return frames

    def check_transition(self):
        prev_frame = self.sequence.next_frame()
        self.writer.write(prev_frame, 0)
        yield 0, prev_frame

        frame_counter = InfiniteCounter()
        for frame_count in frame_counter.count():

            frame = self.sequence.next_frame()

            if frame is None:
                break
            elif not self.comparator.are_same(prev_frame, frame):
                # A transition started: scan forward until two consecutive
                # frames match again, i.e. the image has stabilized
                while True:
                    if self.comparator.are_same(prev_frame, frame):
                        break
                    prev_frame = frame
                    frame = self.sequence.next_frame()
                    frame_counter.increment()
                self.writer.write(frame, frame_count)
                yield frame_count, frame

            prev_frame = frame

        yield frame_count, None

    def analyze(self):
        for i, frame in self.check_transition():
            if frame is None:
                # check_transition ends with a (count, None) sentinel; skip it
                continue
            time = mediaoutput.TimestampImageWriter(self.sequence.fps).next_name([i])
            yield Slide(time, frame)


def sanitize_device(device):
    """Returns the device id if the device can be converted to an integer."""
    try:
        return int(device)
    except (TypeError, ValueError):
        return device


if __name__ == "__main__":
    Parser = argparse.ArgumentParser(description="Slide Detector")
    Parser.add_argument("-d", "--device", help="video device number or path to video file")
    Parser.add_argument("-o", "--outpath", help="path to output video file", default="slides/", nargs='?')
    Parser.add_argument("-f", "--fileformat", help="file format of the output images e.g. '.jpg'",
                        default=".jpg", nargs='?')
    Args = Parser.parse_args()


    def run():
        detector = Detector(Args.device, Args.outpath, Args.fileformat)
        detector.detect_slides()


    cProfile.run('run()', 'profiling_stats.prof')

    p = pstats.Stats('profiling_stats.prof')
    p.sort_stats('cumulative').print_stats(10)
```
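Beyond the profiling `__main__` block above, the detector can be driven programmatically; a short sketch (the video path is a placeholder):

```python
from detector import Detector

detector = Detector("lecture.mp4", outpath="slides/", fileformat=".png")
slides = detector.detect_slides()  # list of Slide objects, timestamped by first appearance
print(f"detected {len(slides)} candidate slides")
```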
downloader_manager.py
ADDED
@@ -0,0 +1,50 @@

```python
import logging
import os
import tempfile
from io import BytesIO

import requests
from moviepy.editor import VideoFileClip
from tqdm import tqdm


def download_mp4_and_extract_audio(link: str):
    """Download an MP4 file from the given link and return the video and audio content as bytes."""
    logging.info("Starting the download of the MP4 file...")
    try:
        r = requests.get(link, stream=True)
        r.raise_for_status()

        total_size = int(r.headers.get('content-length', 0))
        video_content = BytesIO()

        with tqdm(total=total_size, unit='B', unit_scale=True, desc="Downloading...") as bar:
            for data in r.iter_content(chunk_size=1024):
                bar.update(len(data))
                video_content.write(data)

        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_video_file:
            temp_video_file.write(video_content.getvalue())
            temp_video_file_path = temp_video_file.name

        logging.info("Extracting audio from video...")
        with VideoFileClip(temp_video_file_path) as video:
            audio = video.audio

            with tempfile.NamedTemporaryFile(suffix=".aac", delete=False) as temp_audio_file:
                audio.write_audiofile(temp_audio_file.name, codec='aac')
                temp_audio_file_path = temp_audio_file.name

        with open(temp_audio_file_path, 'rb') as f:
            audio_content = BytesIO(f.read())

        os.remove(temp_video_file_path)
        os.remove(temp_audio_file_path)

        logging.info("Download and audio extraction completed")
        return video_content.getvalue(), audio_content.getvalue()
    except requests.exceptions.HTTPError as e:
        logging.error(f"HTTP Error: {e}")
    except Exception as e:
        logging.error(f"Failed to download MP4 and extract audio: {e}")
    # Both error paths fall through here so callers can always unpack a 2-tuple
    return None, None
```
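A short sketch of using the downloader on its own and persisting the extracted audio (the URL is a placeholder):

```python
from downloader_manager import download_mp4_and_extract_audio

video_bytes, audio_bytes = download_mp4_and_extract_audio("https://example.com/talk.mp4")
if audio_bytes is None:
    raise RuntimeError("download or extraction failed; see the log for details")

with open("talk.aac", "wb") as f:
    f.write(audio_bytes)  # AAC audio as extracted by moviepy
```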
flie_processor.py
ADDED
@@ -0,0 +1,23 @@

```python
import logging
import tempfile

import sources
from detector import Detector
from downloader_manager import download_mp4_and_extract_audio
from sorter import SlideSorter


def process_video(link):
    try:
        video_bytes, audio_bytes = download_mp4_and_extract_audio(link)

        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_video:
            temp_video.write(video_bytes)
            temp_video_path = temp_video.name

        detector = Detector(temp_video_path)
        sorter = SlideSorter(sources.ListSource(detector.detect_slides()), outpath="sorted_slides/")
        slides = sorter.sort()
        return slides, audio_bytes
    except Exception as e:
        logging.exception("Failed to execute sorter: %s", e)
        # Return an empty result so callers that unpack a 2-tuple do not crash
        return [], None
```
handler.py
CHANGED
@@ -1,9 +1,12 @@
@@ -11,21 +14,28 @@ class EndpointHandler:
@@ -40,4 +50,6 @@ class EndpointHandler:

The updated handler (lines the diff leaves unchanged and does not show are marked with `# ...`):

```python
import base64
import io
import logging

from faster_whisper import WhisperModel

from flie_processor import process_video

logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')


class EndpointHandler:
    # ... (__init__ is unchanged; its shown context line is:
    #      self.model = WhisperModel("large-v2", num_workers=30))

    def __call__(self, data: dict[str, str]):
        inputs = data.pop("inputs")
        link = data.pop("link")

        language = data.pop("language", "de")
        task = data.pop("task", "transcribe")
        processing_type = data.pop("type", "audio")

        response = {}

        if processing_type == "link":
            slides, audio_bytes = process_video(link)
            slides_list = [slide.to_dict() for slide in slides]
            response.update({"slides": slides_list})
        else:
            audio_bytes_decoded = base64.b64decode(inputs)
            logging.debug(f"Decoded Bytes Length: {len(audio_bytes_decoded)}")
            audio_bytes = io.BytesIO(audio_bytes_decoded)

        # run inference pipeline
        logging.info("Running inference...")
        segments, info = self.model.transcribe(audio_bytes, language=language, task=task)

        full_text = []
        for segment in segments:
            full_text.append({"segmentId": segment.id,
                              # ... (remaining segment fields not shown by the diff)
                              })
            logging.info("segment " + str(segment.id) + " transcribed")
        logging.info("Inference completed.")

        response.update({"audios": full_text})
        logging.debug(response)
        return response
```
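For local testing outside the deployed endpoint, the handler can be invoked directly. A minimal sketch, assuming the constructor takes no required arguments (its `__init__` signature is elided in the diff), that `sample.aac` exists, and that the hardware can load large-v2:

```python
import base64

from handler import EndpointHandler

handler = EndpointHandler()  # assumption: no required constructor arguments

with open("sample.aac", "rb") as f:
    payload = {
        "inputs": base64.b64encode(f.read()).decode("utf-8"),
        "link": "",            # required key, unused when type is "audio"
        "language": "en",
        "task": "transcribe",
        "type": "audio",
    }

result = handler(payload)
print(result["audios"])
```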
imgcomparison.py
ADDED
@@ -0,0 +1,106 @@

```python
import operator
from abc import ABCMeta, abstractmethod

import cv2
import numpy as np
import scipy.spatial.distance as dist


class ImageComparator(metaclass=ABCMeta):

    def __init__(self, threshold):
        self.threshold = threshold

    @abstractmethod
    def are_similar(self, first, second):
        pass

    def are_same(self, first, second, op=operator.ge):
        return op(self.are_similar(first, second), self.threshold)


class AbsDiffHistComparator(ImageComparator):

    def __init__(self, threshold):
        super(AbsDiffHistComparator, self).__init__(threshold)

    def are_similar(self, first, second):
        # Fraction of absolute-difference pixels below intensity 15:
        # close to 1.0 means the two frames are nearly identical
        res = cv2.absdiff(first, second)
        hist = cv2.calcHist([res], [0], None, [256], [0, 256])
        return 1 - np.sum(hist[15::]) / np.sum(hist)


class EuclideanComparator(ImageComparator):

    def __init__(self, threshold):
        super(EuclideanComparator, self).__init__(threshold)

    def are_similar(self, first, second):
        return dist.euclidean(first, second)


class ChebysevComparator(ImageComparator):

    def __init__(self, threshold):
        super(ChebysevComparator, self).__init__(threshold)

    def are_similar(self, first, second):
        return dist.chebyshev(first, second)


class OpenCVComparator(ImageComparator):

    def __init__(self, threshold):
        super(OpenCVComparator, self).__init__(threshold)

    @abstractmethod
    def get_technique(self):
        pass

    def are_similar(self, first, second):
        # Compare per-channel histograms and average over the three channels
        result = 0
        for i in range(3):
            hist1 = cv2.calcHist([first], [i], None, [256], [0, 256])
            hist2 = cv2.calcHist([second], [i], None, [256], [0, 256])
            result += cv2.compareHist(hist1, hist2, self.get_technique())

        return result / 3


class CorrelationOpenCVComparator(OpenCVComparator):

    def __init__(self, threshold):
        super(CorrelationOpenCVComparator, self).__init__(threshold)

    def get_technique(self):
        return cv2.HISTCMP_CORREL


class ChiOpenCVComparator(OpenCVComparator):

    def __init__(self, threshold):
        super(ChiOpenCVComparator, self).__init__(threshold)

    def get_technique(self):
        return cv2.HISTCMP_CHISQR


class IntersectionOpenCVComparator(OpenCVComparator):

    def __init__(self, threshold):
        super(IntersectionOpenCVComparator, self).__init__(threshold)

    def get_technique(self):
        return cv2.HISTCMP_INTERSECT


class BhattacharyyaOpenCVComparator(OpenCVComparator):

    def __init__(self, threshold):
        super(BhattacharyyaOpenCVComparator, self).__init__(threshold)

    def get_technique(self):
        return cv2.HISTCMP_BHATTACHARYYA
```
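A short sketch of how the comparators are used (the image paths are placeholders; both frames must share dimensions, and 0.97 mirrors the threshold `Detector` uses):

```python
import cv2

from imgcomparison import AbsDiffHistComparator

comparator = AbsDiffHistComparator(0.97)

frame_a = cv2.imread("frame_a.png")
frame_b = cv2.imread("frame_b.png")

# are_same() applies the threshold to the similarity score from are_similar()
if comparator.are_same(frame_a, frame_b):
    print("frames show the same slide")
```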
mediaoutput.py
ADDED
@@ -0,0 +1,196 @@

```python
import datetime
import errno
import math
import os
from abc import ABCMeta, abstractmethod

import cv2


class MediaWriter(metaclass=ABCMeta):
    """
    Abstract class for all media outputs, forcing each subclass
    to provide a write method.
    """

    @abstractmethod
    def write(self, content, *args):
        """
        Writes media to disk
        :param content: the media to be written
        :param args: additional arguments that may be helpful
        """
        pass

    def close(self):
        """Releases any resources held by the writer; a no-op by default."""
        pass


class NullWriter(MediaWriter):
    def write(self, content, *args):
        pass


class ImageWriter(MediaWriter):
    """
    The ImageWriter writes an image to disk.
    """

    def __init__(self, prefix, file_format):
        """
        Default initializer
        :param prefix: the filename prefix; a counter is added after this
        string and incremented after each write to disk
        :param file_format: the file format for the images
        """
        if not file_format.startswith('.'):
            file_format = '.' + file_format
        if prefix is not None:
            setup_dirs(prefix)
            self.name = prefix + file_format

    def write(self, img, *args):
        """
        Writes the given image to the location specified through the
        initializer
        :param img: the image that will be written to disk
        """
        cv2.imwrite(self.name % self.next_name(args), img)

    @abstractmethod
    def next_name(self, *args):
        """
        Returns the object that should be inserted into the filename.
        :param args: the args that were passed to write()
        :return: the object that will be inserted into the filename
        """


class CustomImageWriter(ImageWriter):
    """
    Image writer that uses a custom name, taken as the first
    argument in *args of the write method.
    """
    def __init__(self, prefix=None, file_format='.jpg'):
        """
        Default initializer
        :param prefix: the file location and file name prefix
        :param file_format: the file format e.g. .jpg, .png
        """
        super(CustomImageWriter, self).__init__(prefix + '%s', file_format)

    def next_name(self, *args):
        return args[0]


class IncrementalImageWriter(ImageWriter):
    """
    The IncrementalImageWriter writes an image to disk and appends a
    number to the file name. This number is auto-incremented by the
    specified step size after each write.
    """

    def __init__(self, prefix=None, file_format='.jpg', start=0, step=1):
        """
        Default initializer
        :param prefix: the file location and file name
        :param file_format: the file format e.g. .jpg, .png
        :param start: the starting number for the incremental count
        :param step: the step by which the count should increment
        """
        self.count = start - step
        self.step = step
        if prefix is not None:
            prefix += '%d'
        super(IncrementalImageWriter, self).__init__(prefix, file_format)

    def next_name(self, *args):
        self.count += self.step
        return self.count


class TimestampImageWriter(ImageWriter):
    """
    TimestampImageWriter is an ImageWriter that names each image after
    the timestamp at which it was first shown in the original stream.
    """

    def __init__(self, fps, prefix=None, file_format='.jpg'):
        """
        Default initializer
        :param fps: the number of frames per second in the original stream
        :param prefix: the prefix of the path to the output location
        :param file_format: the file format of the output image
        """
        self.fps = fps

        if prefix is not None:
            prefix += '%s'
        super(TimestampImageWriter, self).__init__(prefix, file_format)

    def next_name(self, args):
        # Convert the frame index in args[0] into an H:MM:SS.mmm timestamp
        current_frame = args[0]
        seconds = current_frame / self.fps
        milliseconds = seconds - math.floor(seconds)
        if milliseconds == 0:
            milliseconds = '000'
        else:
            milliseconds = str(int(milliseconds * (10 ** 3)))
        return str(datetime.timedelta(seconds=int(seconds))) + '.' + milliseconds.zfill(3)


class TimetableWriter(MediaWriter):
    """
    The TimetableWriter outputs each slide iteratively using
    the IncrementalImageWriter. Additionally it outputs a ".txt"
    document containing the slide names and their appearances.
    """
    def __init__(self, output_dir, timetable_loc, file_format):
        """
        Default initializer
        :param output_dir: the output directory for the sorted slides
        :param timetable_loc: where the timetable file should be stored
        :param file_format: the file format of the output images
        """
        setup_dirs(timetable_loc)
        self.timetable = open(timetable_loc, 'w')
        self.img_writer = IncrementalImageWriter(prefix=output_dir, start=1, file_format=file_format)
        self.txt_writer = TextWriter(self.timetable)

    def write(self, slides, *args):
        i = 1
        for slide in slides:
            if slide.marked:
                continue
            self.img_writer.write(slide.img)
            appearances = slide.time
            for com in slide.times:
                appearances += " " + com
            self.txt_writer.write("Slide %d: %s\n" % (i, appearances))
            i += 1

    def close(self):
        self.timetable.close()


class TextWriter(MediaWriter):
    def __init__(self, output_file):
        self.output_file = output_file

    def write(self, content, *args):
        self.output_file.write(content)


def setup_dirs(path):
    """
    Ensures that the directories along the given path exist,
    creating them when necessary.
    :param path: the path to the file
    """
    path = os.path.dirname(path)
    if path == '':
        return
    if not os.path.exists(path):
        try:
            os.makedirs(path)
        except OSError as exc:  # guard against a race condition
            if exc.errno != errno.EEXIST:
                raise
```
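A small sketch of the timestamp naming scheme, assuming 30 fps for illustration:

```python
from mediaoutput import TimestampImageWriter

namer = TimestampImageWriter(30.0)  # no prefix: used only to format names

# Frame 4500 at 30 fps is 150 s into the stream -> "0:02:30.000"
print(namer.next_name([4500]))
```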
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
slides.py
ADDED
@@ -0,0 +1,177 @@

```python
import base64
import json
import os
import re
from abc import ABCMeta, abstractmethod

import cv2
import numpy as np
from PIL import Image


def numerical_sort(value):
    numbers = re.compile(r'(\d+)')
    parts = numbers.split(value)
    parts[1::2] = map(int, parts[1::2])
    return parts


class Slide(object):
    """
    Represents a slide.
    """

    def __init__(self, time, img):
        """
        Default initializer for a slide representation
        :param time: the time when the slide appears
        :param img: the image representing the slide
        """
        self.time = time
        self.img = img
        self.marked = False
        self.times = []
        self.reference = None
        self.page_number = 0

    def add_time(self, time):
        """
        Adds an additional instant in time at which the slide
        is displayed.
        :param time: the time when the slide is displayed
        """
        self.times.append(time)

    def to_dict(self):
        """
        Converts the Slide object to a dictionary, handling image serialization.
        """
        # Encode the image as JPEG bytes, then as a base64 string
        _, buffer = cv2.imencode('.jpg', self.img)
        img_encoded = base64.b64encode(buffer).decode('utf-8')

        return {
            'time': self.time,
            'img': img_encoded,  # the base64-encoded image
            'marked': self.marked,
            'times': self.times,
            'reference': self.reference,
            'page_number': self.page_number
        }

    @classmethod
    def from_dict(cls, data):
        """
        Creates a Slide object from a dictionary, handling image deserialization.
        """
        # Decode the base64 string back into bytes, then into a numpy image
        img_decoded = base64.b64decode(data['img'])
        img = cv2.imdecode(np.frombuffer(img_decoded, np.uint8), cv2.IMREAD_COLOR)

        slide = cls(data['time'], img)
        slide.marked = data.get('marked', False)
        slide.times = data.get('times', [])
        slide.reference = data.get('reference')
        slide.page_number = data.get('page_number', 0)
        return slide

    def to_json(self):
        """
        Converts the Slide object to a JSON string.
        """
        return json.dumps(self.to_dict())

    @classmethod
    def from_json(cls, json_str):
        """
        Creates a Slide object from a JSON string.
        """
        data = json.loads(json_str)
        return cls.from_dict(data)


def slides_to_json(slides):
    """
    Converts a list of Slide objects to a JSON string.
    """
    return json.dumps([slide.to_dict() for slide in slides])


def slides_from_json(json_str):
    """
    Creates a list of Slide objects from a JSON string.
    """
    slides_data = json.loads(json_str)
    return [Slide.from_dict(slide_data) for slide_data in slides_data]


class SlideDataHelper(object):
    """
    Helper that loads slides from disk.
    """

    def __init__(self, path, image_type="opencv"):
        """
        Default initializer
        :param path: the path where the slides are stored on disk
        :param image_type: the image representation, either "opencv" or "pil"
        """
        self.path = path
        if image_type == "pil":
            self.imgreader = PILReader()
        else:
            self.imgreader = OpenCVReader()

    def get_slides(self):
        """
        Reads the slides from disk and returns them as a list of "Slide"
        objects.
        :return: the slides stored on disk as a list of "Slide" objects
        """
        slides = []
        for filename in sorted(os.listdir(self.path), key=numerical_sort):
            file_path = os.path.join(self.path, filename)
            _, ext = os.path.splitext(file_path)
            if not is_image(ext):
                continue
            time, _ = os.path.splitext(filename)
            slide = Slide(time, self.imgreader.get_img(file_path))
            slides.append(slide)

        return slides


class ImageReader(metaclass=ABCMeta):

    @abstractmethod
    def get_img(self, file_path):
        pass


class PILReader(ImageReader):
    def get_img(self, file_path):
        return Image.open(file_path)


class OpenCVReader(ImageReader):
    def get_img(self, file_path):
        return cv2.imread(file_path)


def convert_to_opencv(img):
    return cv2.cvtColor(np.array(img.convert('RGB')), cv2.COLOR_RGB2BGR)


def convert_to_PIL(img):
    return Image.fromarray(img)


def is_image(ext):
    """
    Checks whether the extension belongs to a supported image format.
    :param ext: the extension of a file
    :return: whether or not the file is an image
    """
    return ext in ('.jpeg', '.png', '.jpg', '.bmp')
```
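A short sketch of the serialization round trip, using a synthetic image:

```python
import numpy as np

from slides import Slide

img = np.zeros((10, 10, 3), dtype=np.uint8)  # synthetic black image
slide = Slide("0:00:05.000", img)

payload = slide.to_json()           # JSON string with a base64-encoded JPEG
restored = Slide.from_json(payload)

print(restored.time, restored.img.shape)  # 0:00:05.000 (10, 10, 3)
```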
sorter.py
ADDED
@@ -0,0 +1,109 @@

```python
import argparse
import os

from tqdm import tqdm

import imgcomparison as ic
import mediaoutput
import sources
from analyzer import Analyzer
from slides import SlideDataHelper


class SlideSorter(Analyzer):
    """
    Sorts the slides according to their timestamp and eliminates duplicates.
    """

    def __init__(self, source, outpath=None, timetable_loc=None, file_format=".png",
                 comparator=ic.AbsDiffHistComparator(0.98)):
        """
        Default initializer
        :param source: the source of the slides (e.g. a ListSource or AnalyzerSource)
        :param outpath: where the sorted slides should be written, if anywhere
        :param timetable_loc: where the timetable file should be written
        :param file_format: the file format of the output images
        :param comparator: the comparator to determine whether two slides
        are duplicates
        """
        self.comparator = comparator
        self.writer = mediaoutput.NullWriter()
        if outpath is not None:
            if timetable_loc is None:
                timetable_loc = os.path.join(outpath, 'timetable.txt')
            self.file_format = file_format
            self.writer = mediaoutput.TimetableWriter(outpath, timetable_loc, self.file_format)
        self.source = source

    def sort(self):
        """
        Sorts the slides and writes the deduplicated slides, together
        with a timetable, to disk.
        """
        slides = []
        with tqdm(total=len(self.source), desc="Sorting Slides: ") as pbar:
            for i, slide in self.group_slides():
                pbar.update(1)  # one source slide processed per iteration
                if slide is not None:
                    slides.append(slide)

        return slides

    def group_slides(self):
        """
        Groups the slides by eliminating duplicates.
        Yields (counter, slide) pairs, where slide is None for a detected
        duplicate and the unique slide otherwise.
        """
        slides = []
        sorted_slides = []
        loop_counter = 0
        page_counter = 1
        for slide in self.source.contents():
            slides.append(slide)
            if slide.marked:
                continue
            found = False
            for other in slides[:-1]:
                if self.comparator.are_same(slide.img, other.img):
                    found = True
                    if other.marked:
                        # `other` is itself a duplicate: attach to its reference
                        other.reference.add_time(slide.time)
                        slide.reference = other.reference
                        slide.marked = True
                    else:
                        slide.reference = other
                        other.add_time(slide.time)
                        slide.marked = True
                    yield loop_counter, None

            if not found:
                slide.page_number = page_counter
                yield loop_counter, slide
                sorted_slides.append(slide)
                page_counter += 1
            loop_counter += 1
        self.writer.write(sorted_slides)
        self.writer.close()

    def analyze(self):
        for _, slide in self.group_slides():
            if slide is None:
                continue
            yield slide


if __name__ == '__main__':
    Parser = argparse.ArgumentParser(description="Slide Sorter")
    Parser.add_argument("-d", "--inputslides", help="path of the sequentially sorted slides", default="slides/")
    Parser.add_argument("-o", "--outpath", help="path to output slides", default="unique/", nargs='?')
    Parser.add_argument("-f", "--fileformat", help="file format of the output images e.g. '.jpg'",
                        default=".jpg", nargs='?')
    Parser.add_argument("-t", "--timetable",
                        help="path where the timetable should be written (default is the outpath+'timetable.txt')",
                        nargs='?', default=None)
    Args = Parser.parse_args()
    if Args.timetable is None:
        Args.timetable = os.path.join(Args.outpath, "timetable.txt")

    sorter = SlideSorter(sources.ListSource(SlideDataHelper(Args.inputslides).get_slides()), Args.outpath,
                         Args.timetable, Args.fileformat)
    sorter.sort()
```
sources.py
ADDED
@@ -0,0 +1,32 @@

```python
import sys
from abc import ABCMeta, abstractmethod


class Source(metaclass=ABCMeta):

    @abstractmethod
    def contents(self):
        pass

    def __len__(self):
        # Streaming sources have no known length; report a huge upper bound
        return sys.maxsize


class ListSource(Source):
    def __init__(self, items):
        self.items = items

    def contents(self):
        return self.items

    def __len__(self):
        return len(self.contents())


class AnalyzerSource(Source):
    def __init__(self, analyzer):
        self.analyzer = analyzer

    def contents(self):
        for content in self.analyzer.analyze():
            yield content
```
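The two concrete sources let `SlideSorter` consume either a materialized list or a streaming analyzer; a sketch of the streaming variant (the video path is a placeholder):

```python
from detector import Detector
from sorter import SlideSorter
from sources import AnalyzerSource

# Slides flow from Detector.analyze() into the sorter without being
# collected into a list first
detector = Detector("lecture.mp4")
sorter = SlideSorter(AnalyzerSource(detector), outpath="sorted_slides/")
unique_slides = sorter.sort()
```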
timeline.py
ADDED
@@ -0,0 +1,166 @@

```python
import cv2


class Timeline(object):
    """
    The Timeline represents a logical sequence of frames, where frames
    from the video stream are rendered through lazy evaluation.
    """
    reader_head = 0

    def __init__(self, stream):
        """
        Default Initializer
        :param stream: the video stream from OpenCV
        """
        self.stream = stream
        self.len = stream.get(cv2.CAP_PROP_FRAME_COUNT)
        self.fps = stream.get(cv2.CAP_PROP_FPS)

    def next_frame(self):
        """
        Reads the next frame from the video stream and increments the
        reader_head by 1.
        :return: the next frame, or None once the video stream has been
        completely read
        """
        ret, frame = self.stream.read()
        self.reader_head += 1

        if not ret:
            return None

        return frame

    def get_frame(self, pos):
        """
        Returns the frame at the given position of the frame sequence
        :param pos: the position of the frame in the sequence
        :return: the frame at the specified position
        """
        assert pos >= 0
        # Seek to the requested frame before reading
        self.stream.set(cv2.CAP_PROP_POS_FRAMES, pos)
        _, frame = self.stream.read()
        self.reader_head = pos + 1
        return frame

    def get_frames(self, start, end):
        """
        Returns the list of frames between the specified start and
        end positions in the frame sequence.
        :param start: where the frame sequence should start
        :param end: where the frame sequence should end
        :return: the frame sequence from start to end
        """
        assert end >= start
        assert start >= 0

        result = []
        for i in range(start, end, 1):
            result.append(self.get_frame(i))
        return result

    def release_stream(self):
        self.stream.release()


class SlidingWindow(object):
    """
    This class represents an adaptive sliding window: it keeps a pointer
    to the start position of the window and the window's size. The size
    of the window can be changed at any time. Move, shrink, and expand
    operations are included.
    """

    def __init__(self, timeline, pos=0, size=2):
        """
        Default initializer for the sliding window
        :param timeline: the timeline that the sliding window
        should be applied to
        :param pos: the position that the beginning of the
        window points to
        :param size: the size of the window
        """
        self.timeline = timeline
        self.pos = pos
        self.size = size

    def move_right(self):
        """
        This method does this:
        .|#|#|.|.|.  =>  .|.|#|#|.|.
         1 2 3 4 5 6      1 2 3 4 5 6
        """
        self.pos += 1

    def move_left(self):
        """
        This method does this:
        .|.|#|#|.|.  =>  .|#|#|.|.|.
         1 2 3 4 5 6      1 2 3 4 5 6
        """
        self.pos -= 1

    def shrink_from_left(self):
        """
        This method does this:
        .|#|#|#|.|.  =>  .|.|#|#|.|.
         1 2 3 4 5 6      1 2 3 4 5 6
        """
        self.pos += 1
        self.size -= 1

    def shrink_from_right(self):
        """
        This method does this:
        .|#|#|#|.|.  =>  .|#|#|.|.|.
         1 2 3 4 5 6      1 2 3 4 5 6
        """
        self.size -= 1

    def expand_to_left(self):
        """
        This method does this:
        .|.|#|#|.|.  =>  .|#|#|#|.|.
         1 2 3 4 5 6      1 2 3 4 5 6
        """
        self.pos -= 1
        self.size += 1

    def expand_to_right(self):
        """
        This method does this:
        .|#|#|.|.|.  =>  .|#|#|#|.|.
         1 2 3 4 5 6      1 2 3 4 5 6
        """
        self.size += 1

    def get_frames(self):
        """
        Retrieves all the frames that are currently in this adaptive
        sliding window.
        :return: the frames in the sliding window
        """
        return self.timeline.get_frames(self.pos, self.pos + self.size)

    def get_frame(self, pos):
        return self.timeline.get_frame(self.pos)

    def get_start_frame(self):
        return self.timeline.get_frame(self.pos)

    def get_end_frame(self):
        return self.timeline.get_frame(self.pos + self.size - 1)

    def at_end(self):
        return self.pos + self.size == self.timeline.len
```
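A brief sketch of driving the timeline and sliding window directly (the video path is a placeholder):

```python
import cv2

from timeline import Timeline, SlidingWindow

timeline = Timeline(cv2.VideoCapture("lecture.mp4"))
print(f"{timeline.len:.0f} frames at {timeline.fps:.2f} fps")

window = SlidingWindow(timeline, pos=0, size=2)
window.expand_to_right()           # window now covers frames 0..2
first, *_, last = window.get_frames()

timeline.release_stream()
```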