Spaces:

leetuan023
/

videoocr2

Configuration error

App Files Files Community

leetuan023 commited on Sep 23, 2024

Commit

7d79f10

verified ·

1 Parent(s): 53d654e

Delete app.py

Browse files

Files changed (1) hide show

app.py +0 -207

app.py DELETED Viewed

@@ -1,207 +0,0 @@
-import gradio as gr
-import pytesseract
-import cv2
-import multiprocessing
-from fuzzywuzzy import fuzz
-from dataclasses import dataclass
-from urllib.request import urlopen
-import shutil
-import pathlib
-import datetime
-import sys
-# Constants
-TESSDATA_DIR = pathlib.Path.home() / 'tessdata'
-TESSDATA_URL = 'https://github.com/tesseract-ocr/tessdata_fast/raw/master/{}.traineddata'
-TESSDATA_SCRIPT_URL = 'https://github.com/tesseract-ocr/tessdata_best/raw/master/script/{}.traineddata'
-# Download language data files if necessary
-def download_lang_data(lang: str):
-    TESSDATA_DIR.mkdir(parents=True, exist_ok=True)
-    for lang_name in lang.split('+'):
-        filepath = TESSDATA_DIR / f'{lang_name}.traineddata'
-        if not filepath.is_file():
-            url = TESSDATA_SCRIPT_URL.format(lang_name) if lang_name[0].isupper() else TESSDATA_URL.format(lang_name)
-            with urlopen(url) as res, open(filepath, 'w+b') as f:
-                shutil.copyfileobj(res, f)
-# Helper functions for time and frame conversion
-def get_frame_index(time_str: str, fps: float):
-    t = list(map(float, time_str.split(':')))
-    if len(t) == 3:
-        td = datetime.timedelta(hours=t[0], minutes=t[1], seconds=t[2])
-    elif len(t) == 2:
-        td = datetime.timedelta(minutes=t[0], seconds=t[1])
-    else:
-        raise ValueError(f'Time data "{time_str}" does not match format "%H:%M:%S"')
-    return int(td.total_seconds() * fps)
-def get_srt_timestamp(frame_index: int, fps: float):
-    td = datetime.timedelta(seconds=frame_index / fps)
-    ms = td.microseconds // 1000
-    m, s = divmod(td.seconds, 60)
-    h, m = divmod(m, 60)
-    return f'{h:02d}:{m:02d}:{s:02d},{ms:03d}'
-# Video capture class using OpenCV
-class Capture:
-    def __init__(self, video_path):
-        self.path = video_path
-    def __enter__(self):
-        self.cap = cv2.VideoCapture(self.path)
-        if not self.cap.isOpened():
-            raise IOError(f'Cannot open video {self.path}.')
-        return self.cap
-    def __exit__(self, exc_type, exc_value, traceback):
-        self.cap.release()
-@dataclass
-class PredictedWord:
-    confidence: int
-    text: str
-class PredictedFrame:
-    def __init__(self, index: int, pred_data: str, conf_threshold: int):
-        self.index = index
-        self.words = []
-        block = 0
-        for l in pred_data.splitlines()[1:]:
-            word_data = l.split()
-            if len(word_data) < 12:
-                continue
-            _, _, block_num, *_, conf, text = word_data
-            block_num, conf = int(block_num), int(conf)
-            if block < block_num:
-                block = block_num
-                if self.words and self.words[-1].text != '\n':
-                    self.words.append(PredictedWord(0, '\n'))
-            if conf >= conf_threshold:
-                self.words.append(PredictedWord(conf, text))
-        self.confidence = sum(word.confidence for word in self.words)
-        self.text = ' '.join(word.text for word in self.words).translate(str.maketrans('|', 'I', '<>{}[];`@#$%^*_=~\\')).replace(' \n ', '\n').strip()
-    def is_similar_to(self, other, threshold=70):
-        return fuzz.ratio(self.text, other.text) >= threshold
-class PredictedSubtitle:
-    def __init__(self, frames, sim_threshold):
-        self.frames = [f for f in frames if f.confidence > 0]
-        self.sim_threshold = sim_threshold
-        self.text = max(self.frames, key=lambda f: f.confidence).text if self.frames else ''
-    @property
-    def index_start(self):
-        return self.frames[0].index if self.frames else 0
-    @property
-    def index_end(self):
-        return self.frames[-1].index if self.frames else 0
-    def is_similar_to(self, other):
-        return fuzz.partial_ratio(self.text, other.text) >= self.sim_threshold
-class Video:
-    def __init__(self, path):
-        self.path = path
-        with Capture(path) as v:
-            self.num_frames = int(v.get(cv2.CAP_PROP_FRAME_COUNT))
-            self.fps = v.get(cv2.CAP_PROP_FPS)
-            self.height = int(v.get(cv2.CAP_PROP_FRAME_HEIGHT))
-    def run_ocr(self, lang, time_start, time_end, conf_threshold, use_fullframe):
-        self.lang = lang
-        self.use_fullframe = use_fullframe
-        ocr_start = get_frame_index(time_start, self.fps) if time_start else 0
-        ocr_end = get_frame_index(time_end, self.fps) if time_end else self.num_frames
-        if ocr_end < ocr_start:
-            raise ValueError('time_start is later than time_end')
-        num_ocr_frames = ocr_end - ocr_start
-        with Capture(self.path) as v, multiprocessing.Pool() as pool:
-            v.set(cv2.CAP_PROP_POS_FRAMES, ocr_start)
-            frames = (v.read()[1] for _ in range(num_ocr_frames))
-            it_ocr = pool.imap(self._image_to_data, frames, chunksize=10)
-            self.pred_frames = [PredictedFrame(i + ocr_start, data, conf_threshold) for i, data in enumerate(it_ocr)]
-    def _image_to_data(self, img):
-        if not self.use_fullframe:
-            img = img[self.height // 2:, :]
-        config = f'--tessdata-dir "{TESSDATA_DIR}"'
-        try:
-            return pytesseract.image_to_data(img, lang=self.lang, config=config)
-        except Exception as e:
-            sys.exit(f'{e.__class__.__name__}: {e}')
-    def get_subtitles(self, sim_threshold):
-        self._generate_subtitles(sim_threshold)
-        return ''.join(f'{i}\n{get_srt_timestamp(sub.index_start, self.fps)} --> {get_srt_timestamp(sub.index_end, self.fps)}\n{sub.text}\n\n' for i, sub in enumerate(self.pred_subs))
-    def _generate_subtitles(self, sim_threshold):
-        self.pred_subs = []
-        if self.pred_frames is None:
-            raise AttributeError('Please call self.run_ocr() first to perform OCR on frames')
-        WIN_BOUND = int(self.fps // 2)
-        bound = WIN_BOUND
-        i = 0
-        j = 1
-        while j < len(self.pred_frames):
-            fi, fj = self.pred_frames[i], self.pred_frames[j]
-            if fi.is_similar_to(fj):
-                bound = WIN_BOUND
-            elif bound > 0:
-                bound -= 1
-            else:
-                para_new = j - WIN_BOUND
-                self._append_sub(PredictedSubtitle(self.pred_frames[i:para_new], sim_threshold))
-                i = para_new
-                j = i
-                bound = WIN_BOUND
-            j += 1
-        if i < len(self.pred_frames) - 1:
-            self._append_sub(PredictedSubtitle(self.pred_frames[i:], sim_threshold))
-    def _append_sub(self, sub):
-        if not sub.text:
-            return
-        while self.pred_subs and sub.is_similar_to(self.pred_subs[-1]):
-            ls = self.pred_subs.pop()
-            sub = PredictedSubtitle(ls.frames + sub.frames, sub.sim_threshold)
-        self.pred_subs.append(sub)
-# Gradio app
-def extract_subtitles(video_file, lang, time_start, time_end, conf_threshold, use_fullframe, sim_threshold):
-    video = Video(video_file.name)
-    video.run_ocr(lang, time_start, time_end, conf_threshold, use_fullframe)
-    subtitles = video.get_subtitles(sim_threshold)
-    return subtitles
-iface = gr.Interface(
-    fn=extract_subtitles,
-    inputs=[
-        gr.Video(label="Video File"),
-        gr.Textbox(value='eng', label="OCR Language"),
-        gr.Textbox(value='00:00:00', label="Start Time (HH:MM:SS)"),
-        gr.Textbox(value='', label="End Time (HH:MM:SS, leave empty for full video)"),
-        gr.Slider(0, 100, value=60, step=1, label="Confidence Threshold"),
-        gr.Checkbox(label="Use Full Frame for OCR", default=False),
-        gr.Slider(0, 100, value=70, step=1, label="Similarity Threshold")
-    ],
-    outputs=gr.Textbox(label="Extracted Subtitles"),
-    title="Video Subtitle Extractor",
-    description="Extract hardcoded subtitles from videos using machine learning.",
-)
-iface.launch()