Spaces:
Running
Running
File size: 2,040 Bytes
9150552 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
from youtube_transcript_api import YouTubeTranscriptApi as ytapi
from youtube_transcript_api.formatters import TextFormatter
import json
def getSubsText(video_id="", getGenerated=False):
    """Return the transcript of a YouTube video as one line of plain text.

    The transcripts available for ``video_id`` are listed, the last one in
    the listing is downloaded, formatted with ``TextFormatter`` and its
    newlines are replaced by single spaces.

    Args:
        video_id: YouTube video identifier passed to ``list_transcripts``.
        getGenerated: Accepted for interface compatibility only; selecting
            auto-generated transcripts is not implemented yet, so the value
            is currently ignored.

    Returns:
        The transcript text as a single string without newlines.

    Raises:
        ValueError: If the listing for ``video_id`` contains no transcript.
    """
    # TODO: implement getGenerated
    transcripts = list(ytapi.list_transcripts(video_id))
    if not transcripts:
        raise ValueError(f"no transcripts listed for video {video_id!r}")

    # Only the last listed transcript is used, so download just that one
    # instead of issuing a fetch() request for every entry in the listing.
    data = transcripts[-1].fetch()
    return TextFormatter().format_transcript(data).replace("\n", " ")
def getSubs(video_id="", getGenerated=False, chunker=None):
    """Download and return the raw transcript data of a YouTube video.

    The transcripts available for ``video_id`` are listed and the last one
    in the listing is fetched and returned unmodified.

    Args:
        video_id: YouTube video identifier passed to ``list_transcripts``.
        getGenerated: Accepted for interface compatibility only; selecting
            auto-generated transcripts is not implemented, so the value is
            currently ignored.
        chunker: Unused; kept so existing callers that pass it keep working.

    Returns:
        Whatever ``Transcript.fetch()`` returns for the selected transcript.

    Raises:
        ValueError: If the listing for ``video_id`` contains no transcript.
    """
    transcripts = list(ytapi.list_transcripts(video_id))
    if not transcripts:
        raise ValueError(f"no transcripts listed for video {video_id!r}")

    # Only the last listed transcript is returned, so fetch just that one
    # rather than performing a network request per listed transcript.
    return transcripts[-1].fetch()
class subs:
    """Transcript of one YouTube video plus helpers to format it.

    The transcript is downloaded once, at construction time, and stored in
    ``self.subs``. The methods below read the ``text``, ``start`` and
    ``duration`` keys of each transcript entry.
    """

    def __init__(self, video_id="", generated=False):
        """Download the transcript for ``video_id`` and keep it on the instance.

        Args:
            video_id: YouTube video identifier.
            generated: Forwarded to the module-level ``getSubs`` function.
        """
        self.video_id = video_id
        self.generated = generated
        # This resolves to the module-level getSubs() function; the method
        # of the same name is only reachable through an instance.
        self.subs = getSubs(video_id, generated)

    def __sizeof__(self) -> int:
        """Return the number of transcript entries.

        NOTE: this is an entry count, not a size in bytes, even though
        ``sys.getsizeof`` consults this hook.
        """
        return sum(1 for _ in self.subs)

    def getText(self):
        """Return the whole transcript as one string with newlines replaced by spaces."""
        return TextFormatter().format_transcript(self.subs).replace("\n", " ")

    def getSubs(self):
        """Return one ``text:::duration`` line per transcript entry, newline-joined."""
        # [chunk, duration]
        return "\n".join(
            f"{line['text']}:::{line['duration']}" for line in self.subs
        )

    def getSubsRaw(self):
        """Return the stored transcript data unchanged."""
        return self.subs

    def getSubsList(self, size=100):
        """Group consecutive transcript lines into chunks of about ``size`` characters.

        Lines are appended to the current chunk (separated by single spaces)
        while the chunk stays within ``size`` characters; the first line that
        does not fit closes the chunk and opens the next one. A single line
        longer than ``size`` is never split and becomes a chunk on its own.

        Args:
            size: Maximum chunk length in characters, counted before the
                trailing separator is stripped.

        Returns:
            A list of ``[text, timestamp]`` pairs. For every chunk except the
            last, ``timestamp`` is the ``start`` of the line that opened the
            following chunk. For the last chunk it is the end of the final
            line (``start`` plus ``duration``, with a missing ``duration``
            treated as 0). An empty transcript yields ``[]``.
        """
        # The JSON round-trip gives an independent copy made of plain
        # lists and dicts.
        lines = json.loads(json.dumps(self.subs))

        chunks = []
        current_chunk = ""  # limited to {size}
        last_line = None
        for subline in lines:
            text = subline["text"]
            if len(current_chunk) + len(text) + 1 <= size:
                current_chunk += f"{text} "
            else:
                # Skip the empty chunk that would otherwise be emitted when
                # the very first line is already longer than `size`.
                if current_chunk.strip():
                    chunks.append([current_chunk.strip(), subline["start"]])
                current_chunk = f"{text} "
            last_line = subline

        # Flush the trailing chunk; without this the end of the transcript
        # (or the whole transcript, if it fits in one chunk) would be lost.
        if last_line is not None and current_chunk.strip():
            end_time = last_line["start"] + last_line.get("duration", 0)
            chunks.append([current_chunk.strip(), end_time])
        return chunks
|