minnehwg committed on
Commit
915c9b5
·
verified ·
1 Parent(s): f213cf3

Update util.py

Browse files
Files changed (1) hide show
  1. util.py +2 -3
util.py CHANGED
@@ -37,6 +37,7 @@ def get_subtitles(video_url):
37
  video_id = video_url.split("v=")[1]
38
  transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
39
  subs = " ".join(entry['text'] for entry in transcript)
 
40
 
41
  return transcript, subs
42
 
@@ -95,14 +96,12 @@ def split_into_chunks(text, max_words=800, overlap_sentences=2):
95
  else:
96
  if len(current_chunk) >= overlap_sentences:
97
  overlap = current_chunk[-overlap_sentences:]
98
- print(f"Overlapping sentences: {' '.join(overlap)}")
99
  chunks.append(' '.join(current_chunk))
100
  current_chunk = current_chunk[-overlap_sentences:] + [sentence]
101
  current_word_count = sum(len(sent.split()) for sent in current_chunk)
102
  if current_chunk:
103
  if len(current_chunk) >= overlap_sentences:
104
  overlap = current_chunk[-overlap_sentences:]
105
- print(f"Overlapping sentences: {' '.join(overlap)}")
106
  chunks.append(' '.join(current_chunk))
107
 
108
  return chunks
@@ -130,7 +129,7 @@ def pipeline(url, model, tokenizer):
130
  chunks = split_into_chunks(vie_sub, 700, 2)
131
  sum_para = []
132
  for i in chunks:
133
- tmp = summarize(i, model, tokenizer, num_beams=5)
134
  sum_para.append(tmp)
135
  sum = ''.join(sum_para)
136
  del sub, vie_sub, sum_para, chunks
 
37
  video_id = video_url.split("v=")[1]
38
  transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
39
  subs = " ".join(entry['text'] for entry in transcript)
40
+ print(sub)
41
 
42
  return transcript, subs
43
 
 
96
  else:
97
  if len(current_chunk) >= overlap_sentences:
98
  overlap = current_chunk[-overlap_sentences:]
 
99
  chunks.append(' '.join(current_chunk))
100
  current_chunk = current_chunk[-overlap_sentences:] + [sentence]
101
  current_word_count = sum(len(sent.split()) for sent in current_chunk)
102
  if current_chunk:
103
  if len(current_chunk) >= overlap_sentences:
104
  overlap = current_chunk[-overlap_sentences:]
 
105
  chunks.append(' '.join(current_chunk))
106
 
107
  return chunks
 
129
  chunks = split_into_chunks(vie_sub, 700, 2)
130
  sum_para = []
131
  for i in chunks:
132
+ tmp = summarize(i, model, tokenizer, num_beams=3)
133
  sum_para.append(tmp)
134
  sum = ''.join(sum_para)
135
  del sub, vie_sub, sum_para, chunks