sooolee committed
Commit b6f4778 · Parent: 92f9c99

Upload 2 files

Files changed (2)
  1. app.py +26 -17
  2. requirements.txt +1 -0
app.py CHANGED
@@ -3,29 +3,30 @@ import gradio as gr
 import torch
 from peft import PeftModel, PeftConfig
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+from youtube_transcript_api import YouTubeTranscriptApi
 
-def load_data(file_obj):
-    """
-    Load data from the file object of the gr.File() inputs
-    """
-    path = file_obj.name
-    with open(path, "r") as f:
-        data = f.read()
+# def load_data(file_obj):
+#     """
+#     Load data from the file object of the gr.File() inputs
+#     """
+#     path = file_obj.name
+#     with open(path, "r") as f:
+#         data = f.read()
 
-    return data
+#     return data
 
 def preprocessing(data):
     texts = list()
 
     i = 0
-    if len(data) <= i+4000:
+    if len(data) <= i+3000:
         texts = data
     else:
         while len(data[i:]) != 0:
-            if len(data[i:]) > 4000:
-                string = str(data[i:i+4000])
+            if len(data[i:]) > 3000:
+                string = str(data[i:i+3000])
                 texts.append(string)
-                i = i + 3800
+                i = i + 2800
             else:
                 string = str(data[i:])
                 texts.append(string)
@@ -40,13 +41,20 @@ tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
 model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, device_map='auto') # load_in_8bit=True,
 model = PeftModel.from_pretrained(model, peft_model_id, device_map='auto')
 
-def summarize(file_obj):
-    transcript = load_data(file_obj)
+def summarize(video_id):
+    # transcript = load_data(file_obj)
+    dict = YouTubeTranscriptApi.get_transcript(video_id)
+
+    transcript = ""
+
+    for i in range(len(dict)):
+        transcript += dict[i]['text']
+
     texts = preprocessing(transcript)
     inputs = tokenizer(texts, return_tensors="pt", padding=True, )
 
     with torch.no_grad():
-        output_tokens = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=60, do_sample=True, top_p=0.9)
+        output_tokens = model.generate(input_ids=inputs["input_ids"].to("device"), max_new_tokens=60, do_sample=True, top_p=0.9)
     outputs = tokenizer.batch_decode(output_tokens.detach().cpu().numpy(), skip_special_tokens=True)
 
     return outputs
@@ -54,6 +62,7 @@ def summarize(file_obj):
 gr.Interface(
     fn=summarize,
     title = 'Summarize Transcripts',
-    inputs = gr.File(file_types=["text"], label="Upload a text file.", interactive=True),
+    # inputs = gr.File(file_types=["text"], label="Upload a text file.", interactive=True),
+    inputs = gr.Textbox(label="Video_ID", interactive=True),
     outputs = gr.Textbox(label="Summary", max_lines=120, interactive=False),
-).launch()
+).launch(debug=True)
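A note on the new summarize(): the chunking in preprocessing() now uses 3000-character windows advanced in steps of 2800, keeping the same 200-character overlap as the old 4000/3800 scheme. However, .to("device") passes the literal string "device" to Tensor.to(), which is not a valid device name and raises a RuntimeError at generation time (the previous code used "cuda"); the loop also shadows the built-in dict and concatenates segments with no separator, which can fuse words across segment boundaries. A minimal corrected sketch, not the committed code, assuming model, tokenizer, and preprocessing are defined as above and that device_map='auto' placed the model on a single device:

def summarize(video_id):
    # Fetch the transcript segments for the given YouTube video ID.
    segments = YouTubeTranscriptApi.get_transcript(video_id)  # avoids shadowing dict
    # Join with spaces so words at segment boundaries do not run together.
    transcript = " ".join(segment["text"] for segment in segments)

    texts = preprocessing(transcript)
    inputs = tokenizer(texts, return_tensors="pt", padding=True)

    with torch.no_grad():
        # model.device is the device the parameters were placed on by device_map='auto'.
        output_tokens = model.generate(
            input_ids=inputs["input_ids"].to(model.device),
            max_new_tokens=60,
            do_sample=True,
            top_p=0.9,
        )
    return tokenizer.batch_decode(output_tokens.cpu().numpy(), skip_special_tokens=True)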
requirements.txt CHANGED
@@ -1,3 +1,4 @@
 peft
 transformers==4.27.2
 gradio
+youtube_transcript_api
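The new dependency is added unpinned, unlike transformers. YouTubeTranscriptApi.get_transcript(video_id) returns a list of dicts with 'text', 'start', and 'duration' keys, which summarize() reduces to plain text. A quick standalone check (the video ID below is a placeholder):

from youtube_transcript_api import YouTubeTranscriptApi

segments = YouTubeTranscriptApi.get_transcript("VIDEO_ID")  # placeholder ID
print(segments[0])  # e.g. {'text': '...', 'start': 0.0, 'duration': 2.5}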