Upload 2 files
Browse files- app.py +26 -17
- requirements.txt +1 -0
app.py
CHANGED
@@ -3,29 +3,30 @@ import gradio as gr
|
|
3 |
import torch
|
4 |
from peft import PeftModel, PeftConfig
|
5 |
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
|
|
6 |
|
7 |
-
def load_data(file_obj):
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
|
15 |
-
|
16 |
|
17 |
def preprocessing(data):
|
18 |
texts = list()
|
19 |
|
20 |
i = 0
|
21 |
-
if len(data) <= i+
|
22 |
texts = data
|
23 |
else:
|
24 |
while len(data[i:]) != 0:
|
25 |
-
if len(data[i:]) >
|
26 |
-
string = str(data[i:i+
|
27 |
texts.append(string)
|
28 |
-
i = i +
|
29 |
else:
|
30 |
string = str(data[i:])
|
31 |
texts.append(string)
|
@@ -40,13 +41,20 @@ tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
|
|
40 |
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, device_map='auto') # load_in_8bit=True,
|
41 |
model = PeftModel.from_pretrained(model, peft_model_id, device_map='auto')
|
42 |
|
43 |
-
def summarize(
|
44 |
-
transcript = load_data(file_obj)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
texts = preprocessing(transcript)
|
46 |
inputs = tokenizer(texts, return_tensors="pt", padding=True, )
|
47 |
|
48 |
with torch.no_grad():
|
49 |
-
output_tokens = model.generate(input_ids=inputs["input_ids"].to("
|
50 |
outputs = tokenizer.batch_decode(output_tokens.detach().cpu().numpy(), skip_special_tokens=True)
|
51 |
|
52 |
return outputs
|
@@ -54,6 +62,7 @@ def summarize(file_obj):
|
|
54 |
gr.Interface(
|
55 |
fn=summarize,
|
56 |
title = 'Summarize Transcripts',
|
57 |
-
inputs = gr.File(file_types=["text"], label="Upload a text file.", interactive=True),
|
|
|
58 |
outputs = gr.Textbox(label="Summary", max_lines=120, interactive=False),
|
59 |
-
).launch()
|
|
|
3 |
import torch
|
4 |
from peft import PeftModel, PeftConfig
|
5 |
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
6 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
7 |
|
8 |
+
# def load_data(file_obj):
|
9 |
+
# """
|
10 |
+
# Load data from the file object of the gr.File() inputs
|
11 |
+
# """
|
12 |
+
# path = file_obj.name
|
13 |
+
# with open(path, "r") as f:
|
14 |
+
# data = f.read()
|
15 |
|
16 |
+
# return data
|
17 |
|
18 |
def preprocessing(data, chunk_size=3000, stride=2800):
    """Split a transcript string into overlapping chunks for the model.

    Chunks are at most ``chunk_size`` characters and consecutive chunks
    overlap by ``chunk_size - stride`` characters (200 by default) so text
    cut at a chunk boundary reappears whole at the start of the next chunk.

    Args:
        data: Full transcript text (may be empty).
        chunk_size: Maximum characters per chunk (default 3000, matching
            the previously hard-coded constant).
        stride: Characters to advance between chunk starts (default 2800,
            matching the previously hard-coded constant).

    Returns:
        list[str]: Ordered chunks. Always a list — the original returned
        the bare string for short inputs, making the return type
        inconsistent (str vs list); downstream the tokenizer accepts both,
        so wrapping in a one-element list is backward compatible.
    """
    # Short input fits in a single chunk.
    if len(data) <= chunk_size:
        return [data]

    texts = []
    i = 0
    while i < len(data):
        remaining = len(data) - i
        if remaining > chunk_size:
            texts.append(data[i:i + chunk_size])
            i += stride
        else:
            # Final short tail: append it and stop (the original
            # else-branch never advanced ``i``; terminate explicitly).
            texts.append(data[i:])
            break
    return texts
|
|
|
41 |
# Load the base seq2seq model, then wrap it with the PEFT/LoRA adapter
# weights so generation uses the fine-tuned parameters.
# NOTE(review): `config` and `peft_model_id` are defined earlier in the
# file (outside this hunk); `device_map='auto'` lets accelerate place
# the weights. The commented flag suggests 8-bit loading was tried.
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, device_map='auto') # load_in_8bit=True,
model = PeftModel.from_pretrained(model, peft_model_id, device_map='auto')
|
43 |
|
44 |
+
def summarize(video_id):
    """Fetch a YouTube transcript by video id and return its summary.

    Args:
        video_id: The YouTube video id (the part after ``v=`` in the URL).

    Returns:
        list[str]: One decoded summary string per transcript chunk.

    Raises:
        Whatever ``YouTubeTranscriptApi.get_transcript`` raises for
        missing/disabled transcripts is propagated to the caller.
    """
    # Each segment is a dict with (at least) a 'text' key.
    # Renamed from `dict`, which shadowed the builtin.
    segments = YouTubeTranscriptApi.get_transcript(video_id)

    # Join with a space: captions are line fragments, and the original
    # ""-style concatenation fused the last word of one segment with the
    # first word of the next. (Also avoids quadratic `+=` in a loop.)
    transcript = " ".join(segment['text'] for segment in segments)

    texts = preprocessing(transcript)
    inputs = tokenizer(texts, return_tensors="pt", padding=True)

    with torch.no_grad():
        # BUG FIX: the original called .to("device") — the literal string
        # "device" is not a valid torch device and raises at runtime.
        # Move the inputs to wherever the model's weights live instead.
        output_tokens = model.generate(
            input_ids=inputs["input_ids"].to(model.device),
            max_new_tokens=60,
            do_sample=True,
            top_p=0.9,
        )
    outputs = tokenizer.batch_decode(
        output_tokens.detach().cpu().numpy(), skip_special_tokens=True
    )
    return outputs
|
|
|
62 |
# Wire the summarizer into a minimal web UI: one textbox in (the YouTube
# video id), one textbox out (the generated summary).
video_id_box = gr.Textbox(label="Video_ID", interactive=True)
summary_box = gr.Textbox(label="Summary", max_lines=120, interactive=False)

demo = gr.Interface(
    fn=summarize,
    title='Summarize Transcripts',
    inputs=video_id_box,
    outputs=summary_box,
)
# debug=True surfaces tracebacks in the console while the app runs.
demo.launch(debug=True)
|
requirements.txt
CHANGED
@@ -1,3 +1,4 @@
|
|
1 |
peft
|
2 |
transformers==4.27.2
|
3 |
gradio
|
|
|
|
1 |
peft
transformers==4.27.2
gradio
youtube_transcript_api
torch
|