Spaces:
Sleeping
Sleeping
File size: 1,338 Bytes
e6d0f4c 9eb4d18 3eec261 e6d0f4c 9eb4d18 e6d0f4c 9eb4d18 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
import re
from extractdata import extract_text
from wordcloudplot import plot_wordcloud
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
def summarize(input_, model):
    """Summarize a text (or the text behind an https URL) and plot its word cloud.

    Args:
        input_: The text to summarize, or a URL. When the part of the string
            before its first ``/`` is exactly ``"https:"``, the value is handed
            to ``extract_text`` and the returned text is summarized instead.
        model: Name of the summarization model to use: ``"T5"`` selects
            ``csebuetnlp/mT5_multilingual_XLSum`` and ``"BART"`` selects
            ``ai4bharat/IndicBART``.

    Returns:
        A ``(summary, figure)`` tuple: the decoded summary string and the
        value returned by ``plot_wordcloud`` for the (un-normalized) text.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    checkpoints = {
        "T5": "csebuetnlp/mT5_multilingual_XLSum",
        "BART": "ai4bharat/IndicBART",
    }
    # Validate the model name before doing any expensive work (URL fetch,
    # checkpoint loading) so an unsupported name fails fast and clearly.
    if model not in checkpoints:
        supported = ", ".join(sorted(checkpoints))
        raise ValueError(
            f"Unsupported model {model!r}; expected one of: {supported}"
        )
    checkpoint = checkpoints[model]

    if input_.split("/")[0] == "https:":
        # Pass the caller's argument, not the ``input`` builtin.
        text = extract_text(input_)
    else:
        text = input_

    # Collapse every run of whitespace (newlines included) to a single space.
    normalized_text = re.sub(r"\s+", " ", text.strip())

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    # Use a separate local so the ``model`` argument (a name) is not rebound
    # to the loaded model object.
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

    encoded = tokenizer(
        [normalized_text],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    # The input is padded to ``max_length``, so hand the attention mask to
    # ``generate`` explicitly rather than relying on it being inferred from
    # the pad token id.
    output_ids = seq2seq_model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=70,
        min_length=30,
        no_repeat_ngram_size=2,
        num_beams=4,
    )[0]

    summary = tokenizer.decode(
        output_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    figure = plot_wordcloud(text)
    return summary, figure