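# Gradio demo: generates a summary of the input text with facebook/bart-large-cnn
# and a set of questions with valhalla/t5-base-e2e-qg, which are then translated
# into Spanish via googletrans.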
import gradio as gr
import torch
from googletrans import Translator
from transformers import (
    BartForConditionalGeneration,
    BartTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    T5ForConditionalGeneration,
    T5Tokenizer,
)

# Summarization model: BART fine-tuned on CNN/DailyMail
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
# Question generator: end-to-end question generation with a T5 model
class E2EQGPipeline:
    def __init__(
        self,
        model: PreTrainedModel,
        tokenizer: PreTrainedTokenizer
    ):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = model.to(self.device)
        self.tokenizer = tokenizer
        self.model_type = "t5"
        # Generation settings: beam search with a length penalty favoring longer outputs
        self.kwargs = {
            "max_length": 256,
            "num_beams": 4,
            "length_penalty": 1.5,
            "no_repeat_ngram_size": 3,
            "early_stopping": True,
        }
    def generate_questions(self, context: str):
        inputs = self._prepare_inputs_for_e2e_qg(context)
        outs = self.model.generate(
            input_ids=inputs['input_ids'].to(self.device),
            attention_mask=inputs['attention_mask'].to(self.device),
            **self.kwargs
        )
        prediction = self.tokenizer.decode(outs[0], skip_special_tokens=True)
        # The model emits questions separated by "<sep>"; drop the trailing fragment after the last separator
        questions = prediction.split("<sep>")
        questions = [question.strip() for question in questions[:-1]]
        return questions
    def _prepare_inputs_for_e2e_qg(self, context):
        source_text = f"generate questions: {context}"
        inputs = self._tokenize([source_text], padding=False)
        return inputs
    def _tokenize(
        self,
        inputs,
        padding=True,
        truncation=True,
        add_special_tokens=True,
        max_length=512
    ):
        inputs = self.tokenizer.batch_encode_plus(
            inputs,
            max_length=max_length,
            add_special_tokens=add_special_tokens,
            truncation=truncation,
            padding="max_length" if padding else False,
            return_tensors="pt"
        )
        return inputs
def generate_questions(text):
    # Load the end-to-end question-generation model and wrap it in the pipeline above
    qg_model = T5ForConditionalGeneration.from_pretrained('valhalla/t5-base-e2e-qg')
    qg_tokenizer = T5Tokenizer.from_pretrained('valhalla/t5-base-e2e-qg')
    qg_final_model = E2EQGPipeline(qg_model, qg_tokenizer)
    questions = qg_final_model.generate_questions(text)
    # Translate the generated questions into Spanish
    translator = Translator()
    translated_questions = [translator.translate(question, dest='es').text for question in questions]
    return translated_questions
def generate_summary(text):
    # Summarize the input with BART using beam search
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs, max_length=150, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary
def process(text):
    return generate_summary(text), generate_questions(text)
textbox = gr.Textbox(label="Pega el texto acá:", placeholder="Texto...", lines=15)
demo = gr.Interface(fn=process, inputs=textbox, outputs=["text", "text"])
demo.launch()
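# Example usage outside Gradio (hypothetical input text):
#   summary, questions = process("Paste an English article here...")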