Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
import PyPDF2
|
2 |
import gradio as gr
|
3 |
import json
|
4 |
-
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
5 |
|
6 |
# Função para extrair texto do PDF
|
7 |
def extract_text_from_pdf(pdf_file):
|
@@ -11,8 +11,8 @@ def extract_text_from_pdf(pdf_file):
|
|
11 |
text += page.extract_text()
|
12 |
return text
|
13 |
|
14 |
-
# Função para gerar perguntas
|
15 |
-
def
|
16 |
tokenizer = AutoTokenizer.from_pretrained("valhalla/t5-base-qg-hl")
|
17 |
model = AutoModelForSeq2SeqLM.from_pretrained("valhalla/t5-base-qg-hl")
|
18 |
|
@@ -20,9 +20,19 @@ def generate_qa_pairs(text):
|
|
20 |
outputs = model.generate(inputs, max_length=512, num_beams=4, early_stopping=True)
|
21 |
questions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
|
22 |
|
23 |
-
|
24 |
-
|
25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
return qas
|
27 |
|
28 |
# Função para converter os pares de QA no formato SQuAD
|
@@ -53,7 +63,8 @@ def save_to_json(data, file_name):
|
|
53 |
# Função principal para ser usada no Gradio
|
54 |
def process_pdf(pdf_file, file_name):
|
55 |
context = extract_text_from_pdf(pdf_file)
|
56 |
-
|
|
|
57 |
squad_data = convert_to_squad_format(qas, context)
|
58 |
file_path = save_to_json(squad_data, file_name)
|
59 |
return file_path
|
|
|
1 |
import PyPDF2
|
2 |
import gradio as gr
|
3 |
import json
|
4 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
|
5 |
|
6 |
# Função para extrair texto do PDF
|
7 |
def extract_text_from_pdf(pdf_file):
|
|
|
11 |
text += page.extract_text()
|
12 |
return text
|
13 |
|
14 |
+
# Função para gerar perguntas usando um modelo da Hugging Face
|
15 |
+
def generate_questions(text):
|
16 |
tokenizer = AutoTokenizer.from_pretrained("valhalla/t5-base-qg-hl")
|
17 |
model = AutoModelForSeq2SeqLM.from_pretrained("valhalla/t5-base-qg-hl")
|
18 |
|
|
|
20 |
outputs = model.generate(inputs, max_length=512, num_beams=4, early_stopping=True)
|
21 |
questions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
|
22 |
|
23 |
+
return questions
|
24 |
+
|
25 |
+
# Função para responder perguntas usando um pipeline de perguntas e respostas
|
26 |
+
def answer_questions(context, questions):
    """Answer each question against *context* with a HF question-answering pipeline.

    Parameters
    ----------
    context : str
        The passage (extracted PDF text) to answer from.
    questions : iterable of str
        Questions to run against the context.

    Returns
    -------
    list of dict
        One dict per question with keys ``question``, ``answer`` and
        ``answer_start`` (character offset of the answer in the context),
        matching the SQuAD-style structure consumed downstream.
    """
    # Fix: the original rebuilt the pipeline on EVERY call, reloading the
    # model each time. Cache it on the function object so repeated calls
    # (one per uploaded PDF) reuse the same loaded model.
    qa_pipeline = getattr(answer_questions, "_qa_pipeline", None)
    if qa_pipeline is None:
        qa_pipeline = pipeline("question-answering")
        answer_questions._qa_pipeline = qa_pipeline

    qas = []
    for question in questions:
        answer = qa_pipeline(question=question, context=context)
        qas.append({
            "question": question,
            "answer": answer['answer'],
            "answer_start": answer['start'],
        })
    return qas
|
37 |
|
38 |
# Função para converter os pares de QA no formato SQuAD
|
|
|
63 |
# Função principal para ser usada no Gradio
|
64 |
def process_pdf(pdf_file, file_name):
    """Gradio entry point: turn a PDF into a SQuAD-format JSON file.

    Pipeline: extract text -> generate questions -> answer them ->
    convert to SQuAD format -> save to disk.

    Parameters
    ----------
    pdf_file :
        The uploaded PDF file object from Gradio.
    file_name : str
        Name for the output JSON file.

    Returns
    -------
    str
        Path of the saved JSON file, for Gradio to offer as a download.
    """
    context = extract_text_from_pdf(pdf_file)
    qa_pairs = answer_questions(context, generate_questions(context))
    return save_to_json(convert_to_squad_format(qa_pairs, context), file_name)
|