igoracmorais committed on
Commit
ba5b254
·
verified ·
1 Parent(s): d5ae2c2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -7
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import PyPDF2
2
  import gradio as gr
3
  import json
4
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
5
 
6
  # Função para extrair texto do PDF
7
  def extract_text_from_pdf(pdf_file):
@@ -11,8 +11,8 @@ def extract_text_from_pdf(pdf_file):
11
  text += page.extract_text()
12
  return text
13
 
14
- # Função para gerar perguntas e respostas usando um modelo da Hugging Face
15
- def generate_qa_pairs(text):
16
  tokenizer = AutoTokenizer.from_pretrained("valhalla/t5-base-qg-hl")
17
  model = AutoModelForSeq2SeqLM.from_pretrained("valhalla/t5-base-qg-hl")
18
 
@@ -20,9 +20,19 @@ def generate_qa_pairs(text):
20
  outputs = model.generate(inputs, max_length=512, num_beams=4, early_stopping=True)
21
  questions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
22
 
23
- # O modelo retorna apenas as perguntas, então precisamos criar respostas fictícias para o exemplo
24
- qas = [{"question": question, "answer": "answer", "answer_start": 0} for question in questions]
25
-
 
 
 
 
 
 
 
 
 
 
26
  return qas
27
 
28
  # Função para converter os pares de QA no formato SQuAD
@@ -53,7 +63,8 @@ def save_to_json(data, file_name):
53
  # Função principal para ser usada no Gradio
54
  def process_pdf(pdf_file, file_name):
55
  context = extract_text_from_pdf(pdf_file)
56
- qas = generate_qa_pairs(context)
 
57
  squad_data = convert_to_squad_format(qas, context)
58
  file_path = save_to_json(squad_data, file_name)
59
  return file_path
 
1
  import PyPDF2
2
  import gradio as gr
3
  import json
4
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
5
 
6
  # Função para extrair texto do PDF
7
  def extract_text_from_pdf(pdf_file):
 
11
  text += page.extract_text()
12
  return text
13
 
14
+ # Função para gerar perguntas usando um modelo da Hugging Face
15
+ def generate_questions(text):
16
  tokenizer = AutoTokenizer.from_pretrained("valhalla/t5-base-qg-hl")
17
  model = AutoModelForSeq2SeqLM.from_pretrained("valhalla/t5-base-qg-hl")
18
 
 
20
  outputs = model.generate(inputs, max_length=512, num_beams=4, early_stopping=True)
21
  questions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
22
 
23
+ return questions
24
+
25
# Answer the generated questions against the document text using a
# Hugging Face question-answering pipeline.
def answer_questions(context, questions):
    """Answer each question in *questions* against *context*.

    Parameters
    ----------
    context : str
        Full document text the answers are extracted from.
    questions : list[str]
        Questions to answer.

    Returns
    -------
    list[dict]
        One dict per question with SQuAD-style keys: ``question``,
        ``answer`` and ``answer_start`` (character offset of the answer
        inside *context*, as reported by the pipeline).
    """
    # Building a pipeline is expensive (it loads/downloads a model), so
    # cache one instance on the function object and reuse it across calls
    # instead of re-creating it for every PDF processed.
    if not hasattr(answer_questions, "_qa_pipeline"):
        answer_questions._qa_pipeline = pipeline("question-answering")
    qa_pipeline = answer_questions._qa_pipeline

    qas = []
    for question in questions:
        answer = qa_pipeline(question=question, context=context)
        qas.append({
            "question": question,
            "answer": answer['answer'],
            "answer_start": answer['start'],
        })
    return qas
37
 
38
  # Função para converter os pares de QA no formato SQuAD
 
63
# Main entry point wired to the Gradio interface.
def process_pdf(pdf_file, file_name):
    """Turn *pdf_file* into a SQuAD-format JSON file named *file_name*.

    Extracts the text, generates questions for it, answers them against
    the extracted text, converts the QA pairs to SQuAD format and saves
    the result, returning the path of the written JSON file.
    """
    context = extract_text_from_pdf(pdf_file)
    generated = generate_questions(context)
    answered = answer_questions(context, generated)
    squad_payload = convert_to_squad_format(answered, context)
    return save_to_json(squad_payload, file_name)