igoracmorais committed on
Commit
65cef22
·
verified ·
1 Parent(s): a7b5b59

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -0
app.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ import gradio as gr
3
+ import json
4
+ from transformers import pipeline
5
+ from datasets import DatasetDict, Dataset
6
+
7
+ # Função para extrair texto do PDF
8
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_file: The PDF source handed to ``PyPDF2.PdfReader`` (a path or
            a binary file-like object).

    Returns:
        The text of all pages joined in page order, with no separator
        inserted between pages.
    """
    # PdfFileReader / numPages / getPage were deprecated in PyPDF2 2.x and
    # raise DeprecationError in 3.x; PdfReader and .pages are the supported
    # replacements and exist in every release that has extract_text().
    reader = PyPDF2.PdfReader(pdf_file)
    # A page may yield no extractable text (e.g. a scanned image); treat it
    # as empty so the join never sees a non-string value.
    return "".join(page.extract_text() or "" for page in reader.pages)
14
+
15
+ # Função para gerar perguntas e respostas usando o pipeline da Hugging Face
16
def generate_qa_pairs(text):
    """Generate question/answer pairs for ``text`` with a Hugging Face pipeline.

    A fresh ``"question-generation"`` pipeline is built on every call and its
    raw output for ``text`` is returned unchanged.
    """
    # NOTE(review): "question-generation" does not appear among the built-in
    # transformers pipeline tasks -- confirm this task name resolves (or pick
    # an explicit model) before relying on this function.
    return pipeline("question-generation")(text)
20
+
21
+ # Função para converter os pares de QA no formato SQuAD
22
def convert_to_squad_format(qas, context):
    """Build SQuAD-style records from generated question/answer pairs.

    Args:
        qas: Iterable of mappings, each holding a ``"question"`` entry and an
            ``"answer"`` mapping with ``"start"`` and ``"text"`` entries.
        context: Passage stored as the ``"context"`` of every record.

    Returns:
        A list with one dict per pair, in input order. Each record's ``"id"``
        is the pair's zero-based position rendered as a string, and the
        answer start/text are each wrapped in a single-element list.
    """
    return [
        {
            "title": "Generated Data",
            "context": context,
            "question": pair["question"],
            "id": str(position),
            "answers": {
                "answer_start": [pair["answer"]["start"]],
                "text": [pair["answer"]["text"]],
            },
        }
        for position, pair in enumerate(qas)
    ]
37
+
38
+ # Função para salvar os dados no formato SQuAD
39
def save_to_json(data, file_name):
    """Write ``data`` as pretty-printed UTF-8 JSON and return the path used.

    Args:
        data: JSON-serializable object to store.
        file_name: Destination path; ``".json"`` is appended when the name
            does not already end with it.

    Returns:
        The path that was actually written (including the ``.json`` suffix).
    """
    target = file_name if file_name.endswith(".json") else file_name + ".json"
    with open(target, "w", encoding="utf-8") as handle:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(data, handle, ensure_ascii=False, indent=4)
    return target
45
+
46
+ # Função principal para ser usada no Gradio
47
def process_pdf(pdf_file, file_name):
    """Turn an uploaded PDF into a SQuAD-style JSON file.

    Extracts the PDF text, generates question/answer pairs from it, converts
    them to SQuAD-style records and writes them to disk.

    Args:
        pdf_file: The uploaded PDF as delivered by the UI file input.
        file_name: Requested output file name from the UI text box. Only its
            final path component is used; a blank name falls back to
            ``"squad_dataset"``.

    Returns:
        Path of the JSON file that was written.

    Raises:
        ValueError: If no PDF was supplied.
    """
    import os  # local import keeps this block self-contained

    if pdf_file is None:
        # Fail with a clear message instead of an obscure error from the
        # PDF reader when the button is pressed without an upload.
        raise ValueError("No PDF file was provided.")

    # file_name is free-form user input: drop any directory components so it
    # cannot direct the write outside the working directory, and fall back to
    # the UI default when nothing usable remains.
    safe_name = os.path.basename((file_name or "").strip()) or "squad_dataset"

    context = extract_text_from_pdf(pdf_file)
    qas = generate_qa_pairs(context)
    squad_data = convert_to_squad_format(qas, context)
    return save_to_json(squad_data, safe_name)
53
+
54
+ # Interface Gradio
55
# Gradio UI: upload a PDF, choose an output name, and download the generated
# SQuAD-style JSON file.
# NOTE(review): the original indentation was lost in extraction; all four
# components are assumed to sit inside the single row -- confirm against the
# original file.
with gr.Blocks() as demo:
    with gr.Row():
        pdf_file = gr.File(label="Upload PDF", file_types=[".pdf"])
        file_name = gr.Textbox(
            label="Output JSON File Name",
            value="squad_dataset",
        )
        process_button = gr.Button("Process PDF")
        download_link = gr.File(label="Download JSON", interactive=False)

    # Wire the button to the processing function; the returned file path is
    # shown in the download component.
    process_button.click(
        fn=process_pdf,
        inputs=[pdf_file, file_name],
        outputs=download_link,
    )

demo.launch()