srinivas-mushroom committed on
Commit
bf3d25c
·
1 Parent(s): 394becd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -27
app.py CHANGED
@@ -1,7 +1,8 @@
1
  import gradio as gr
2
- import requests
3
  import io
4
- import json
 
5
  from transformers import AutoTokenizer, AutoModelForQuestionAnswering
6
 
7
  # Download and load pre-trained model and tokenizer
@@ -9,34 +10,36 @@ model_name = "distilbert-base-cased-distilled-squad"
9
  tokenizer = AutoTokenizer.from_pretrained(model_name)
10
  model = AutoModelForQuestionAnswering.from_pretrained(model_name)
11
 
12
def answer_question(pdf_file, question):
    """Answer a single question about an uploaded PDF document.

    The PDF is uploaded to the pdftotext.com web service for text
    extraction, then the module-level DistilBERT QA model selects the
    answer span from the extracted text.

    Args:
        pdf_file: file-like object for the uploaded PDF (gradio File).
        question: the question string to answer from the document.

    Returns:
        str: the answer text decoded from the selected token span.
    """
    # BUG FIX: `torch` is used below but the file-level imports
    # (requests, io, json) never import it -- bring it in locally so
    # the call does not raise NameError.
    import torch

    # Convert PDF to text via an external service.
    # NOTE(review): this sends user documents to a third-party site and
    # does no HTTP error handling -- prefer a local extractor; verify.
    pdf_stream = io.BytesIO(pdf_file.read())
    response = requests.post(
        'https://pdftotext.com/ExtractText',
        files={'pdffile': pdf_stream},
        data={'form': 'pdftotext'}
    )
    text = response.text.strip()

    # Tokenize question and context together into one input sequence.
    input_ids = tokenizer.encode(question, text)

    # Perform question answering; no_grad() skips autograd bookkeeping
    # for inference without changing the outputs.
    with torch.no_grad():
        outputs = model(torch.tensor([input_ids]), return_dict=True)
    answer_start = outputs.start_logits.argmax().item()
    answer_end = outputs.end_logits.argmax().item()
    # Decode the [start, end] token span back to a string.
    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end + 1])
    )

    return answer
 
 
33
 
34
# UI wiring: one PDF upload plus a single question box, answered as text.
inputs = [
    gr.inputs.File(label="PDF document"),
    gr.inputs.Textbox(label="Question"),
]

outputs = gr.outputs.Textbox(label="Answer")

# Build the interface and start serving it immediately.
demo = gr.Interface(
    fn=answer_question,
    inputs=inputs,
    outputs=outputs,
    title="PDF Question Answering Tool",
    description=(
        "Upload a PDF document and ask a question. The app will use a "
        "pre-trained model to find the answer."
    ),
)
demo.launch()
 
1
  import gradio as gr
2
+ import PyPDF2
3
  import io
4
+ import requests
5
+ import torch
6
  from transformers import AutoTokenizer, AutoModelForQuestionAnswering
7
 
8
  # Download and load pre-trained model and tokenizer
 
10
  tokenizer = AutoTokenizer.from_pretrained(model_name)
11
  model = AutoModelForQuestionAnswering.from_pretrained(model_name)
12
 
13
def answer_questions(pdf_file, questions):
    """Answer several questions about an uploaded PDF document.

    Extracts the PDF's text locally with PyPDF2, then runs the
    module-level DistilBERT QA model once per question.

    Args:
        pdf_file: file-like object for the uploaded PDF (gradio File).
        questions: newline-separated questions as a single string (the
            multi-line Textbox value); an iterable of question strings
            is also accepted for backward compatibility.

    Returns:
        list[str]: one decoded answer per non-empty question.
    """
    # Load PDF file and extract text.
    # NOTE(review): PdfFileReader/getNumPages/getPage/extractText are
    # the legacy PyPDF2 1.x API, removed in PyPDF2 3.x (PdfReader,
    # reader.pages, page.extract_text()) -- pin PyPDF2<2 or migrate.
    pdf_reader = PyPDF2.PdfFileReader(io.BytesIO(pdf_file.read()))
    text = ""
    for page_index in range(pdf_reader.getNumPages()):
        text += pdf_reader.getPage(page_index).extractText()
    text = text.strip()

    # BUG FIX: the Textbox delivers `questions` as ONE string, so
    # iterating it directly would loop over individual characters.
    # Split into one question per non-empty line instead.
    if isinstance(questions, str):
        question_list = [q.strip() for q in questions.splitlines() if q.strip()]
    else:
        question_list = list(questions)

    answers = []
    for question in question_list:
        # Tokenize question and context together into one sequence.
        input_ids = tokenizer.encode(question, text)

        # Perform question answering; no_grad() skips autograd
        # bookkeeping for inference without changing the outputs.
        with torch.no_grad():
            outputs = model(torch.tensor([input_ids]), return_dict=True)
        answer_start = outputs.start_logits.argmax().item()
        answer_end = outputs.end_logits.argmax().item()
        # Decode the [start, end] token span back to a string.
        answer = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end + 1])
        )
        answers.append(answer)

    return answers
36
 
37
# UI wiring: a PDF upload plus a multi-line box with one question per line.
inputs = [
    gr.inputs.File(label="PDF document"),
    # BUG FIX: legacy gr.inputs.Textbox has no type="textarea" (its
    # `type` accepts "str"/"number"); multi-line input is requested
    # with the `lines` parameter instead.
    gr.inputs.Textbox(lines=7, label="Questions (one per line)"),
]

# BUG FIX: gr.outputs has no Textarea component -- the legacy text
# output component is gr.outputs.Textbox.
outputs = gr.outputs.Textbox(label="Answers")

# Build the interface and start serving it immediately.
gr.Interface(fn=answer_questions, inputs=inputs, outputs=outputs, title="PDF Question Answering Tool",
             description="Upload a PDF document and ask multiple questions. The app will use a pre-trained model to find the answers.").launch()