anitap committed on
Commit
8c7ca03
1 Parent(s): ce24913

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -37
app.py CHANGED
@@ -1,18 +1,12 @@
1
- from transformers import pipeline
2
-
3
- import fitz
4
  import gradio as gr
5
  import requests
6
  import io
7
  import re
8
- import os
9
  from PIL import Image
10
 
11
  summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
12
- qa_model = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
13
-
14
- os.environ["HUGGINGFACE_HUB_TOKEN"] = "ctp-hw"
15
- my_key = os.environ["HUGGINGFACE_HUB_TOKEN"]
16
 
17
  def extract_text_from_pdf(pdf_file):
18
  with fitz.open(pdf_file) as pdf:
@@ -20,13 +14,10 @@ def extract_text_from_pdf(pdf_file):
20
  for page in pdf:
21
  text += page.get_text("text")
22
 
23
- text = re.sub(r'\s+', ' ', text)
24
- text = text.strip()
25
  return text
26
 
27
- def summarize_pdf(pdf_file):
28
- text = extract_text_from_pdf(pdf_file)
29
-
30
  if len(text) > 1000:
31
  chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
32
  summary = ""
@@ -34,37 +25,39 @@ def summarize_pdf(pdf_file):
34
  summary += summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]['summary_text'] + " "
35
  else:
36
  summary = summarizer(text, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
37
-
38
  return summary
39
 
40
- def answer_question(pdf_file, question):
41
- text = extract_text_from_pdf(pdf_file)
42
- answer = qa_model(question=question, context=text)
43
- return answer['answer']
44
 
45
- API_URL = "https://api-inference.huggingface.co/models/stable-diffusion-v1-5/stable-diffusion-v1-5"
46
- headers = {"Authorization": f"Bearer {my_key}"}
 
47
 
48
- def query(payload):
49
- response = requests.post(API_URL, headers=headers, json=payload)
50
- return response.content
 
51
 
52
  def summarize_and_qa(pdf_file, question):
53
- summary = summarize_pdf(pdf_file)
54
- answer = answer_question(pdf_file, question)
55
- image_bytes = query({
56
- "inputs": answer,
57
- })
58
- image = Image.open(io.BytesIO(image_bytes))
59
- return summary, answer, image
 
 
 
 
 
60
 
61
  gr.Interface(
62
  fn=summarize_and_qa,
63
  inputs=["file", "text"],
64
- outputs=["textbox", "textbox", "image"],
65
- title="PDF Summary and Q&A",
66
- description="Upload a PDF to get a summary and answer questions based on the content. It will also give a picture to help you better understand the content."
67
- ).launch()
68
-
69
- if __name__ == "__main__":
70
- demo.launch()
 
1
+ import fitz
 
 
2
  import gradio as gr
3
  import requests
4
  import io
5
  import re
 
6
  from PIL import Image
7
 
8
  summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
9
+ qa_model = pipeline("question-answering", model="deepset/bert-large-uncased-whole-word-masking-squad2")
 
 
 
10
 
11
  def extract_text_from_pdf(pdf_file):
12
  with fitz.open(pdf_file) as pdf:
 
14
  for page in pdf:
15
  text += page.get_text("text")
16
 
17
+ text = re.sub(r'\s+', ' ', text).strip()
 
18
  return text
19
 
20
+ def summarize(text):
 
 
21
  if len(text) > 1000:
22
  chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
23
  summary = ""
 
25
  summary += summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]['summary_text'] + " "
26
  else:
27
  summary = summarizer(text, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
28
+
29
  return summary
30
 
31
+ # API_URL = "https://api-inference.huggingface.co/models/deepset/bert-large-uncased-whole-word-masking-squad2"
32
+ # headers = {"Authorization": f"Bearer {my_key}"}
 
 
33
 
34
+ # def query(payload):
35
+ # response = requests.post(API_URL, headers=headers, json=payload)
36
+ # return response.content
37
 
38
+ def answer_question(text, question):
39
+ response = qa_model(question=question, context=text)
40
+ answer = response['answer']
41
+ return answer
42
 
43
  def summarize_and_qa(pdf_file, question):
44
+ text = extract_text_from_pdf(pdf_file)
45
+ summary = summarize(text)
46
+ answer = answer_question(text, question)
47
+ # image_bytes = query({"inputs": answer})
48
+ # if image_bytes:
49
+ # try:
50
+ # image = Image.open(io.BytesIO(image_bytes))
51
+ # except Exception as e:
52
+ # return summary, answer, None
53
+ # else:
54
+ # image = None
55
+ return summary, answer
56
 
57
  gr.Interface(
58
  fn=summarize_and_qa,
59
  inputs=["file", "text"],
60
+ outputs=["textbox", "textbox"],
61
+ title="Understand your PDF Better",
62
+ description="Upload a PDF to get a summary. You can ask any question regardging the content of the PDF. It will also generate a picture to help you better understand the content."
63
+ ).launch(debug=True, share=True)