|
import fitz |
|
import gradio as gr |
|
import re |
|
from transformers import pipeline |
|
|
|
# Hugging Face checkpoints backing the two pipelines below.
SUMMARIZATION_MODEL = "facebook/bart-large-cnn"
QA_MODEL = "deepset/bert-large-uncased-whole-word-masking-squad2"

# Both pipelines are created once at module load and shared by every call
# to summarize() / answer_question().
summarizer = pipeline("summarization", model=SUMMARIZATION_MODEL)
qa_model = pipeline("question-answering", model=QA_MODEL)
|
|
|
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF as one normalized string.

    Args:
        pdf_file: Path of the PDF to read, or a file-like upload object that
            exposes the on-disk path through its ``name`` attribute.

    Returns:
        The text of all pages concatenated in page order, with every run of
        whitespace collapsed to a single space and surrounding whitespace
        stripped.
    """
    # Upload widgets may hand over a temp-file wrapper instead of a path.
    # Requiring both ``read`` and ``name`` keeps plain strings and
    # pathlib.Path objects (which also have ``name``) untouched.
    if hasattr(pdf_file, "read") and hasattr(pdf_file, "name"):
        pdf_file = pdf_file.name

    with fitz.open(pdf_file) as pdf:
        # Join once instead of growing a string page by page.
        raw_text = "".join(page.get_text("text") for page in pdf)

    return re.sub(r"\s+", " ", raw_text).strip()
|
|
|
def summarize(text, chunk_size=1000, max_length=150, min_length=50):
    """Summarize ``text`` by summarizing fixed-size character chunks.

    The text is cut into consecutive slices of ``chunk_size`` characters,
    each slice is passed to the module-level ``summarizer`` pipeline, and the
    partial summaries are joined with single spaces.

    Args:
        text: The text to summarize.
        chunk_size: Maximum number of characters sent to the model per call.
        max_length: ``max_length`` forwarded to the summarization pipeline.
        min_length: ``min_length`` forwarded to the summarization pipeline.

    Returns:
        The combined summary, or ``""`` when ``text`` is empty or contains
        only whitespace (the model is not called in that case).

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not text:
        return ""

    # A text no longer than chunk_size yields exactly one slice, so short
    # and long inputs share this single code path.
    pieces = (
        text[start:start + chunk_size]
        for start in range(0, len(text), chunk_size)
    )
    partial_summaries = [
        summarizer(
            piece,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
        )[0]["summary_text"]
        for piece in pieces
        if piece.strip()  # never send a blank slice to the model
    ]
    return " ".join(partial_summaries)
|
|
|
def answer_question(text, question):
    """Answer ``question`` using ``text`` as the context.

    Args:
        text: The context passed to the module-level ``qa_model`` pipeline.
        question: The question to answer.

    Returns:
        The ``answer`` field of the pipeline result, or ``""`` when either
        the context or the question is missing or blank.
    """
    # The question-answering pipeline rejects empty inputs, so a blank
    # question (or an empty document) is answered with an empty string
    # instead of failing the whole request.
    if not text or not text.strip():
        return ""
    if not question or not question.strip():
        return ""

    result = qa_model(question=question, context=text)
    return result["answer"]
|
|
|
def summarize_and_qa(pdf_file, question):
    """Summarize an uploaded PDF and answer an optional question about it.

    Args:
        pdf_file: The uploaded PDF as accepted by ``extract_text_from_pdf``,
            or ``None`` when nothing was uploaded.
        question: The question to answer from the PDF text; may be blank.

    Returns:
        A ``(summary, answer)`` tuple of strings. When no file was uploaded
        or no text could be extracted, ``summary`` holds an explanatory
        message and ``answer`` is ``""``. ``answer`` is also ``""`` when the
        question is blank.
    """
    if pdf_file is None:
        return "Please upload a PDF file.", ""

    text = extract_text_from_pdf(pdf_file)
    if not text:
        # Nothing to feed to either model (e.g. a PDF without a text layer).
        return "No extractable text was found in the PDF.", ""

    summary = summarize(text)
    # The question is optional: skip the QA model when it was left blank.
    has_question = bool(question and question.strip())
    answer = answer_question(text, question) if has_question else ""
    return summary, answer
|
|
|
# Wire summarize_and_qa into a web UI: a file upload and a free-text question
# as inputs, two text boxes (summary, answer) as outputs.
demo = gr.Interface(
    fn=summarize_and_qa,
    inputs=["file", "text"],
    outputs=["textbox", "textbox"],
    title="Understand your PDF Better",
    description="Upload a PDF to get a summary. You can ask any question regarding the content of the PDF.",
)

# Start the app as soon as the module is executed.
demo.launch(debug=True, share=True)
|
|