Spaces:
Sleeping
Sleeping
File size: 2,939 Bytes
9d464a5 a7af8b0 9d464a5 5324b1e 9d464a5 a7af8b0 9d464a5 a7af8b0 5324b1e a7af8b0 5324b1e 9d464a5 a7af8b0 9d464a5 a7af8b0 9d464a5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
import os
import tempfile

import fitz  # PyMuPDF
import streamlit as st
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForQuestionAnswering
# Load the models
#
# Streamlit re-executes this script from the top on every widget interaction,
# so the loaders are wrapped in a resource cache: each checkpoint is read once
# per server process instead of once per rerun.
summarization_model_name = 'facebook/bart-large-cnn'
qa_model_name = 'deepset/bert-large-uncased-whole-word-masking-squad2'

# st.cache_resource is available in Streamlit >= 1.18; on older releases fall
# back to a no-op decorator, which keeps the original (uncached) behaviour.
_cache_resource = getattr(st, "cache_resource", lambda func: func)


@_cache_resource
def _load_summarizer(model_name):
    """Return the (tokenizer, seq2seq model) pair for *model_name*."""
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModelForSeq2SeqLM.from_pretrained(model_name),
    )


@_cache_resource
def _load_question_answerer(model_name):
    """Return the (tokenizer, question-answering model) pair for *model_name*."""
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModelForQuestionAnswering.from_pretrained(model_name),
    )


tokenizer, summarization_model = _load_summarizer(summarization_model_name)
qa_tokenizer, qa_model = _load_question_answerer(qa_model_name)
# Function to extract text from a PDF file
def extract_text_from_pdf(file):
    """Return the concatenated text of every page of a PDF.

    Args:
        file: Passed unchanged to ``fitz.open``; the caller in this file
            supplies a filesystem path.

    Returns:
        str: The text of all pages joined in page order, or an empty string
        when no page yields any text.
    """
    # The context manager closes the document (and its underlying file
    # handle) even if text extraction raises part-way through.
    with fitz.open(file) as doc:
        return "".join(page.get_text() for page in doc)
# Function to summarize document
def summarize_document(document):
    """Generate an abstractive summary of *document*.

    The text is tokenized with the module-level ``tokenizer`` and truncated
    to at most 1024 tokens, so only the beginning of a long document
    contributes to the summary. Generation uses beam search with 4 beams and
    a length between 30 and 150 tokens.

    Args:
        document: The text to summarize.

    Returns:
        str: The decoded summary with special tokens removed, or an empty
        string when *document* is empty or whitespace-only.
    """
    # With no real input (e.g. an image-only PDF) the model would still
    # generate text unrelated to the document, so return early.
    if not document or not document.strip():
        return ""

    inputs = tokenizer(document, return_tensors='pt', max_length=1024, truncation=True)
    summary_ids = summarization_model.generate(
        inputs['input_ids'],
        max_length=150,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
# Function to get answer to question
def get_answer(question, context):
    """Extract the span of *context* that best answers *question*.

    The question/context pair is tokenized with the module-level
    ``qa_tokenizer`` and truncated to 512 tokens, so only the beginning of a
    long context is searched. The answer is the token span between the
    highest-scoring start position and the highest-scoring end position.

    Args:
        question: The question text.
        context: The text to search for an answer.

    Returns:
        str: The decoded answer span with special tokens removed. Empty when
        either input is blank, when the predicted end precedes the predicted
        start, or when the selected span contains only special tokens.
    """
    # Nothing to search or nothing asked (e.g. an image-only PDF has no text).
    if not question or not question.strip() or not context or not context.strip():
        return ""

    inputs = qa_tokenizer(question, context, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = qa_model(**inputs)

    # transformers v4+ returns a dict-like ModelOutput by default; unpacking
    # it as a tuple yields the key names rather than the logit tensors, so
    # read the logits by attribute instead.
    answer_start = int(torch.argmax(outputs.start_logits))
    answer_end = int(torch.argmax(outputs.end_logits)) + 1

    answer_ids = inputs["input_ids"][0][answer_start:answer_end]
    # skip_special_tokens keeps markers such as [CLS]/[SEP] out of the answer
    # shown to the user.
    return qa_tokenizer.decode(answer_ids, skip_special_tokens=True)
# Streamlit app
st.title("PDF Summarizer and Q&A")
st.write("Upload a PDF file to get a summary and ask questions about the content.")

uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
    # Save the upload under a unique temporary name. A fixed "temp.pdf" in the
    # working directory would be shared by every concurrent session, so one
    # session could overwrite or delete the file while another is reading it.
    # delete=False lets the file be closed first and then reopened by path.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
        tmp_pdf.write(uploaded_file.getbuffer())
        temp_path = tmp_pdf.name

    # Extract text from the PDF; the temporary file is only needed for this
    # step, so remove it right away, even if extraction fails.
    try:
        document_text = extract_text_from_pdf(temp_path)
    finally:
        os.remove(temp_path)

    # Display the extracted text
    st.write("Extracted Text:")
    st.write(document_text)

    if st.button("Summarize"):
        with st.spinner('Summarizing...'):
            summary = summarize_document(document_text)
            st.write("**Summary:**")
            st.write(summary)

    question = st.text_input("Ask a question about the document")

    if st.button("Get Answer"):
        if question:
            with st.spinner('Generating answer...'):
                answer = get_answer(question, document_text)
                st.write("**Answer:**")
                st.write(answer)
        else:
            st.write("Please enter a question.")
|