File size: 2,546 Bytes
9d464a5
d218527
9d464a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import os
import tempfile

import fitz  # PyMuPDF
import streamlit as st
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline, AutoModelForQuestionAnswering

# Load the models
# Streamlit re-executes this script from the top on every widget interaction.
# Wrapping the loaders in st.cache_resource keeps one shared copy of each model
# per server process instead of rebuilding them on every rerun.
summarization_model_name = 'facebook/bart-large-cnn'
qa_model_name = 'distilbert-base-uncased-distilled-squad'


@st.cache_resource
def _load_summarizer(model_name):
    """Return the (tokenizer, seq2seq model) pair for ``model_name``."""
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModelForSeq2SeqLM.from_pretrained(model_name),
    )


@st.cache_resource
def _load_question_answerer(model_name):
    """Return the (tokenizer, model, pipeline) triple for extractive Q&A."""
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)
    loaded_model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    qa = pipeline('question-answering', model=loaded_model, tokenizer=loaded_tokenizer)
    return loaded_tokenizer, loaded_model, qa


# Same module-level names as before, so the rest of the script is unaffected.
tokenizer, summarization_model = _load_summarizer(summarization_model_name)
qa_tokenizer, qa_model, qa_pipeline = _load_question_answerer(qa_model_name)

# Function to extract text from a PDF file
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file: Value handed straight to ``fitz.open``; the app passes the
            filesystem path of the PDF.

    Returns:
        One string containing the text of all pages. It is empty when the
        document has no pages or no page yields any text.
    """
    # The context manager closes the document even if a page fails to parse,
    # so the underlying file handle is released before the caller deletes
    # the file.
    with fitz.open(file) as doc:
        return "".join(page.get_text() for page in doc)

# Function to summarize document
def summarize_document(document):
    """Summarize ``document`` with the module-level summarization model.

    Args:
        document: Text to summarize. The tokenizer call truncates the input
            to at most 1024 tokens.

    Returns:
        The decoded summary with special tokens stripped, or an empty string
        when ``document`` is empty or contains only whitespace.
    """
    # Nothing to summarize (e.g. a scanned PDF with no text layer): skip the
    # model instead of letting it generate text from an empty input.
    if not document or not document.strip():
        return ""

    inputs = tokenizer(document, return_tensors='pt', max_length=1024, truncation=True)
    summary_ids = summarization_model.generate(
        inputs['input_ids'],
        # Pass the mask explicitly rather than relying on generate() to infer it.
        attention_mask=inputs['attention_mask'],
        max_length=150,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Streamlit app
st.title("PDF Summarizer and Q&A")
st.write("Upload a PDF file to get a summary and ask questions about the content.")

uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
    # Save the upload to a uniquely named temporary file. A fixed name in the
    # working directory would be shared by every concurrent session, so one
    # user's upload could overwrite or delete another's.
    # delete=False: the file is reopened by path after this handle is closed.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(uploaded_file.getbuffer())
        temp_path = tmp.name

    try:
        # Extract text from the PDF
        document_text = extract_text_from_pdf(temp_path)
    finally:
        # Remove temporary file after use, even when extraction raised
        if os.path.exists(temp_path):
            os.remove(temp_path)

    # Display the extracted text
    st.write("Extracted Text:")
    st.write(document_text)

    if not document_text.strip():
        # Without any text there is nothing to summarize, and there is no
        # context to answer questions from.
        st.warning("No extractable text was found in this PDF.")
    else:
        if st.button("Summarize"):
            with st.spinner('Summarizing...'):
                summary = summarize_document(document_text)
                st.write("**Summary:**")
                st.write(summary)

        question = st.text_input("Ask a question about the document")

        if st.button("Get Answer"):
            # Treat a whitespace-only question the same as no question.
            if question.strip():
                with st.spinner('Generating answer...'):
                    answer = qa_pipeline({'question': question, 'context': document_text})
                    st.write("**Answer:**")
                    st.write(answer['answer'])
            else:
                st.write("Please enter a question.")