Rehan3024 committed on
Commit
9d464a5
·
verified ·
1 Parent(s): 7a42243

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -0
app.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import tempfile

import fitz  # PyMuPDF
import streamlit as st
from sentence_transformers import SentenceTransformer
from transformers import (
    AutoModelForQuestionAnswering,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    pipeline,
)
6
+
7
# Load the models
summarization_model_name = 'facebook/bart-large-cnn'
qa_model_name = 'distilbert-base-uncased-distilled-squad'


@st.cache_resource
def _load_models(summarization_name, qa_name):
    """Build the summarization and question-answering models once per process.

    Streamlit re-executes the whole script on every widget interaction, so
    without caching both models would be re-instantiated on each button
    click. ``st.cache_resource`` keeps a single shared copy instead.

    Args:
        summarization_name: Hub id of the seq2seq summarization checkpoint.
        qa_name: Hub id of the extractive question-answering checkpoint.

    Returns:
        A tuple ``(tokenizer, summarization_model, qa_tokenizer, qa_model,
        qa_pipeline)``.
    """
    summ_tokenizer = AutoTokenizer.from_pretrained(summarization_name)
    summ_model = AutoModelForSeq2SeqLM.from_pretrained(summarization_name)

    answer_tokenizer = AutoTokenizer.from_pretrained(qa_name)
    answer_model = AutoModelForQuestionAnswering.from_pretrained(qa_name)
    answer_pipeline = pipeline(
        'question-answering', model=answer_model, tokenizer=answer_tokenizer
    )
    return summ_tokenizer, summ_model, answer_tokenizer, answer_model, answer_pipeline


# Keep the original module-level names so the rest of the script is unchanged.
(
    tokenizer,
    summarization_model,
    qa_tokenizer,
    qa_model,
    qa_pipeline,
) = _load_models(summarization_model_name, qa_model_name)
16
+
17
+ # Function to extract text from a PDF file
18
def extract_text_from_pdf(file):
    """Return the concatenated text of every page of a PDF.

    Args:
        file: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        The text of all pages joined together in page order.
    """
    # The context manager closes the document (and its underlying file
    # handle) even if reading a page raises; previously it was never closed.
    with fitz.open(file) as doc:
        return "".join(page.get_text() for page in doc)
24
+
25
+ # Function to summarize document
26
def summarize_document(document):
    """Generate an abstractive summary of ``document``.

    The text is tokenized with truncation at 1024 tokens, so only the
    beginning of a longer document contributes to the summary. Generation
    uses beam search with 4 beams and produces between 30 and 150 tokens.

    Args:
        document: Plain text to summarize.

    Returns:
        The decoded summary, or an empty string when ``document`` is empty
        or contains only whitespace.
    """
    # Nothing to summarize: skip the model call, which would otherwise
    # generate text from an input holding no content.
    if not document or not document.strip():
        return ""

    inputs = tokenizer(
        document, return_tensors='pt', max_length=1024, truncation=True
    )
    summary_ids = summarization_model.generate(
        inputs['input_ids'],
        # Pass the mask explicitly rather than letting generate() infer it.
        attention_mask=inputs['attention_mask'],
        max_length=150,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
30
+
31
# Streamlit app: upload a PDF, show its text, then summarize it or answer
# questions about it on demand.
st.title("PDF Summarizer and Q&A")
st.write("Upload a PDF file to get a summary and ask questions about the content.")

uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
    # Save the upload under a unique temporary name. A fixed name such as
    # "temp.pdf" would be shared, and overwritten, by concurrent sessions.
    # delete=False allows the file to be reopened by path once it is closed.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(uploaded_file.getbuffer())
        temp_path = tmp.name

    try:
        # Extract text from the PDF
        document_text = extract_text_from_pdf(temp_path)
    finally:
        # Remove the temporary file even when extraction fails.
        if os.path.exists(temp_path):
            os.remove(temp_path)

    # Display the extracted text
    st.write("Extracted Text:")
    st.write(document_text)

    if st.button("Summarize"):
        with st.spinner('Summarizing...'):
            summary = summarize_document(document_text)
        st.write("**Summary:**")
        st.write(summary)

    question = st.text_input("Ask a question about the document")

    if st.button("Get Answer"):
        if not question:
            st.write("Please enter a question.")
        elif not document_text.strip():
            # There is no context to search, so do not call the QA model.
            st.write("No text could be extracted from this PDF, so the question cannot be answered.")
        else:
            with st.spinner('Generating answer...'):
                answer = qa_pipeline(question=question, context=document_text)
            st.write("**Answer:**")
            st.write(answer['answer'])