Spaces:
Running
Running
Create app.py
Browse files
app.py
CHANGED
@@ -3,25 +3,24 @@ import base64
|
|
3 |
from tempfile import NamedTemporaryFile
|
4 |
import streamlit as st
|
5 |
from transformers import pipeline
|
6 |
-
from
|
7 |
-
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
8 |
|
9 |
-
# Load model directly
|
10 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
11 |
|
|
|
12 |
tokenizer = AutoTokenizer.from_pretrained("MBZUAI/LaMini-Flan-T5-248M")
|
13 |
base_model = AutoModelForSeq2SeqLM.from_pretrained("MBZUAI/LaMini-Flan-T5-248M")
|
14 |
|
15 |
-
#
|
16 |
-
def
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
return
|
25 |
|
26 |
# LLM pipeline for summarization
|
27 |
def llm_pipeline(input_text):
|
@@ -36,16 +35,18 @@ def llm_pipeline(input_text):
|
|
36 |
return result[0]['summary_text']
|
37 |
|
38 |
@st.cache_data
def displayPDF(file_path):
    """Embed the PDF stored at ``file_path`` in the Streamlit page.

    The file is read as raw bytes, base64-encoded, and placed in a
    ``data:`` URI used as the source of an ``<iframe>`` that is emitted
    through ``st.markdown`` with HTML enabled.
    """
    with open(file_path, "rb") as pdf_file:
        raw_bytes = pdf_file.read()
    encoded = base64.b64encode(raw_bytes).decode('utf-8')
    iframe_html = f'<iframe src="data:application/pdf;base64,{encoded}" width="100%" height="600" type="application/pdf"></iframe>'
    st.markdown(iframe_html, unsafe_allow_html=True)
|
45 |
|
|
|
46 |
# Streamlit App
|
47 |
def main():
|
48 |
-
st.title('Content Summarizer')
|
49 |
|
50 |
# PDF Upload Section
|
51 |
uploaded_file = st.file_uploader("Upload your PDF file", type=['pdf'])
|
@@ -64,9 +65,10 @@ def main():
|
|
64 |
|
65 |
with col2:
|
66 |
st.info("Summarization")
|
67 |
-
input_text =
|
68 |
-
|
69 |
-
|
|
|
70 |
|
71 |
# Text Input Section
|
72 |
st.header("Summarize Your Text")
|
|
|
3 |
from tempfile import NamedTemporaryFile
|
4 |
import streamlit as st
|
5 |
from transformers import pipeline
|
6 |
+
from PyPDF2 import PdfReader
|
|
|
7 |
|
|
|
8 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
9 |
|
10 |
+
# Load the summarization model
|
11 |
tokenizer = AutoTokenizer.from_pretrained("MBZUAI/LaMini-Flan-T5-248M")
|
12 |
base_model = AutoModelForSeq2SeqLM.from_pretrained("MBZUAI/LaMini-Flan-T5-248M")
|
13 |
|
14 |
+
# Function to extract text from a PDF using PyPDF2
|
15 |
+
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, joined in page order.

    Args:
        pdf_path: Value passed straight to ``PdfReader`` to open the PDF.

    Returns:
        A single string containing the extracted text of all pages.

    Raises:
        ValueError: If the combined text is empty or whitespace only.
    """
    reader = PdfReader(pdf_path)
    # Join once instead of repeated `+=`; `or ""` is a defensive guard so a
    # page whose extraction yields nothing (None/empty) cannot break the join.
    text = "".join(page.extract_text() or "" for page in reader.pages)
    if not text.strip():
        raise ValueError("The PDF file contains no extractable text.")
    return text
|
24 |
|
25 |
# LLM pipeline for summarization
|
26 |
def llm_pipeline(input_text):
|
|
|
35 |
return result[0]['summary_text']
|
36 |
|
37 |
@st.cache_data
def displayPDF(file_path):
    """Embed the PDF stored at ``file_path`` in the Streamlit page.

    The file is read as raw bytes, base64-encoded, and placed in a
    ``data:`` URI used as the source of an ``<iframe>`` that is emitted
    through ``st.markdown`` with HTML enabled.
    """
    with open(file_path, "rb") as pdf_file:
        raw_bytes = pdf_file.read()
    encoded = base64.b64encode(raw_bytes).decode('utf-8')
    iframe_html = f'<iframe src="data:application/pdf;base64,{encoded}" width="100%" height="600" type="application/pdf"></iframe>'
    st.markdown(iframe_html, unsafe_allow_html=True)
|
45 |
|
46 |
+
|
47 |
# Streamlit App
|
48 |
def main():
|
49 |
+
st.title('PDF Content Summarizer')
|
50 |
|
51 |
# PDF Upload Section
|
52 |
uploaded_file = st.file_uploader("Upload your PDF file", type=['pdf'])
|
|
|
65 |
|
66 |
with col2:
|
67 |
st.info("Summarization")
|
68 |
+
input_text = extract_text_from_pdf(temp_filepath)
|
69 |
+
if input_text: # Proceed only if text extraction was successful
|
70 |
+
summary = llm_pipeline(input_text)
|
71 |
+
st.success(summary)
|
72 |
|
73 |
# Text Input Section
|
74 |
st.header("Summarize Your Text")
|