mishrasahil934 committed on
Commit
9dca518
·
verified ·
1 Parent(s): 0176e0e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -18
app.py CHANGED
@@ -3,25 +3,24 @@ import base64
3
  from tempfile import NamedTemporaryFile
4
  import streamlit as st
5
  from transformers import pipeline
6
- from langchain.document_loaders import PyPDFLoader
7
- from langchain.text_splitter import RecursiveCharacterTextSplitter
8
 
9
- # Load model directly
10
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
11
 
 
12
  tokenizer = AutoTokenizer.from_pretrained("MBZUAI/LaMini-Flan-T5-248M")
13
  base_model = AutoModelForSeq2SeqLM.from_pretrained("MBZUAI/LaMini-Flan-T5-248M")
14
 
15
- # File loader and processing
16
- def file_preprocessing(file):
17
- loader = PyPDFLoader(file)
18
- pages = loader.load_and_split()
19
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
20
- texts = text_splitter.split_documents(pages)
21
- final_texts = ""
22
- for text in texts:
23
- final_texts += text.page_content
24
- return final_texts
25
 
26
  # LLM pipeline for summarization
27
  def llm_pipeline(input_text):
@@ -36,16 +35,18 @@ def llm_pipeline(input_text):
36
  return result[0]['summary_text']
37
 
38
  @st.cache_data
39
- # Function to display the PDF file
40
  def displayPDF(file_path):
 
41
  with open(file_path, "rb") as f:
42
  base64_pdf = base64.b64encode(f.read()).decode('utf-8')
43
  pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'
44
  st.markdown(pdf_display, unsafe_allow_html=True)
45
 
 
46
  # Streamlit App
47
  def main():
48
- st.title('Content Summarizer')
49
 
50
  # PDF Upload Section
51
  uploaded_file = st.file_uploader("Upload your PDF file", type=['pdf'])
@@ -64,9 +65,10 @@ def main():
64
 
65
  with col2:
66
  st.info("Summarization")
67
- input_text = file_preprocessing(temp_filepath)
68
- summary = llm_pipeline(input_text)
69
- st.success(summary)
 
70
 
71
  # Text Input Section
72
  st.header("Summarize Your Text")
 
3
  from tempfile import NamedTemporaryFile
4
  import streamlit as st
5
  from transformers import pipeline
6
+ from PyPDF2 import PdfReader
 
7
 
 
8
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
9
 
10
+ # Load the summarization model
11
  tokenizer = AutoTokenizer.from_pretrained("MBZUAI/LaMini-Flan-T5-248M")
12
  base_model = AutoModelForSeq2SeqLM.from_pretrained("MBZUAI/LaMini-Flan-T5-248M")
13
 
14
+ # Function to extract text from a PDF using PyPDF2
15
+ def extract_text_from_pdf(pdf_path):
16
+
17
+ reader = PdfReader(pdf_path)
18
+ text = ""
19
+ for page in reader.pages:
20
+ text += page.extract_text() # Extract text from each page
21
+ if not text.strip():
22
+ raise ValueError("The PDF file contains no extractable text.")
23
+ return text
24
 
25
  # LLM pipeline for summarization
26
  def llm_pipeline(input_text):
 
35
  return result[0]['summary_text']
36
 
37
  @st.cache_data
38
+ # Function to display the PDF
39
  def displayPDF(file_path):
40
+
41
  with open(file_path, "rb") as f:
42
  base64_pdf = base64.b64encode(f.read()).decode('utf-8')
43
  pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'
44
  st.markdown(pdf_display, unsafe_allow_html=True)
45
 
46
+
47
  # Streamlit App
48
  def main():
49
+ st.title('PDF Content Summarizer')
50
 
51
  # PDF Upload Section
52
  uploaded_file = st.file_uploader("Upload your PDF file", type=['pdf'])
 
65
 
66
  with col2:
67
  st.info("Summarization")
68
+ input_text = extract_text_from_pdf(temp_filepath)
69
+ if input_text: # Proceed only if text extraction was successful
70
+ summary = llm_pipeline(input_text)
71
+ st.success(summary)
72
 
73
  # Text Input Section
74
  st.header("Summarize Your Text")