mishrasahil934 committed on
Commit
432a8d9
·
verified ·
1 Parent(s): d31706e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +88 -58
app.py CHANGED
@@ -1,93 +1,123 @@
 
 
1
  import os
2
- import base64
3
- from tempfile import NamedTemporaryFile
4
  import streamlit as st
 
 
 
5
  from transformers import pipeline
6
- from PyPDF2 import PdfReader
 
 
7
 
 
8
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
9
 
10
# Checkpoint shared by the tokenizer and the seq2seq summarization model.
MODEL_CHECKPOINT = "MBZUAI/LaMini-Flan-T5-248M"

# Both objects are created once at import time and reused by every
# summarization call in this module.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
base_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)
13
 
14
# Function to extract text from a PDF using PyPDF2
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the PDF, passed straight to ``PdfReader``.

    Returns:
        A single string containing the extracted text of all pages.

    Raises:
        ValueError: If the combined text is empty or whitespace only.
    """
    reader = PdfReader(pdf_path)
    # A page without a text layer may produce a falsy result; coerce it to ""
    # so one such page cannot abort the whole extraction with a TypeError.
    # join() also avoids rebuilding the string once per page.
    text = "".join(page.extract_text() or "" for page in reader.pages)
    if not text.strip():
        raise ValueError("The PDF file contains no extractable text.")
    return text
24
-
25
# LLM pipeline for summarization
def llm_pipeline(input_text):
    """Summarize ``input_text`` with the module-level model and tokenizer.

    Args:
        input_text: The text to summarize.

    Returns:
        The ``summary_text`` of the first result produced by the pipeline.
    """
    pipe_sum = pipeline(
        'summarization',
        model=base_model,
        tokenizer=tokenizer,
        max_length=500,
        min_length=50,
    )
    # truncation=True clips the tokenized input to the model's maximum input
    # length; without it an over-long document is passed through unclipped.
    result = pipe_sum(input_text, truncation=True)
    return result[0]['summary_text']
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
@st.cache_data
def displayPDF(file_path):
    """Embed the PDF stored at ``file_path`` in the page as an inline iframe.

    The file is read in binary mode, base64-encoded, and placed in a
    ``data:`` URI that is rendered through ``st.markdown``.
    """
    with open(file_path, "rb") as pdf_file:
        raw_bytes = pdf_file.read()

    base64_pdf = base64.b64encode(raw_bytes).decode('utf-8')
    pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'

    # unsafe_allow_html is needed for the raw <iframe> tag to be rendered.
    st.markdown(pdf_display, unsafe_allow_html=True)
45
 
 
 
 
 
46
 
47
# Streamlit App
def main():
    """Render the summarizer page.

    The page has two independent sections: one that summarizes an uploaded
    PDF and one that summarizes text typed into a text area.
    """
    st.title('PDF Content Summarizer')

    # PDF Upload Section
    uploaded_file = st.file_uploader("Upload your PDF file", type=['pdf'])
    if uploaded_file is not None and st.button("Summarize PDF"):
        col1, col2 = st.columns(2)

        # Save the upload to a real temporary file. This step was previously
        # wrapped in a string literal, so temp_filepath was never defined and
        # the first use below raised NameError.
        with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(uploaded_file.getvalue())
            temp_filepath = temp_file.name

        try:
            with col1:
                st.info("Uploaded PDF File")
                displayPDF(temp_filepath)

            with col2:
                st.info("Summarization")
                try:
                    # Extract once; the text is only needed for the summary.
                    input_text = extract_text_from_pdf(temp_filepath)
                except ValueError as err:
                    # Raised when the PDF has no extractable text.
                    st.error(str(err))
                else:
                    st.success(llm_pipeline(input_text))
        finally:
            # delete=False keeps the file after the with-block closes it, so
            # it has to be removed explicitly.
            os.remove(temp_filepath)

    # Text Input Section
    st.header("Summarize Your Text")
    user_input = st.text_area("Enter your content here:", height=200)
    if st.button("Summarize Text"):
        if user_input.strip():
            col1, col2 = st.columns(2)

            with col1:
                st.info("Original Content")
                st.write(user_input)

            with col2:
                st.info("Summarization")
                summary = llm_pipeline(user_input)
                st.success(summary)
        else:
            st.warning("Please enter some content to summarize.")


if __name__ == '__main__':
    main()
 
1
+ from dotenv import load_dotenv
2
+ load_dotenv()
3
  import os
 
 
4
  import streamlit as st
5
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
6
+ from langchain.document_loaders import PyPDFLoader,DirectoryLoader
7
+ from langchain.chains.summarize import load_summarize_chain
8
  from transformers import pipeline
9
+ import torch
10
+ import base64
11
+
12
 
13
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint shared by the tokenizer and the seq2seq summarization model.
MODEL_CHECKPOINT = "MBZUAI/LaMini-Flan-T5-248M"

# Both objects are created once at import time and reused by every
# summarization call in this module.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
base_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)
18
 
19
# File loading and preprocessing
def file_preprocessing(file):
    """Load a PDF and return its text as a single string.

    Args:
        file: Path of the PDF, passed to ``PyPDFLoader``.

    Returns:
        The ``page_content`` of every loaded document, joined with newlines
        in the order the loader returned them.
    """
    # The previous version split the text into overlapping chunks and then
    # concatenated the chunks again, which repeated each overlapped span in
    # the result. Loading the documents unsplit and joining them keeps each
    # piece of text exactly once.
    pages = PyPDFLoader(file).load()
    return "\n".join(page.page_content for page in pages)
30
+
31
# LLM summarization pipeline
def _summarize(input_text):
    """Summarize ``input_text`` with the module-level model and tokenizer.

    Returns an empty string for blank input, because there is nothing to
    summarize and ``min_length=50`` would force the model to invent text.
    """
    if not input_text.strip():
        return ""
    pipe_sum = pipeline(
        'summarization',
        model=base_model,
        tokenizer=tokenizer,
        max_length=500,
        min_length=50,
    )
    # truncation=True clips the tokenized input to the model's maximum input
    # length; without it an over-long document is passed through unclipped.
    result = pipe_sum(input_text, truncation=True)
    return result[0]['summary_text']


def llm_pipleline(filepath):
    """Summarize the PDF at ``filepath``.

    Args:
        filepath: Path of the PDF; its text is obtained through
            ``file_preprocessing``.

    Returns:
        The summary text, or an empty string if the PDF yielded no text.
    """
    return _summarize(file_preprocessing(filepath))


def llm_pipleline1(ans):
    """Summarize the text ``ans`` directly.

    Args:
        ans: The text to summarize.

    Returns:
        The summary text, or an empty string if ``ans`` is blank.
    """
    return _summarize(ans)
56
 
# Function to display the PDF file
def displayPDF(file):
    """Embed the PDF stored at path ``file`` in the page as an inline iframe.

    This function is intentionally not wrapped in ``st.cache_data``: the
    cache would be keyed on the path string alone, so a different PDF saved
    under the same path would be shown as the earlier, stale document.

    Args:
        file: Path of the PDF file to display.
    """
    # Read the raw bytes and base64-encode them for use in a data: URI.
    with open(file, "rb") as f:
        base_pdf = base64.b64encode(f.read()).decode('utf-8')

    # Embedding PDF in HTML
    pdf_display = f'<iframe src="data:application/pdf;base64,{base_pdf}" width="100%" height="600" type="application/pdf"></iframe>'

    # unsafe_allow_html is needed for the raw <iframe> tag to be rendered.
    st.markdown(pdf_display, unsafe_allow_html=True)
69
 
70
# Streamlit page setup
import os

# Use the full browser width for the page layout.
st.set_page_config(layout="wide")
74
 
 
def main():
    """Render the summarizer page.

    The page has two independent sections: one that summarizes an uploaded
    PDF (shown next to its summary) and one that summarizes text typed into
    a text area.
    """
    st.title('Content Summarizer')

    # --- PDF section ---
    uploaded_file = st.file_uploader("Upload your PDF file", type=['pdf'])

    if uploaded_file is None:
        st.warning("Please upload a valid PDF file.")
    elif st.button("Summarize"):
        col1, col2 = st.columns(2)

        # Ensure the directory exists; exist_ok avoids a check-then-create race.
        data_dir = "data"
        os.makedirs(data_dir, exist_ok=True)

        # The file name comes from the client, so keep only its final
        # component to stop it from pointing outside data_dir.
        filepath = os.path.join(data_dir, os.path.basename(uploaded_file.name))
        with open(filepath, 'wb') as temp_file:
            temp_file.write(uploaded_file.getvalue())

        with col1:
            st.info("Uploaded PDF File")
            displayPDF(filepath)

        with col2:
            st.info("Summarization is below")
            summary = llm_pipleline(filepath)
            st.success(summary)

    # --- Free-text section ---
    # A text area replaces the previous input() call, which blocked on the
    # server's stdin. The submit button is no longer nested inside another
    # button: a button is only True on the rerun triggered by its own click,
    # so the nested branch could never execute.
    ans = st.text_area("enter your content")
    if st.button("Enter"):
        if not ans.strip():
            st.warning("Please enter some content to summarize.")
        else:
            col1, col2 = st.columns(2)

            with col1:
                st.info("what you have entered")
                # Show the text on the page rather than printing it to the
                # server console.
                st.write(ans)

            with col2:
                st.info("Summarization is below")
                summary1 = llm_pipleline1(ans)
                st.success(summary1)


if __name__ == '__main__':
    main()