nurindahpratiwi committed on
Commit
955df66
·
1 Parent(s): 1c3b73b

first commit

Browse files
Files changed (2) hide show
  1. app.py +78 -0
  2. requirements.txt +14 -0
app.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from langchain.document_loaders import PyPDFLoader
4
+
5
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
6
+ from transformers import pipeline
7
+ import torch
8
+ import base64
9
+
10
# Hugging Face access token read from the Streamlit secrets store.
# NOTE(review): nothing below passes this token to a loader; only the
# commented-out pipeline variant at the end of this section used it.
access_token = st.secrets["HF_TOKEN"]

# Model and tokenizer
# Alternative (smaller) checkpoint: "LaMini-Flan-T5-248M"
model_checkpoint = "MBZUAI/LaMini-Flan-T5-783M"


# show_spinner=False keeps the cached call from emitting any UI element,
# because st.set_page_config() further down must remain the first
# Streamlit command that renders anything.
@st.cache_resource(show_spinner=False)
def _load_model_and_tokenizer(checkpoint):
    """Load the T5 tokenizer and model for ``checkpoint`` once per process.

    Streamlit re-executes the whole script on every user interaction, so
    loading at module level would reload the weights on each rerun.
    ``st.cache_resource`` keeps one shared instance instead.

    Args:
        checkpoint: Model identifier or path handed to ``from_pretrained``.

    Returns:
        tuple: ``(tokenizer, model)``.
    """
    tokenizer = T5Tokenizer.from_pretrained(checkpoint)
    loaded_model = T5ForConditionalGeneration.from_pretrained(
        checkpoint, device_map='auto', torch_dtype=torch.float32
    )
    return tokenizer, loaded_model


model_tokenizer, model = _load_model_and_tokenizer(model_checkpoint)

# Alternative kept for reference: let transformers build the pipeline itself.
#REPO_ID = "MBZUAI/LaMini-Flan-T5-783M"
#model = pipeline(task='summarization', model=REPO_ID, token=access_token)
20
+
21
+ # File loader and preprocessing
22
def preprocess_pdf(file):
    """Load a PDF and return its text as a single concatenated string.

    Args:
        file: Path of the PDF, passed unchanged to ``PyPDFLoader``.

    Returns:
        str: The ``page_content`` of every chunk produced by the splitter,
        joined with no separator (empty string when there are no chunks).
    """
    loader = PyPDFLoader(file)
    pages = loader.load_and_split()
    # NOTE(review): chunks are created with chunk_overlap=70 and then glued
    # straight back together, so overlapping text may appear twice in the
    # result -- confirm this is intended before changing the parameters.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=170, chunk_overlap=70)
    texts = text_splitter.split_documents(pages)
    # str.join builds the result in one pass instead of repeated `+=`.
    return "".join(text.page_content for text in texts)
31
+
32
+ # Language Model pipeline
33
def language_model_pipeline(filepath):
    """Summarize the PDF stored at ``filepath``.

    Builds a ``'summarization'`` pipeline around the module-level ``model``
    and ``model_tokenizer``, feeds it the text extracted by
    ``preprocess_pdf`` and returns the first result's ``'summary_text'``.

    Args:
        filepath: Path of the PDF to summarize.

    Returns:
        The ``'summary_text'`` entry of the first pipeline result.
    """
    length_limits = {"max_length": 500, "min_length": 70}
    # The pipeline is created before the PDF is read, as in the original flow.
    summarizer = pipeline(
        'summarization',
        model=model,
        tokenizer=model_tokenizer,
        **length_limits,
    )
    outputs = summarizer(preprocess_pdf(filepath))
    first_output = outputs[0]
    return first_output['summary_text']
44
+
45
# Function to display the PDF content
def display_pdf(file):
    """Render the PDF at path ``file`` inline as a base64 ``data:`` iframe.

    This function is intentionally not wrapped in ``st.cache_data``: the
    cache would be keyed on the path string alone, so a different document
    saved under the same file name would replay the previously rendered
    PDF instead of the new one.

    Args:
        file: Path of the PDF file to embed.

    Returns:
        None. The iframe is written to the page with ``st.markdown``.
    """
    with open(file, "rb") as f:
        base64_pdf = base64.b64encode(f.read()).decode('utf-8')

    pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'
    # Raw HTML is required for the iframe, hence unsafe_allow_html.
    st.markdown(pdf_display, unsafe_allow_html=True)
53
+
54
# Streamlit code
# Page-level configuration: use the wide layout so the two columns created
# in main() have room for the PDF preview and the summary side by side.
st.set_page_config(layout="wide")
56
+
57
def main():
    """Run the Streamlit page: upload a PDF, preview it and show a summary.

    The uploaded file is saved under the local ``pdf/`` directory, rendered
    in the left column and summarized in the right column. Nothing happens
    until a file has been uploaded and the "Summarize" button is pressed.
    """
    import os  # local import so this function does not rely on a new file-level import

    st.title("Document Summarization App using Language Model")

    uploaded_file = st.file_uploader("Upload your PDF file", type=['pdf'])

    if uploaded_file is None:
        return
    if not st.button("Summarize"):
        return

    col1, col2 = st.columns(2)

    # The target directory may not exist on a fresh deployment.
    os.makedirs("pdf", exist_ok=True)
    # The file name comes from the client; keep only its final component so
    # it cannot point outside the pdf/ directory.
    filepath = "pdf/" + os.path.basename(uploaded_file.name)
    with open(filepath, "wb") as temp_file:
        # getvalue() returns the full upload regardless of the current
        # read position of the buffer.
        temp_file.write(uploaded_file.getvalue())

    with col1:
        st.info("Uploaded File")
        display_pdf(filepath)

    with col2:
        summarized_result = language_model_pipeline(filepath)
        st.info("Summarization Complete")
        st.success(summarized_result)
76
+
77
# Entry point: build the page only when this file is executed as the main
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ sentence_transformers
3
+ torch
4
+ sentencepiece
5
+ transformers
6
+ accelerate
7
+ chromadb
8
+ pypdf
9
+ tiktoken
10
+ streamlit
11
+ fastapi
12
+ uvicorn
13
+ python-multipart
14
+ aiofiles