acloudfan committed on
Commit
1996459
·
verified ·
1 Parent(s): ad1273c

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +126 -0
  2. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This is to demonstrate the core logic for the project
2
+
3
+ # 1. Get the link to PDF
4
+ # 2. Read the content of the PDF
5
+ # 3. Iterate:
6
+ # 3.1 Create a chunk (set of pages)
7
+ # 3.2 Create summary by combining partial summary & chunk
8
+
9
+
10
+ ### 1. Import the libraries
11
+ import streamlit as st
12
+ import time
13
+ import os
14
+ from dotenv import load_dotenv
15
+
16
+ from langchain.prompts import PromptTemplate
17
+
18
+ # from langchain_community.llms import HuggingFaceHub
19
+ from langchain_community.llms import HuggingFaceEndpoint
20
+ from langchain_community.document_loaders import PyPDFLoader
21
+
22
# This is to simplify local development.
# Without this you will need to copy/paste the API key with every change.
#
# load_dotenv() signals a missing (or empty) file through its return value
# rather than by raising, so the result is checked explicitly. Only errors
# from reading/decoding the file are caught; this keeps the step best-effort
# without hiding unrelated failures.
try:
    # CHANGE the location of the file
    _env_loaded = load_dotenv('C:\\Users\\raj\\.jupyter\\.env')
except (OSError, ValueError):
    _env_loaded = False

if not _env_loaded:
    print("Environment file not found !! Copy & paste your HuggingFace API key.")

# Add the API key to the session - it is used for populating the interface.
# The variable may also come from the process environment, so this check
# runs whether or not the file was loaded.
_env_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
if _env_token:
    st.session_state['HUGGINGFACEHUB_API_TOKEN'] = _env_token
32
+
33
+
34
# Prompt applied on every iteration: the model receives the summary built so
# far ({summary}) plus the next chunk of the document ({content}) and is asked
# to return the extended summary.
template = """
extend the abstractive summary below with the new content. Keep total size of the extended summary around 3000 words.

summary:
{summary}

new content:
{content}

extended summary:

"""

# Both placeholders are filled in by generate_summary() via .format(...)
prompt_template = PromptTemplate(template=template, input_variables=['summary', 'content'])
52
+
53
# Model used for summarization
model_id = 'mistralai/Mistral-7B-Instruct-v0.2'

# Size limits used while chunking the document:
#   CONTEXT_WINDOW_SIZE - upper bound for summary + chunk + response
#                         (generate_summary compares it against len() of text)
#   MAX_TOKENS          - passed to the endpoint as max_new_tokens
CONTEXT_WINDOW_SIZE = 32_000
MAX_TOKENS = 2_000

# Make sure the slot that backs the 'Response' text area exists on first run
st.session_state.setdefault('SUMMARY', '')
61
+
62
def build_chunk(pages, start, budget):
    """Concatenate consecutive pages into one chunk without exceeding ``budget``.

    Pages are taken in order beginning at index ``start``; a page is added
    only while the accumulated character count stays strictly below
    ``budget``.

    Args:
        pages: sequence of objects exposing a ``page_content`` string.
        start: index of the first page to consider.
        budget: maximum number of characters available for the chunk.

    Returns:
        A ``(chunk_text, next_index)`` tuple, where ``next_index`` is the
        index of the first page that was NOT consumed.
    """
    parts = []
    used = 0
    index = start
    while index < len(pages):
        text = pages[index].page_content
        if used + len(text) >= budget:
            # This page does not fit: leave it for the next chunk instead of
            # skipping over it.
            break
        parts.append(text)
        used += len(text)
        index += 1

    if index == start and start < len(pages):
        # A single page larger than the budget: take it whole so that the
        # caller always makes progress and no page is silently dropped.
        parts.append(pages[start].page_content)
        index = start + 1

    return ''.join(parts), index


# Function to generate the summary
def generate_summary():
    """Summarize the PDF at ``pdf_link`` and store the result in the session.

    The document is loaded page by page, then processed iteratively: each
    round builds a chunk of consecutive pages that fits the remaining
    budget, and asks the LLM to extend the summary produced so far with
    that chunk. After every round the current summary is written to
    ``st.session_state['SUMMARY']``.

    Reads the module-level ``pdf_link``, ``model_id``, ``prompt_template``,
    ``CONTEXT_WINDOW_SIZE`` and ``MAX_TOKENS``. Returns nothing.
    """
    # Create an LLM
    llm = HuggingFaceEndpoint(
        repo_id=model_id,
        max_new_tokens=MAX_TOKENS
    )

    # Show spinner, while we are waiting for the response
    with st.spinner('Invoking LLM ... '):
        # 1. Load the PDF file
        partial_summary = ''
        loader = PyPDFLoader(pdf_link)
        pages = loader.load()
        page_count = len(pages)
        print("Number of pages = ", page_count)

        # 2. Iterate to generate the summary
        next_page_index = 0
        while next_page_index < page_count:
            # Explicit call instead of a bare expression statement
            st.write('Processing chunk, starting with page index : ', next_page_index)

            # Room left for new content once the running summary and the
            # space reserved for the response are accounted for.
            budget = CONTEXT_WINDOW_SIZE - MAX_TOKENS - len(partial_summary)

            # Chunk = a set of concatenated pages; also advances the index
            new_content, next_page_index = build_chunk(pages, next_page_index, budget)

            # Pass the current summary and new content to LLM for summarization
            query = prompt_template.format(summary=partial_summary, content=new_content)
            partial_summary = llm.invoke(query)

            # Stored every round so the latest summary is kept in the session
            st.session_state['SUMMARY'] = partial_summary
107
+
108
+
109
# Title
st.title('PDF Summarizer')

# Sidebar field for the HuggingFace API key; pre-filled when a token was
# placed in the session state earlier in the script. The input is masked
# because it holds a secret.
# NOTE: the variable name `cohere_api_key` is kept as-is for compatibility.
if 'HUGGINGFACEHUB_API_TOKEN' in st.session_state:
    cohere_api_key = st.sidebar.text_input(
        'HuggingFace API key',
        value=st.session_state['HUGGINGFACEHUB_API_TOKEN'],
        type='password',
    )
else:
    cohere_api_key = st.sidebar.text_input(
        'HuggingFace API key',
        placeholder='copy & paste your API key',
        type='password',
    )

# Export the key so it is actually used: previously the value typed here was
# never passed on. The HuggingFace endpoint client is documented to read this
# environment variable when no token is passed explicitly.
if cohere_api_key:
    os.environ['HUGGINGFACEHUB_API_TOKEN'] = cohere_api_key


# draw the box for query
pdf_link = st.text_input('Link to PDF document', placeholder='copy/paste link to the PDF', value='https://sgp.fas.org/crs/misc/R47644.pdf')

# button - generate_summary runs as the click callback
st.button("Generate sumary", on_click=generate_summary)


# Shows whatever summary is currently stored in the session
st.text_area('Response', value=st.session_state['SUMMARY'], height=800)
126
+
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ python-dotenv
2
+ langchain