# PDF summarization demo (Streamlit)
# This is to demonstrate the core logic for the project
# 1. Get the link to the PDF
# 2. Read the content of the PDF
# 3. Iterate:
#    3.1 Create a chunk (a set of pages)
#    3.2 Create a summary by combining the partial summary & the chunk
### 1. Import the libraries | |
import streamlit as st | |
import time | |
import os | |
from dotenv import load_dotenv | |
from langchain.prompts import PromptTemplate | |
# from langchain_community.llms import HuggingFaceHub | |
from langchain_community.llms import HuggingFaceEndpoint | |
from langchain_community.document_loaders import PyPDFLoader | |
# This is to simplify local development.
# Without this you would need to copy/paste the API key with every change.
try:
    # CHANGE the location of the file to match your environment
    load_dotenv('C:\\Users\\raj\\.jupyter\\.env1')
    # Pre-fill the session with the API key (empty string when not set) so
    # the sidebar text input can show it.
    st.session_state['HUGGINGFACEHUB_API_TOKEN'] = os.getenv('HUGGINGFACEHUB_API_TOKEN') or ''
except Exception:
    # BUG FIX: was a bare `except:`, which also swallows SystemExit and
    # KeyboardInterrupt; `Exception` is the narrowest safe catch-all here.
    print("Environment file not found !! Copy & paste your HuggingFace API key.")
# Prompt used for incremental ("rolling") summarization: each LLM call is
# asked to EXTEND the running summary with the next chunk of page text,
# keeping the result around 3000 words.
template = """
extend the abstractive summary below with the new content. Keep total size of the extended summary around 3000 words.
summary:
{summary}
new content:
{content}
extended summary:
"""
# Two input slots: the summary accumulated so far, and the new chunk text.
prompt_template = PromptTemplate(
    input_variables = ['summary', 'content'],
    template = template
)
# Model used for summarization
model_id = 'mistralai/Mistral-7B-Instruct-v0.2'
# Model context window and generation budget (both rough, char-based below)
CONTEXT_WINDOW_SIZE = 32000
MAX_TOKENS = 2000

# Ensure the session keys exist before any widget reads them.
for _key in ('SUMMARY', 'HUGGINGFACEHUB_API_TOKEN'):
    if _key not in st.session_state:
        st.session_state[_key] = ''
# function to generate the summary
def generate_summary():
    """Load the PDF at `pdf_link` and build its summary iteratively.

    Pages are concatenated into chunks sized (by a character-count proxy
    for tokens) to fit the model context window together with the running
    summary and the generation budget. Each chunk is sent to the LLM with
    the partial summary, which the prompt asks it to extend. The result is
    written to st.session_state['SUMMARY'] after every chunk.

    Reads module-level globals: `pdf_link`, `hugging_face_api_key`,
    `model_id`, `prompt_template`, `CONTEXT_WINDOW_SIZE`, `MAX_TOKENS`.
    """
    # Create an LLM endpoint; the key comes from the sidebar text input
    llm = HuggingFaceEndpoint(
        repo_id=model_id,
        max_new_tokens=MAX_TOKENS,
        huggingfacehub_api_token=hugging_face_api_key
    )
    # Show spinner while we are waiting for the responses
    with st.spinner('Invoking LLM ... '):
        # 1. Load the PDF file
        partial_summary = ''
        loader = PyPDFLoader(pdf_link)
        pages = loader.load()
        print("Number of pages = ", len(pages))
        # 2. Iterate: build a chunk, summarize, repeat until pages exhausted
        next_page_index = 0
        while next_page_index < len(pages):
            # Explicit st.write instead of relying on Streamlit "magic"
            # (a bare expression) to render progress.
            st.write('Processing chunk, starting with page index :', next_page_index)
            # Chunk = concatenation of as many pages as fit the window.
            # NOTE: len() counts characters, used as a rough token proxy.
            new_content = ''
            pages_consumed = 0
            for doc in pages[next_page_index:]:
                if len(partial_summary) + len(new_content) + len(doc.page_content) + MAX_TOKENS < CONTEXT_WINDOW_SIZE:
                    new_content += doc.page_content
                    pages_consumed += 1
                else:
                    break
            # A single page larger than the remaining window would consume
            # nothing and loop forever: send it on its own anyway.
            if pages_consumed == 0:
                new_content = pages[next_page_index].page_content
                pages_consumed = 1
            # BUG FIX: the original advanced by `last_i + 1`, i.e. past the
            # first page that did NOT fit, silently dropping that page from
            # the summary. Advance only by the pages actually consumed.
            next_page_index += pages_consumed
            # Ask the LLM to extend the current summary with the new chunk
            query = prompt_template.format(summary=partial_summary, content=new_content)
            partial_summary = llm.invoke(query)
            st.session_state['SUMMARY'] = partial_summary
# Title
st.title('PDF Summarizer')

# Sidebar input for the HuggingFace API key, pre-filled from the session
# when the .env file was loaded successfully.
if 'HUGGINGFACEHUB_API_TOKEN' in st.session_state:
    hugging_face_api_key = st.sidebar.text_input('HuggingFace API key', value=st.session_state['HUGGINGFACEHUB_API_TOKEN'])
else:
    hugging_face_api_key = st.sidebar.text_input('HuggingFace API key', placeholder='copy & paste your API key')

# Input box for the PDF link (defaults to a sample CRS report)
pdf_link = st.text_input('Link to PDF document', placeholder='copy/paste link to the PDF', value='https://sgp.fas.org/crs/misc/R47644.pdf')

# Button triggers the summarization callback; the result appears below.
# BUG FIX: user-facing typo "Generate sumary" -> "Generate summary".
st.button("Generate summary", on_click=generate_summary)
st.text_area('Response', value=st.session_state['SUMMARY'], height=800)