Spaces:
Sleeping
Sleeping
File size: 4,116 Bytes
1996459 5c1dd5d 1996459 5c1dd5d 1996459 8bb4143 1996459 2d16d5e 5c1dd5d 1996459 5c1dd5d 1996459 5c1dd5d 1996459 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
# This is to demonstrate the core logic for the project
# 1. Get the link to PDF
# 2. Read the content of the PDF
# 3. Iterate:
# 3.1 Create a chunk (set of pages)
# 3.2 Create summary by combining partial summary & chunk
### 1. Import the libraries
import streamlit as st
import time
import os
from dotenv import load_dotenv
from langchain.prompts import PromptTemplate
# from langchain_community.llms import HuggingFaceHub
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.document_loaders import PyPDFLoader
# This is to simplify local development
# Without this you will need to copy/paste the API key with every change
# CHANGE the location of the file
_env_file = 'C:\\Users\\raj\\.jupyter\\.env1'
# load_dotenv() does not raise when the file is missing (see the python-dotenv
# documentation), so the file is checked explicitly instead of relying on a
# bare `except:` that would also hide unrelated errors.
if os.path.isfile(_env_file):
    load_dotenv(_env_file)
else:
    print("Environment file not found !! Copy & paste your HuggingFace API key.")
# Add the API key to the session - use it for populating the interface.
# Falls back to an empty string when the variable is unset or empty.
st.session_state['HUGGINGFACEHUB_API_TOKEN'] = os.getenv('HUGGINGFACEHUB_API_TOKEN') or ''
# Prompt used for every summarization step: the LLM receives the summary
# produced so far ({summary}) plus the next chunk of the PDF ({content}) and
# is asked to return the extended summary.
template = """
extend the abstractive summary below with the new content. Keep total size of the extended summary around 3000 words.
summary:
{summary}
new content:
{content}
extended summary:
"""
prompt_template = PromptTemplate(template=template, input_variables=['summary', 'content'])
# Model for summarization
model_id = 'mistralai/Mistral-7B-Instruct-v0.2'
# Budget compared against prompt size when building chunks
CONTEXT_WINDOW_SIZE = 32000
# Passed to the endpoint as max_new_tokens
MAX_TOKENS = 2000

# Make sure the session keys read by the UI exist before the first render
for _session_key in ('SUMMARY', 'HUGGINGFACEHUB_API_TOKEN'):
    if _session_key not in st.session_state:
        st.session_state[_session_key] = ''
def _build_chunk(pages, start, budget):
    """Concatenate consecutive pages, starting at `start`, that fit in `budget`.

    Args:
        pages: sequence of objects exposing a `page_content` string.
        start: index of the first page to consider.
        budget: pages are added while the accumulated character count stays
            strictly below this value.

    Returns:
        A `(chunk_text, next_start)` tuple, where `next_start` is the index
        of the first page that was NOT consumed.
    """
    parts = []
    used = 0
    index = start
    while index < len(pages):
        text = pages[index].page_content
        if used + len(text) >= budget:
            # This page does not fit; it stays queued for the next chunk.
            break
        parts.append(text)
        used += len(text)
        index += 1

    if index == start and start < len(pages):
        # Not even a single page fits. Take a truncated slice of it so the
        # caller always makes progress instead of looping forever.
        parts.append(pages[start].page_content[:max(budget - 1, 0)])
        index = start + 1

    return ''.join(parts), index


# function to generate the summary
def generate_summary():
    """Summarize the PDF at `pdf_link` and store the result in the session.

    The PDF is loaded page by page; pages are grouped into chunks, and each
    chunk is sent to the LLM together with the summary produced so far, so
    the summary is extended iteratively. The final text is written to
    st.session_state['SUMMARY'].

    Reads the module-level names `pdf_link` and `hugging_face_api_key`,
    which are assigned from the Streamlit input widgets.
    NOTE(review): confirm the button callback always sees the latest widget
    values, since these globals are assigned after this definition.
    """
    # Create an LLM
    llm = HuggingFaceEndpoint(
        repo_id=model_id,
        max_new_tokens=MAX_TOKENS,
        huggingfacehub_api_token=hugging_face_api_key,
    )
    # Show spinner, while we are waiting for the response
    with st.spinner('Invoking LLM ... '):
        # 1. Load the PDF file
        partial_summary = ''
        pages = PyPDFLoader(pdf_link).load()
        page_count = len(pages)
        print("Number of pages = ", page_count)

        # 2. Iterate to generate the summary
        next_page_index = 0
        while next_page_index < page_count:
            # Explicit st.write instead of a bare expression picked up by
            # Streamlit "magic".
            st.write('Processing chunk, starting with page index : ', next_page_index)

            # Room left for new content once the current summary and the
            # generated tokens are accounted for. Character counts are
            # compared against the size constants here.
            budget = CONTEXT_WINDOW_SIZE - MAX_TOKENS - len(partial_summary)

            # The chunk is a set of concatenated pages; the returned index
            # points at the first page not yet summarized, so no page is
            # skipped when a chunk fills up.
            new_content, next_page_index = _build_chunk(pages, next_page_index, budget)

            # Pass the current summary and new content to LLM for summarization
            query = prompt_template.format(summary=partial_summary, content=new_content)
            partial_summary = llm.invoke(query)

    st.session_state['SUMMARY'] = partial_summary
# Title
st.title('PDF Summarizer')

# Sidebar box for the API key. It is pre-filled from the session when a token
# is available; the placeholder is what shows when the value is empty.
# type='password' keeps the key from being displayed in clear text.
hugging_face_api_key = st.sidebar.text_input(
    'HuggingFace API key',
    value=st.session_state.get('HUGGINGFACEHUB_API_TOKEN', ''),
    placeholder='copy & paste your API key',
    type='password',
)

# draw the box for query
pdf_link = st.text_input(
    'Link to PDF document',
    placeholder='copy/paste link to the PDF',
    value='https://sgp.fas.org/crs/misc/R47644.pdf',
)

# button - the summary is generated in the click callback
st.button("Generate summary", on_click=generate_summary)

# Shows whatever summary is currently stored in the session
st.text_area('Response', value=st.session_state['SUMMARY'], height=800)
|