# pdf-summarizer / app.py
# (HuggingFace Space header residue, kept as a comment so the file parses:
#  uploaded by acloudfan — "Upload 2 files" — commit 5c1dd5d verified)
# This is to demonstrate the core logic for the project
# 1. Get the link to PDF
# 2. Read the content of the PDF
# 3. Iterate:
# 3.1 Create a chunk (set of pages)
# 3.2 Create summary by combining partial summary & chunk
### 1. Import the libraries
import streamlit as st
import time
import os
from dotenv import load_dotenv
from langchain.prompts import PromptTemplate
# from langchain_community.llms import HuggingFaceHub
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.document_loaders import PyPDFLoader
# This is to simplify local development
# Without this you will need to copy/paste the API key with every change
try:
    # CHANGE the location of the file to match your environment
    load_dotenv('C:\\Users\\raj\\.jupyter\\.env1')
    # Add the API key to the session - use it for populating the interface.
    # `os.getenv` returns None when the variable is absent; coalesce to '' so
    # the sidebar text box renders blank instead of the string "None".
    st.session_state['HUGGINGFACEHUB_API_TOKEN'] = os.getenv('HUGGINGFACEHUB_API_TOKEN') or ''
except Exception:
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit still propagate.
    print("Environment file not found !! Copy & paste your HuggingFace API key.")
# Prompt to be used
# Iterative "refine"-style prompt: each call hands the model the running
# summary plus one new chunk of pages and asks for an extended summary
# capped around 3000 words.
template = """
extend the abstractive summary below with the new content. Keep total size of the extended summary around 3000 words.
summary:
{summary}
new content:
{content}
extended summary:
"""
# Two placeholders, filled in by generate_summary() on every chunk iteration.
prompt_template = PromptTemplate(
    input_variables = ['summary', 'content'],
    template = template
)
# Model for summarization
model_id = 'mistralai/Mistral-7B-Instruct-v0.2'
# Context budget: reserve MAX_TOKENS of the window for the generated output,
# leaving the rest for the prompt (running summary + page chunk).
# NOTE(review): the chunking code below compares *character* lengths against
# these *token* limits — a rough heuristic, conservative for English text.
CONTEXT_WINDOW_SIZE=32000
MAX_TOKENS=2000
# Seed session-state keys so the widgets below can read them on first render.
if 'SUMMARY' not in st.session_state:
    st.session_state['SUMMARY'] = ''
if 'HUGGINGFACEHUB_API_TOKEN' not in st.session_state:
    st.session_state['HUGGINGFACEHUB_API_TOKEN'] = ''
# function to generate the summary
def generate_summary():
    """Streamlit button callback: summarize the PDF at `pdf_link` chunk by chunk.

    Loads the PDF (one Document per page), greedily packs consecutive pages
    into chunks that fit the model's context window, and asks the LLM to
    extend the running summary with each chunk. The final summary is stored
    in st.session_state['SUMMARY'].

    Reads module-level globals: pdf_link, hugging_face_api_key,
    prompt_template, model_id, CONTEXT_WINDOW_SIZE, MAX_TOKENS.
    """
    # Create an LLM client for the hosted inference endpoint
    llm = HuggingFaceEndpoint(
        repo_id=model_id,
        max_new_tokens=MAX_TOKENS,
        huggingfacehub_api_token=hugging_face_api_key
    )
    # Show spinner, while we are waiting for the response
    with st.spinner('Invoking LLM ... '):
        # 1. Load the PDF file
        partial_summary = ''
        loader = PyPDFLoader(pdf_link)
        pages = loader.load()
        print("Number of pages = ", len(pages))
        # 2. Iterate to generate the summary, one chunk of pages at a time
        next_page_index = 0
        while next_page_index < len(pages):
            # Explicit st.write instead of the original bare-expression
            # "magic", which is unreliable inside functions.
            st.write('Processing chunk, starting with page index : ', next_page_index)
            new_content, next_page_index = _build_chunk(pages, next_page_index, len(partial_summary))
            # Pass the current summary and new content to LLM for summarization
            query = prompt_template.format(summary=partial_summary, content=new_content)
            partial_summary = llm.invoke(query)
    st.session_state['SUMMARY'] = partial_summary

def _build_chunk(pages, start, summary_len):
    """Greedily concatenate pages[start:] while the prompt fits the context window.

    Returns (chunk_text, next_index).

    BUGFIX: the original advanced `next_page_index` by `last_i + 1` even when
    the loop broke because page `last_i` did NOT fit — so the page at every
    chunk boundary was silently dropped from the summary. Here the index only
    advances past pages actually added to the chunk. A page whose text alone
    exceeds the budget (also skipped by the original) is now included on its
    own, which additionally guarantees forward progress.

    NOTE(review): the budget compares character counts against token limits —
    a heuristic carried over from the original code.
    """
    chunk = ''
    index = start
    for doc in pages[start:]:
        fits = summary_len + len(chunk) + len(doc.page_content) + MAX_TOKENS < CONTEXT_WINDOW_SIZE
        if fits or not chunk:
            # Take the page if it fits, or unconditionally when the chunk is
            # still empty (oversized single page — better truncating risk
            # downstream than losing the page entirely).
            chunk += doc.page_content
            index += 1
        else:
            break
    return chunk, index
# Title
st.title('PDF Summarizer')
# Sidebar input for the HuggingFace API key, pre-filled from session state
# when the .env bootstrap found a token.
if 'HUGGINGFACEHUB_API_TOKEN' in st.session_state:
    hugging_face_api_key = st.sidebar.text_input('HuggingFace API key', value=st.session_state['HUGGINGFACEHUB_API_TOKEN'])
else:
    hugging_face_api_key = st.sidebar.text_input('HuggingFace API key', placeholder='copy & paste your API key')
# draw the box for the PDF link (defaults to a sample CRS report)
pdf_link = st.text_input('Link to PDF document', placeholder='copy/paste link to the PDF', value='https://sgp.fas.org/crs/misc/R47644.pdf')
# Button triggers the summarization callback; result lands in session state.
st.button("Generate summary", on_click=generate_summary)  # BUGFIX: label typo "sumary"
st.text_area('Response', value=st.session_state['SUMMARY'], height=800)