# Source: Hugging Face Space by raghuram13 — "Update app.py" (commit 74f1e2b)
#GRADIO INTERFACE TO CONVERT A PDF TO TEXT AND READ IT WITH LANGCHAIN AND OPEN AI ###################################
import gradio as gr
import PyPDF2, os, sys, random, time, shutil
from pypdf import PdfReader
from llama_index import SimpleDirectoryReader, GPTListIndex, readers, GPTSimpleVectorIndex, LLMPredictor, PromptHelper
from langchain.chat_models import ChatOpenAI
from langchain import OpenAI
import openai
# NOTE(review): this module-level constant appears unused — extract_info()
# builds its own relative 'converted_pdf_to_text' path below. Confirm before removing.
directory_path = '/converted_pdf_to_text'
def extract_info(pdf_file):
    """Convert an uploaded PDF to text, persist it, and build a vector index.

    Parameters
    ----------
    pdf_file : Gradio file object exposing a ``.name`` attribute (path on disk).

    Returns
    -------
    tuple
        ``(status_message, extracted_text)`` on success, where
        ``extracted_text`` is the list of per-page strings; on a non-PDF
        upload, an explicit ``(error_message, '')`` tuple.

    Side effects: writes ``converted_pdf_to_text/document_in_txt_format.txt``,
    rebuilds the module-global ``index``, and rewrites ``index.json``.
    """
    # --- PDF-to-text section ---
    if not pdf_file.name.lower().endswith('.pdf'):
        # BUG FIX: the original fell through and implicitly returned None,
        # which breaks the two-output Gradio callback. Return an explicit
        # error tuple instead.
        return ("Error: the uploaded file is not a PDF.", "")

    reader = PdfReader(pdf_file.name)
    extracted_text = [page.extract_text() for page in reader.pages]

    # --- write text file to folder ---
    directory_name = 'converted_pdf_to_text'
    # Race-free replacement for the original exists()+mkdir() pair.
    os.makedirs(directory_name, exist_ok=True)
    file_name = 'document_in_txt_format.txt'
    file_path = os.path.join(directory_name, file_name)
    with open(file_path, 'w', encoding='UTF-8') as f:
        # BUG FIX: the original wrote str(extracted_text) — the Python repr
        # of a list — polluting the indexed text with brackets, quotes and
        # escaped newlines. Join the page texts instead (extract_text() may
        # return None for image-only pages, hence the `or ''`).
        f.write('\n'.join(page or '' for page in extracted_text))
    if os.path.isfile(file_path):
        print(f'{file_name} created successfully in {directory_name}.')
    else:
        print(f"{file_name} creation in {directory_name} failed.")

    # --- LLM / index section ---
    max_input_size = 4096
    num_outputs = 500
    max_chunk_overlap = 200
    chunk_size_limit = 4000
    llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo', max_tokens=num_outputs))
    prompt_helper = PromptHelper(max_input_size, num_outputs, max_chunk_overlap, chunk_size_limit=chunk_size_limit)
    documents = SimpleDirectoryReader(directory_name).load_data()
    global index
    index = GPTSimpleVectorIndex(documents, llm_predictor=llm_predictor, prompt_helper=prompt_helper)
    # Remove any stale index file so a previous document's index is never reused.
    if os.path.exists("index.json"):
        os.remove("index.json")
        print("The file 'index.json' has been deleted.")
    else:
        print("The file 'index.json' does not exist.")
    # Persist the freshly built index to disk.
    index.save_to_disk('index.json')
    return ("Success! You can now click on the 'Knowledge bot' tab to interact with your document", extracted_text)
def chat(chat_history, user_input):
    """Stream the bot's answer into the chat history, one character at a time.

    Queries the module-global ``index`` (built by ``extract_info``) and yields
    ``chat_history`` extended with the (question, partial-answer) pair as the
    answer grows — Gradio renders each yield, giving a typing effect.
    """
    bot_response = index.query(user_input)
    response = ''
    # BUG FIX (idiom): the original iterated ''.join(bot_response.response)
    # — a no-op join over a string — and appended `letter + ""`, a no-op
    # concatenation. Iterate the response string directly.
    for letter in bot_response.response:
        response += letter
        yield chat_history + [(user_input, response)]
# --- Gradio UI wiring -------------------------------------------------------
# NOTE(review): this system prompt appears unused by the index.query() path
# above — presumably left over from an earlier openai.ChatCompletion design;
# confirm before removing.
messages = [{"role": "system", "content": """You are a helpful assistant. You help the reader understand documents paraphrasing, quoting and summarizing information. You follow the instructions of the user at all times"""}]
openai.api_key = os.getenv("OPENAI_API_KEY")

with gr.Blocks() as demo:
    gr.Markdown('<h1 style="color:blue; font-size:20px; text-align: center; justify-content: center; font-weight:bold;">Q&A bot for PDF docs. Upload your document, press the button, and wait for confirmation of success</h1>')
    with gr.Tab('Input PDF document here'):
        text_input = gr.File()
        text_output = gr.Textbox(label="Extracted Text")
        success_output = gr.Textbox()
        text_button = gr.Button('Build the bot!')
        text_button.click(extract_info, text_input, [success_output, text_output])
    with gr.Tab('Knowledge bot'):
        chatbot = gr.Chatbot()
        message = gr.Textbox(label='Ask here your question about the document, then press "enter" and scroll up for response')
        message.submit(chat, [chatbot, message], chatbot)

# BUG FIX: the original launched the app TWICE — demo.queue().launch(debug=True)
# at import time and demo.launch() again under the __main__ guard. Launch
# exactly once, under the guard, keeping .queue() because the streaming
# generator in chat() requires queuing to be enabled.
if __name__ == "__main__":
    demo.queue().launch(debug=True)