import logging
import os
import uuid
from json import JSONDecodeError
from pathlib import Path
from time import sleep

import openai
import pinecone
import streamlit as st
from haystack.nodes import (
    DocxToTextConverter,
    FileTypeClassifier,
    PDFToTextConverter,
    PreProcessor,
    TextConverter,
)
from haystack.pipelines import Pipeline
from markdown import markdown
# get the API key from the top-right dropdown on the OpenAI website
openai.api_key = st.secrets["OPENAI_API_KEY"]

# connect to the Pinecone environment
pinecone.init(
    api_key=st.secrets["pinecone_apikey"],
    environment="us-east1-gcp"
)

# a single index name, used consistently below (Pinecone index names
# may not contain underscores, so "qa-demo" rather than "qa_demo")
index_name = "qa-demo"
embed_model = "text-embedding-ada-002"
# split documents into passages of ~100 words, respecting sentence boundaries
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True
)
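# e.g. a 1,000-word upload becomes roughly ten passages of up to 100 words,
# each embedded and retrieved independently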
file_type_classifier = FileTypeClassifier()
text_converter = TextConverter()
pdf_converter = PDFToTextConverter()
docx_converter = DocxToTextConverter()
# check if the qa-demo index exists
if index_name not in pinecone.list_indexes():
    # create the index if it does not exist
    # (1536 is the dimensionality of text-embedding-ada-002 vectors)
    pinecone.create_index(
        index_name,
        dimension=1536,
        metric="cosine"
    )
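    # index creation is asynchronous; a sketch of waiting for readiness,
    # assuming the v2 pinecone-client describe_index() API:
    # while not pinecone.describe_index(index_name).status["ready"]:
    #     sleep(1)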
# connect to the qa-demo index we created
index = pinecone.Index(index_name)
FILE_UPLOAD_PATH = "./data/uploads/"
os.makedirs(FILE_UPLOAD_PATH, exist_ok=True)

# rough character budget for retrieved context in the completion prompt
limit = 3750
def retrieve(query):
    # embed the query with OpenAI
    res = openai.Embedding.create(
        input=[query],
        engine=embed_model
    )
    xq = res['data'][0]['embedding']
    # get relevant contexts from Pinecone
    res = index.query(xq, top_k=3, include_metadata=True)
    contexts = [
        x['metadata']['text'] for x in res['matches']
    ]
    # build our prompt with the retrieved contexts included
    prompt_start = (
        "Answer the question based on the context below.\n\n" +
        "Context:\n"
    )
    prompt_end = (
        f"\n\nQuestion: {query}\nAnswer:"
    )
    # append contexts until hitting the limit; default to all contexts so
    # prompt is always defined, even when only one context is returned
    prompt = prompt_start + "\n\n---\n\n".join(contexts) + prompt_end
    for i in range(1, len(contexts)):
        if len("\n\n---\n\n".join(contexts[:i])) >= limit:
            prompt = (
                prompt_start +
                "\n\n---\n\n".join(contexts[:i - 1]) +
                prompt_end
            )
            break
    return prompt, contexts
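# minimal usage sketch (hypothetical question string):
# prompt, contexts = retrieve("How many people work remotely?")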
# first let's make it simpler to get answers
def complete(prompt):
    # query text-davinci-003
    res = openai.Completion.create(
        engine='text-davinci-003',
        prompt=prompt,
        temperature=0,
        max_tokens=400,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None
    )
    return res['choices'][0]['text'].strip()
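# e.g. feeding the retrieved prompt from the sketch above into the model:
# answer = complete(prompt)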
def query(question):
    # retrieve relevant contexts from Pinecone, then answer with GPT-3;
    # the Haystack reader/retriever arguments from the original template
    # are not used in this flow
    query_with_contexts, contexts = retrieve(question)
    return complete(query_with_contexts), contexts
indexing_pipeline_with_classification = Pipeline()
indexing_pipeline_with_classification.add_node(
    component=file_type_classifier, name="FileTypeClassifier", inputs=["File"]
)
indexing_pipeline_with_classification.add_node(
    component=text_converter, name="TextConverter", inputs=["FileTypeClassifier.output_1"]
)
indexing_pipeline_with_classification.add_node(
    component=pdf_converter, name="PdfConverter", inputs=["FileTypeClassifier.output_2"]
)
indexing_pipeline_with_classification.add_node(
    component=docx_converter, name="DocxConverter", inputs=["FileTypeClassifier.output_4"]
)
indexing_pipeline_with_classification.add_node(
    component=preprocessor,
    name="Preprocessor",
    inputs=["TextConverter", "PdfConverter", "DocxConverter"],
)
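# a minimal sketch of running the indexing pipeline on one local file
# (hypothetical path "example.pdf"):
# docs = indexing_pipeline_with_classification.run(file_paths=["example.pdf"])["documents"]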
def set_state_if_absent(key, value):
    if key not in st.session_state:
        st.session_state[key] = value

# Adjust to a question that you would like users to see in the search bar when they load the UI:
DEFAULT_QUESTION_AT_STARTUP = os.getenv("DEFAULT_QUESTION_AT_STARTUP", "My blog post discusses remote work. Give me statistics.")
DEFAULT_ANSWER_AT_STARTUP = os.getenv("DEFAULT_ANSWER_AT_STARTUP", "7% more remote workers have been at their current organization for 5 years or fewer")

# Sliders
DEFAULT_DOCS_FROM_RETRIEVER = int(os.getenv("DEFAULT_DOCS_FROM_RETRIEVER", "3"))
DEFAULT_NUMBER_OF_ANSWERS = int(os.getenv("DEFAULT_NUMBER_OF_ANSWERS", "3"))

st.set_page_config(page_title="Haystack Demo", page_icon="https://haystack.deepset.ai/img/HaystackIcon.png")

# Persistent state
set_state_if_absent("question", DEFAULT_QUESTION_AT_STARTUP)
set_state_if_absent("answer", DEFAULT_ANSWER_AT_STARTUP)
set_state_if_absent("results", None)

# Small callback to reset the interface in case the text of the question changes
def reset_results(*args):
    st.session_state.answer = None
    st.session_state.results = None
    st.session_state.raw_json = None
# Title
st.write("# GPT-3 and Pinecone Demo")
st.markdown(
    """
This demo takes its data from the documents uploaded to the Pinecone index through this app. \n
Ask any question about the uploaded documents: Pinecone retrieves the relevant context and GPT-3 answers using that context. \n
*Note: use full questions rather than bare keywords.* The demo is not optimized for keyword queries and might misunderstand you.
""",
    unsafe_allow_html=True,
)
# Sidebar
st.sidebar.header("Options")
st.sidebar.write("## File Upload:")
data_files = st.sidebar.file_uploader(
    "upload", type=["pdf", "txt", "docx"], accept_multiple_files=True, label_visibility="hidden"
)
ALL_FILES = []
META_DATA = []
for data_file in data_files:
    # Save each uploaded file to disk under a unique name
    if data_file:
        file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{data_file.name}"
        with open(file_path, "wb") as f:
            f.write(data_file.getbuffer())
        ALL_FILES.append(file_path)
        st.sidebar.write(str(data_file.name) + " ✅ ")
        META_DATA.append({"filename": data_file.name})
if len(ALL_FILES) > 0:
    docs = indexing_pipeline_with_classification.run(file_paths=ALL_FILES, meta=META_DATA)["documents"]
    # we will use batches of 64
    batch_size = 64
    with st.spinner(
        "🧠    Performing indexing of uploaded documents...    \n "
    ):
        for i in range(0, len(docs), batch_size):
            # find end of batch
            i_end = min(i + batch_size, len(docs))
            # extract batch of passage texts
            batch = [doc.content for doc in docs[i:i_end]]
            # generate embeddings for the batch, retrying until the
            # OpenAI API accepts the request (e.g. after rate limiting)
            try:
                res = openai.Embedding.create(input=batch, engine=embed_model)
            except Exception:
                done = False
                while not done:
                    sleep(5)
                    try:
                        res = openai.Embedding.create(input=batch, engine=embed_model)
                        done = True
                    except Exception:
                        pass
            embeds = [record['embedding'] for record in res['data']]
            # store the passage text in the metadata, since retrieve()
            # reads it back from x['metadata']['text']
            meta = [{"text": doc.content, **doc.meta} for doc in docs[i:i_end]]
            # use the document IDs as unique vector IDs
            ids = [doc.id for doc in docs[i:i_end]]
            # add all to upsert list
            to_upsert = list(zip(ids, embeds, meta))
            # upsert/insert these records to Pinecone
            _ = index.upsert(vectors=to_upsert)
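        # optional sanity check once indexing completes (sketch, assuming
        # the v2 pinecone-client describe_index_stats() API):
        # st.sidebar.write(index.describe_index_stats())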
# top_k_reader = st.sidebar.slider(
#     "Max. number of answers",
#     min_value=1,
#     max_value=10,
#     value=DEFAULT_NUMBER_OF_ANSWERS,
#     step=1,
#     on_change=reset_results,
# )
# top_k_retriever = st.sidebar.slider(
#     "Max. number of documents from retriever",
#     min_value=1,
#     max_value=10,
#     value=DEFAULT_DOCS_FROM_RETRIEVER,
#     step=1,
#     on_change=reset_results,
# )
# data_files = st.file_uploader(
#     "upload", type=["csv"], accept_multiple_files=True, label_visibility="hidden"
# )
# for data_file in data_files:
#     # Upload file
#     if data_file:
#         raw_json = upload_doc(data_file)
question = st.text_input(
    value=st.session_state.question,
    max_chars=100,
    on_change=reset_results,
    label="question",
    label_visibility="hidden",
)

col1, col2 = st.columns(2)
col1.markdown("<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)
col2.markdown("<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)

# Run button
run_pressed = col1.button("Run")
# run on button press or when the question text changes; computing this
# unconditionally avoids a NameError when the button is not pressed
run_query = (
    run_pressed or question != st.session_state.question
)

# Get results for query
if run_query and question:
    reset_results()
    st.session_state.question = question
    with st.spinner(
        "🧠    Performing neural search on documents...    \n "
    ):
        try:
            # wrap the (answer, contexts) tuple in a list so the results
            # loop below can unpack it
            st.session_state.results = [query(question)]
        except JSONDecodeError as je:
            st.error("🐞    An error occurred reading the results. Is the document store working?")
        except Exception as e:
            logging.exception(e)
            if "The server is busy processing requests" in str(e) or "503" in str(e):
                st.error("🧑‍🌾    All our workers are busy! Try again later.")
            else:
                st.error(f"🐞    An error occurred during the request. {str(e)}")
if st.session_state.results:
    st.write("## Results:")
    for result, contexts in st.session_state.results:
        # Hack due to this bug: https://github.com/streamlit/streamlit/issues/3190
        try:
            st.write(
                markdown(f"Answer: {result}  \n Extracted from context: {contexts}"),
                unsafe_allow_html=True,
            )
        except Exception:
            st.write(
                markdown(f"Answer: {result}"),
                unsafe_allow_html=True,
            )