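# NOTE: page-scrape residue, not part of the module. The hosting page's
# "Spaces" status banner reported "Runtime error" (shown twice).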
import os
import pickle
import shutil
import tempfile
from datetime import datetime
from typing import Dict, List, Optional

import chromadb
import openai
import pandas as pd
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.vectorstores import FAISS
from llama_index import GPTVectorStoreIndex, GPTListIndex

import src.constants as constants_utils
import src.data_loader as data_loader_utils
import src.utils as utils
import logging
import warnings

# Configure logging first so that the API-key check below is reported in the
# same format as every other message of this module.
logging.basicConfig(
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S"
)
logger = logging.getLogger(__name__)

# The previous `os.environ[...] = os.getenv(...)` was a no-op when the variable
# was set and raised TypeError at import time when it was not (os.environ only
# accepts str values). Report the missing key instead of crashing on import.
if os.getenv('OPENAI_API_KEY') is None:
    logger.warning('OPENAI_API_KEY environment variable is not set; OpenAI calls will fail until it is provided.')

# Silence third-party deprecation noise.
warnings.filterwarnings('ignore')
class LANGCHAIN_UTILS:
    """
    Helper around LangChain / llama_index vector stores.

    Keeps one index per (index_category, doc_type) pair plus a merged "master"
    index per index_category, and offers helpers to build, persist, load,
    update and query those indices, to summarize text, and to drive a simple
    chat widget.
    """

    # Opening assistant message of the chat widget; `{}` is the language name.
    _GREETING_TEMPLATE = "Hi, I am a chatbot. I can converse in {}. I can answer your questions about farming in India. Ask me anything!"

    def __init__(
        self,
        index_type=constants_utils.INDEX_TYPE,
        load_from_existing_index_store=constants_utils.LOAD_FROM_EXISTING_INDEX_STORE
    ):
        """
        Args:
            index_type: vector store backend; the code handles 'FAISS', 'Chroma', 'GPTVectorStoreIndex' and (for store/load only) 'pickle'
            load_from_existing_index_store: when truthy, load_create_index() first tries to load persisted indices
        """
        self.index_type = index_type
        self.load_from_existing_index_store = load_from_existing_index_store

        # Temporary index in the current context for the doc_type in consideration
        self.index = None
        # Path of the index in the current context; set by get_index_filepath()
        self.index_filepath = None
        # Master index which contains data from multiple sources (PDF, Online PDF, Text files, URLs, etc.)
        self.master_index = None

        # Data source wise index: {index_category: {doc_type: index}}
        self.index_category_doc_type_wise_index = {
            ic: {ds: None for ds in constants_utils.DATA_SOURCES.values()}
            for ic in constants_utils.INDEX_CATEGORY
        }
        # Initialize master index for each INDEX_CATEGORY
        for ic in constants_utils.INDEX_CATEGORY:
            self.index_category_doc_type_wise_index[ic][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = None

        # Data loaded as a Document format in the current context for the doc_type in consideration
        self.documents = []

        # Instantiate data_loader_utils class object
        self.data_loader_utils_obj = data_loader_utils.DATA_LOADER()
        # Instantiate UTILS class object
        self.utils_obj = utils.UTILS()

        # Initialize embeddings (we can also use other embeddings)
        self.embeddings = OpenAIEmbeddings(openai_api_key=os.getenv('OPENAI_API_KEY'))
        # Initialize LLM model
        self.llm = OpenAI(
            temperature=0,
            max_tokens=constants_utils.LLM_RESPONSE_MAX_TOKENS,
            model_name=constants_utils.LLM_BASE_MODEL_NAME
        )

        # Global history for AgGPT widget
        self.global_history = [
            {
                "role": "assistant",
                "content": self._GREETING_TEMPLATE.format("English")
            }
        ]

        # Index category - doc_type wise data sources to display in widget
        self.index_category_doc_type_wise_data_sources = {}

    def user(
        self,
        user_message,
        history
    ):
        """
        Record a user message.

        Appends [user_message, None] to a copy of the widget history and the
        message to the global chat history.

        Returns:
            ("", updated widget history)
        """
        history = history + [[user_message, None]]
        self.global_history = self.global_history + [{"role": "user", "content": user_message}]
        return "", history

    def get_chatgpt_response(
        self,
        history
    ):
        """
        Send the chat history to the OpenAI chat completion API.

        The assistant reply is appended to `history` in place.

        Returns:
            (reply text, history)
        """
        output = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=history)
        reply = output.choices[0].message.content
        history.append({"role": "assistant", "content": reply})
        return reply, history

    def bot(
        self,
        history
    ):
        """
        Answer the latest user message and write the reply into the last entry of the widget history.
        """
        response, self.global_history = self.get_chatgpt_response(self.global_history)
        history[-1][1] = response
        return history

    def clear_history(
        self,
        lang="English"
    ):
        """
        Reset the global chat history to the single greeting message in the given language.
        """
        self.global_history = [{"role": "assistant", "content": self._GREETING_TEMPLATE.format(lang)}]
        return None

    def generate_prompt_template(
        self,
        prompt_type,
        input_variables
    ):
        """
        Build a PromptTemplate for the given prompt_type.

        Args:
            prompt_type: one of ['summarize', 'qa', 'weather']; any other value yields an empty template
            input_variables: names of the variables used in the template
        """
        # NOTE(review): the original file's whitespace was lost; the templates are
        # rebuilt from their visible lines, joined with single newlines.
        prompt_template = ''

        if prompt_type == 'summarize':
            prompt_template = (
                "Write a concise summary of the following:\n"
                "{text}\n"
                "SUMMARIZE IN ENGLISH:"
            )
        elif prompt_type == 'qa':
            # Two earlier variants of this prompt were assigned and immediately
            # overwritten; only the effective (last) one is kept.
            prompt_template = (
                "You are a helpful AI assistant. Use the following pieces of context to answer the question comprehensively at the end. Start the answer by giving short summary and write the answer starting with Here are some of the key points:. Write each sentence separately with numbering. If you don't know the answer, just say that you don't know, don't try to make up an answer. If the question is not related to the context, politely respond that you are tuned to only answer questions that are related to the context.\n"
                "{context}\n"
                "Question: {question}\n"
                "Answer in English:"
            )
        elif prompt_type == 'weather':
            prompt_template = (
                "\n"
                "What would be the weather based on the below data:\n"
                "{text}\n"
            )

        PROMPT = PromptTemplate(template=prompt_template, input_variables=input_variables)
        return PROMPT

    def get_textual_summary(
        self,
        text,
        chain_type="stuff",
        custom_prompt=True,
        prompt_type='summarize'
    ):
        """
        Summarize `text` with the LLM.

        Args:
            text: text to summarize
            chain_type: summarize chain type passed to load_summarize_chain
            custom_prompt: use the prompt from generate_prompt_template() instead of the chain default
            prompt_type: prompt type used when custom_prompt is True
        """
        docs = [Document(page_content=text)]

        if custom_prompt:
            PROMPT = self.generate_prompt_template(
                prompt_type=prompt_type,
                input_variables=["text"]
            )
            chain = load_summarize_chain(self.llm, chain_type=chain_type, prompt=PROMPT)
        else:
            chain = load_summarize_chain(self.llm, chain_type=chain_type)

        return chain.run(docs)

    def get_weather_forecast_summary(
        self,
        text,
        chain_type="stuff"
    ):
        """
        Ask the LLM for a plain-language weather summary of the given data.
        """
        text = (
            "\n"
            "What would be the weather based on the below data:\n"
            f"{text}\n"
            "Give simple response without technical numbers which can be explained to human.\n"
        )
        docs = [Document(page_content=text)]
        chain = load_summarize_chain(self.llm, chain_type=chain_type)
        return chain.run(docs)

    def get_answer_from_para(
        self,
        para,
        question,
        chain_type="stuff",
        custom_prompt=True,
        prompt_type='qa'
    ):
        """
        Answer `question` from the paragraph `para`.

        The paragraph is split into chunks, indexed in a throw-away vector
        store, and the most similar chunks are passed to a QA chain.

        Raises:
            ValueError: if self.index_type is neither 'FAISS' nor 'Chroma'
        """
        # Prepare data (Split paragraph into chunks of small documents)
        text_splitter = CharacterTextSplitter(
            chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE,
            chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP,
            separator=constants_utils.TEXT_SPLITTER_SEPARATOR
        )
        texts = text_splitter.split_text(para)

        vector_stores = {'FAISS': FAISS, 'Chroma': Chroma}
        if self.index_type not in vector_stores:
            # Previously this fell through and failed later with an unbound local variable.
            raise ValueError(f'Unsupported index_type for paragraph QA: {self.index_type}')

        # Find similar docs that are relevant to the question
        docsearch = vector_stores[self.index_type].from_texts(
            texts, self.embeddings,
            metadatas=[{"source": str(i + 1)} for i in range(len(texts))]
        )
        # Search for the similar docs
        docs = docsearch.similarity_search(question, k=constants_utils.ANSWER_SIMILARITY_TOP_K)

        # Create a Chain for question answering
        if custom_prompt:
            PROMPT = self.generate_prompt_template(
                prompt_type=prompt_type,
                input_variables=["context", "question"]
            )
            chain = load_qa_chain(self.llm, chain_type=chain_type, prompt=PROMPT)
        else:
            chain = load_qa_chain(self.llm, chain_type=chain_type)

        out_dict = chain({"input_documents": docs, "question": question}, return_only_outputs=True)
        return out_dict['output_text']

    def load_documents(
        self,
        doc_type,
        doc_filepath='',
        urls=None
    ):
        """
        Load data in Document format of the given doc_type from either doc_filepath or list of urls.
        It can load multiple files/urls in one shot. Loaded documents are appended to self.documents.

        Args:
            doc_type: can be any of [pdf, online_pdf, urls, textfile, directory]
            doc_filepath: can be a directory or a filepath
            urls: list of urls
        """
        urls = [] if urls is None else urls
        loader = self.data_loader_utils_obj
        logger.info(f'Loading {doc_type} data into Documents format')

        if doc_type == 'pdf':
            # Load data from PDFs stored in local directory
            loaded = loader.load_documents_from_pdf(
                doc_filepath=doc_filepath,
                doc_type=doc_type
            )
        elif doc_type == 'online_pdf':
            # Load data from PDFs referenced by urls
            loaded = loader.load_documents_from_pdf(
                urls=urls,
                doc_type=doc_type
            )
        elif doc_type == 'urls':
            # Load data from URLs
            loaded = loader.load_documents_from_urls(
                urls=urls,
                doc_type=doc_type
            )
        elif doc_type == 'textfile':
            # Load data from text files & Convert texts into Document format
            loaded = loader.load_documents_from_text(
                doc_filepath=doc_filepath,
                doc_type=doc_type
            )
        elif doc_type == 'directory':
            # Load data from local directory
            loaded = loader.load_documents_from_directory(
                doc_filepath=doc_filepath,
                doc_type=doc_type
            )
        else:
            # Do not report success for a doc_type that nothing was loaded for
            logger.warning(f'Unsupported doc_type: {doc_type}. No documents loaded.')
            return

        self.documents.extend(loaded)
        logger.info(f'{doc_type} data into Documents format loaded successfully!')

    def _require_index_filepath(self):
        """Return self.index_filepath, raising ValueError when it has not been set yet."""
        if not self.index_filepath:
            raise ValueError('index_filepath is not set; call get_index_filepath() first.')
        return self.index_filepath

    def create_index(
        self
    ):
        """
        Build an index of type self.index_type from self.documents.

        self.documents is replaced by its split chunks. For 'Chroma' the index is
        persisted under self.index_filepath.

        Returns:
            the created index, or None when there are no documents
        """
        if not self.documents:
            logger.warning('Empty documents. Index cannot be created!')
            return None

        logger.info('Creating index')

        text_splitter = CharacterTextSplitter(
            chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE,
            chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP,
            separator=constants_utils.TEXT_SPLITTER_SEPARATOR
        )
        self.documents = text_splitter.split_documents(self.documents)

        ############## Build the Vector store for docs ##############
        if self.index_type == 'FAISS':
            # Vector store using Facebook AI Similarity Search
            self.index = FAISS.from_documents(
                self.documents,
                self.embeddings
            )
        elif self.index_type == 'Chroma':
            # Vector store using Chroma DB
            persist_directory = self._require_index_filepath()
            os.makedirs(persist_directory, exist_ok=True)
            self.index = Chroma.from_documents(
                self.documents,
                self.embeddings,
                persist_directory=persist_directory
            )
        elif self.index_type == 'GPTVectorStoreIndex':
            # Vector store using GPT vector index
            self.index = GPTVectorStoreIndex.from_documents(self.documents)

        logger.info('Index created successfully!')
        return self.index

    def get_index_filepath(
        self,
        index_category,
        doc_type
    ):
        """
        Compute the storage path of the index for (index_category, doc_type).

        The path is also stored in self.index_filepath. FAISS/Chroma indices live
        in directories, every other index type in a '.json' file.
        """
        uses_directory = self.index_type in ['FAISS', 'Chroma']
        category_name = f'index_{index_category}'

        # NOTE(review): the literal 'master' is presumably equal to
        # constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE - TODO confirm.
        if doc_type == 'master':
            path_parts = [category_name if uses_directory else f'{category_name}.json']
        else:
            leaf = f'index_{doc_type}' if uses_directory else f'index_{doc_type}.json'
            path_parts = [category_name, leaf]

        self.index_filepath = os.path.join(constants_utils.OUTPUT_PATH, *path_parts)
        return self.index_filepath

    def load_master_doctype_indices_for_index_category(
        self,
        index_category
    ):
        """
        Load the persisted master index and every doc_type index of index_category.

        Indices whose path does not exist are left as None.
        """
        logger.info(f'Loading master and doc_type indices for: {index_category}')

        category_indices = self.index_category_doc_type_wise_index[index_category]
        # Set master index of index_category = None
        category_indices[constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = None

        for doc_type in list(category_indices):
            self.index = None
            self.index_filepath = self.get_index_filepath(
                index_category=index_category,
                doc_type=doc_type
            )
            self.load_index()
            # Set master/doc_type index
            category_indices[doc_type] = self.index

        logger.info(f'Master and doc_type indices for: {index_category} loaded successfully!')

    def load_create_index(
        self
    ):
        """
        For every index_category, load the persisted indices (when enabled) and
        build fresh ones from the data sources if no master index could be loaded.
        """
        logger.info('Loading/Creating index for each index_category')
        master_key = constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE

        for index_category in constants_utils.INDEX_CATEGORY:
            # Load master index_category index if self.load_from_existing_index_store == True
            if self.load_from_existing_index_store:
                self.load_master_doctype_indices_for_index_category(index_category)

            # For any reason, if master index is not loaded then create the new index/vector store
            if self.index_category_doc_type_wise_index[index_category][master_key]:
                continue

            logger.info(f'Creating a new Vector/Index store for: {index_category}')
            doc_filepath = os.path.join(constants_utils.DATA_PATH, index_category)
            urls = []

            # Build the Vector/Index store
            for doc_type in list(constants_utils.DATA_SOURCES.values()):
                logger.info(f'Creating a new Vector/Index store for: {index_category} from data source: {doc_type}')

                if doc_type in ['pdf', 'textfile']:
                    index = self.create_store_index(
                        doc_type=doc_type,
                        doc_filepath=doc_filepath,
                        index_category=index_category
                    )
                else:
                    # Build the Vector/Index store from web urls
                    index = self.create_store_index(
                        doc_type=doc_type,
                        urls=urls,
                        index_category=index_category
                    )

                if index:
                    self.index_category_doc_type_wise_index[index_category][doc_type] = index
                    logger.info(f'New Vector/Index store for: {index_category} from data source: {doc_type} created successfully!')

            logger.info(f'New Vector/Index store for: {index_category} created successfully!')

            # Merge index of each doc_type into a single index_category
            self.merge_store_master_index(
                index_category=index_category
            )

        logger.info('Index for each index_category loaded successfully!')

    def create_store_index(
        self,
        doc_type='pdf',
        doc_filepath=constants_utils.DATA_PATH,
        urls=None,
        index_category=constants_utils.INDEX_CATEGORY[0]
    ):
        """
        Create the index of a single doc_type from scratch and persist it.

        Any previously stored index at the target path is removed first.

        Returns:
            the index of the given doc_type (None when no documents were found).
            Indices from multiple doc_types should be merged later on in the
            master index so that query could be made from a single index.
        """
        urls = [] if urls is None else urls
        logger.info(f'Creating and storing {doc_type} index')

        self.documents = []
        self.index = None
        self.index_filepath = self.get_index_filepath(
            index_category=index_category,
            doc_type=doc_type
        )

        # Delete the old index. rmtree only handles directories, so a stale
        # single-file index has to be removed separately.
        if os.path.isfile(self.index_filepath):
            try:
                os.remove(self.index_filepath)
            except OSError:
                logger.warning(f'Could not delete {self.index_filepath}')
        else:
            shutil.rmtree(self.index_filepath, ignore_errors=True)
        logger.info(f'{self.index_filepath} deleted.')

        # Load data in Documents format that can be consumed for index creation
        self.load_documents(
            doc_type,
            doc_filepath,
            urls
        )

        # Create the index from documents for search/retrieval
        self.index = self.create_index()

        # Store index
        self.store_index(
            index=self.index,
            index_filepath=self.index_filepath
        )

        logger.info(f'{doc_type} index created and stored successfully!')
        return self.index

    def store_index(
        self,
        index,
        index_filepath
    ):
        """
        Persist `index` at `index_filepath` using the mechanism of self.index_type.

        An empty index is not written.
        """
        if not index:
            logger.warning(f'Cannot write an empty index to: {index_filepath}!')
            return

        logger.info(f'Saving index to: {index_filepath}')

        # The former check (`not exists and isdir`) could never be true, so the
        # target location was never created. Directory-based stores need the
        # directory itself, file-based stores need the parent directory.
        if self.index_type in ('FAISS', 'Chroma'):
            os.makedirs(index_filepath, exist_ok=True)
        else:
            parent_dir = os.path.dirname(index_filepath)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

        if self.index_type == 'FAISS':
            index.save_local(index_filepath)
        elif self.index_type == 'Chroma':
            index.persist()
        elif self.index_type == 'GPTVectorStoreIndex':
            index.save_to_disk(index_filepath)
        elif self.index_type == 'pickle':
            with open(index_filepath, "wb") as f:
                pickle.dump(index, f)

        logger.info(f'Index saved to: {index_filepath} successfully!')

    def load_index(
        self
    ):
        """
        Load the index stored at self.index_filepath into self.index.

        self.index is left untouched when the path is unset or does not exist.
        """
        logger.info(f'Loading index from: {self.index_filepath}')

        if not self.index_filepath or not os.path.exists(self.index_filepath):
            logger.warning(f"Cannot load index from {self.index_filepath} as the path doest not exist!")
            return

        if self.index_type == 'FAISS':
            self.index = FAISS.load_local(self.index_filepath, self.embeddings)
        elif self.index_type == 'Chroma':
            self.index = Chroma(
                persist_directory=self.index_filepath,
                embedding_function=self.embeddings
            )
        elif self.index_type == 'GPTVectorStoreIndex':
            self.index = GPTVectorStoreIndex.load_from_disk(self.index_filepath)
        elif self.index_type == 'pickle':
            # SECURITY: pickle.load executes arbitrary code; only load index files
            # written by this application.
            with open(self.index_filepath, "rb") as f:
                self.index = pickle.load(f)

        logger.info(f'Index loaded from: {self.index_filepath} successfully!')

    def convert_text_to_documents(
        self,
        text_list=None
    ):
        """
        Converts the list of text data to Documents format that can be feed to GPT API to build the Vector store
        """
        # llama_index's Document is deliberately imported locally: the module-level
        # `Document` name is the langchain class.
        from llama_index import Document

        return [Document(t) for t in (text_list or [])]

    def merge_documents_from_different_sources(
        self,
        doc_documents,
        url_documents
    ):
        """
        Build one GPT vector index per source (docs, urls) and combine them into a GPTListIndex.
        """
        # Build the Vector store for docs
        doc_index = GPTVectorStoreIndex.from_documents(doc_documents)
        # Build the Vector store for URLs
        url_index = GPTVectorStoreIndex.from_documents(url_documents)

        # Set summary of each index
        doc_index.set_text("index_from_docs")
        url_index.set_text("index_from_urls")

        # Merge index of different data sources
        return GPTListIndex([doc_index, url_index])

    def _clone_faiss_index(self, index):
        """Return an independent copy of a FAISS store via a save_local/load_local round trip."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            index.save_local(tmp_dir)
            return FAISS.load_local(tmp_dir, self.embeddings)

    def merge_store_master_index(
        self,
        index_category
    ):
        """
        Merge multiple doc_type indices into a single master index. Query/search would be performed on this merged index.

        Args:
            index_category: index_category (can be any of: [crops, fruits, pest_management, govt_policy, soil, etc.])

        Raises:
            NotImplementedError: when a 'Chroma' or 'GPTVectorStoreIndex' index has to be merged
        """
        logger.info('Merging doc_type indices of different index categories into a master index')

        master_key = constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE
        doc_type_indices = self.index_category_doc_type_wise_index[index_category]
        doc_type_indices[master_key] = None

        if self.index_type == 'FAISS':
            master = None
            for doc_type, index in doc_type_indices.items():
                if doc_type == master_key:
                    # Only merge the non-master doc_type_indices
                    continue
                if not index or not isinstance(index, FAISS):
                    logger.warning(f'{doc_type} index to be merged is not an instance of type langchain.vectorstores.faiss.FAISS')
                    continue
                if master is None:
                    # Start from a copy: using the doc_type index itself as master
                    # would make every later merge_from() grow that doc_type index
                    # too (and duplicate its content on every re-merge).
                    master = self._clone_faiss_index(index)
                else:
                    master.merge_from(index)
            doc_type_indices[master_key] = master

        elif self.index_type == 'Chroma':
            for doc_type, index in doc_type_indices.items():
                if not index or not isinstance(index, Chroma):
                    logger.warning(f'{doc_type} index to be merged is not an instance of type langchain.vectorstores.Chroma')
                    continue
                raise NotImplementedError

        elif self.index_type == 'GPTVectorStoreIndex':
            for doc_type, index in doc_type_indices.items():
                if not index or not isinstance(index, GPTVectorStoreIndex):
                    logger.warning(f'{doc_type} index to be merged is not an instance of type llama_index.GPTVectorStoreIndex')
                    continue
                raise NotImplementedError

        # Store index_category master index
        self.store_index(
            index=doc_type_indices[master_key],
            index_filepath=self.get_index_filepath(
                index_category=index_category,
                doc_type=master_key
            )
        )

        logger.info('doc_type indices of different index categories into a master index merged successfully!')

    def init_chromadb(self):
        """
        Create a Chroma store persisted under self.index_filepath and keep it in self.index.
        """
        logger.info('Initializing Chroma DB')

        persist_directory = self._require_index_filepath()
        os.makedirs(persist_directory, exist_ok=True)

        client_settings = chromadb.config.Settings(
            chroma_db_impl="duckdb+parquet",
            persist_directory=persist_directory,
            anonymized_telemetry=False
        )

        self.index = Chroma(
            collection_name="langchain_store",
            embedding_function=self.embeddings,
            client_settings=client_settings,
            persist_directory=persist_directory,
        )

        logger.info('Chroma DB initialized successfully!')

    def query_chromadb(
        self,
        question,
        k=1
    ):
        """
        Return the k documents of self.index most similar to `question`.
        """
        return self.index.similarity_search(query=question, k=k)

    def query(
        self,
        question,
        question_category,
        mode=constants_utils.MODE,
        response_mode=constants_utils.RESPONSE_MODE,
        similarity_top_k=constants_utils.SIMILARITY_TOP_K,
        required_keywords=None,
        exclude_keywords=None,
        verbose=False
    ):
        '''
        Query the master index of question_category.

        Args:
            mode: can be any of [default, embedding]
            response_mode: can be any of [default, compact, tree_summarize]

        Returns:
            the search/query result, or None when no master index exists for question_category
        '''
        required_keywords = [] if required_keywords is None else required_keywords
        exclude_keywords = [] if exclude_keywords is None else exclude_keywords

        logger.info(f'question category: {question_category}; question: {question}')

        response = None

        # Get the index of the given question_category
        index = self.index_category_doc_type_wise_index[question_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE]
        if not index:
            logger.error(f'Index for {question_category} not found! That means no PDFs, Text files, or URLs have been ingested and indexed so far. Ingest the new data for {question_category} and then querying again.')
            return response

        if self.index_type in ('FAISS', 'Chroma'):
            response = index.similarity_search(
                question,
                k=similarity_top_k
            )
        elif self.index_type == 'GPTVectorStoreIndex':
            # Querying the index
            response = index.query(
                question,
                mode=mode,
                response_mode=response_mode,
                similarity_top_k=similarity_top_k,
                required_keywords=required_keywords,
                exclude_keywords=exclude_keywords,
                verbose=verbose
            )

        return response

    def _load_uploaded_files(self, doc_type, files, extension, label, loader):
        """Append the documents of every uploaded file with the given extension to self.documents."""
        if not isinstance(files, list):
            files = [files]

        for uploaded_file in files:
            if not uploaded_file.name.endswith(extension):
                logger.warning(f'Found a file other than {extension} format. Cannot load {uploaded_file.name} file!')
                continue

            logger.info(f'Loading {label} from: {uploaded_file.name}')
            self.documents.extend(
                loader(
                    doc_filepath=uploaded_file.name,
                    doc_type=doc_type
                )
            )

    def load_uploaded_documents(
        self,
        doc_type,
        files_or_urls
    ):
        """
        Load uploaded files or urls as documents into self.documents.

        Args:
            doc_type: can be any of [pdf, textfile, online_pdf, urls]
            files_or_urls: uploaded file object(s) exposing `.name` for pdf/textfile; a text of urls (split via utils split_text) for online_pdf/urls
        """
        logger.info(f'Loading uploaded documents from: {doc_type}')

        if doc_type == 'pdf':
            # Load PDF as documents
            self._load_uploaded_files(
                doc_type, files_or_urls, '.pdf', 'PDF',
                self.data_loader_utils_obj.load_documents_from_pdf
            )
        elif doc_type == 'textfile':
            # Load textfile as documents
            self._load_uploaded_files(
                doc_type, files_or_urls, '.txt', 'textfile',
                self.data_loader_utils_obj.load_documents_from_text
            )
        elif doc_type == 'online_pdf':
            files_or_urls = self.utils_obj.split_text(files_or_urls)
            # Load online_pdfs as documents
            self.documents.extend(
                self.data_loader_utils_obj.load_documents_from_pdf(
                    doc_type=doc_type,
                    urls=files_or_urls
                )
            )
        elif doc_type == 'urls':
            files_or_urls = self.utils_obj.split_text(files_or_urls)
            # Load URLs as documents
            self.documents.extend(
                self.data_loader_utils_obj.load_documents_from_urls(
                    doc_type=doc_type,
                    urls=files_or_urls
                )
            )

        logger.info(f'Uploaded documents from: {doc_type} loaded successfully!')

    def upload_data(
        self,
        doc_type,
        files_or_urls,
        index_category
    ):
        """
        Index newly uploaded files/urls and merge them into the indices of index_category.
        """
        logger.info(f'Uploading data for: {index_category}; from: {doc_type}')

        self.documents = []
        self.index = None
        # Point self.index_filepath at this category/doc_type before the index is
        # created: create_index() persists Chroma stores there, and the path
        # otherwise still refers to whatever index was handled last.
        self.get_index_filepath(
            index_category=index_category,
            doc_type=doc_type
        )

        # Create documents of the uploaded files
        self.load_uploaded_documents(
            doc_type,
            files_or_urls
        )

        # Create the index from documents for search/retrieval
        self.index = self.create_index()

        # Update the existing index with the newly data
        self.upsert_index(
            doc_type=doc_type,
            index_category=index_category
        )

        logger.info(f'{index_category}-{doc_type} data uploaded successfully!')

    def upsert_index(
        self,
        doc_type,
        index_category
    ):
        """
        Updates the index of the given index_category-doc_type, if present.
        Creates a new index if index_category-doc_type index is not present.
        Also updates the master index for the given index_category.

        Does nothing when self.index is empty.
        """
        if not self.index:
            return

        logger.info(f'Upserting index for: {index_category}-{doc_type}')

        master_key = constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE
        category_indices = self.index_category_doc_type_wise_index.get(index_category, None)

        if not category_indices:
            # index_category has no indices yet: the new index becomes both the
            # master and the doc_type index.
            logger.info(f'Master index does not exist for: {index_category}. A new {index_category} master index & {doc_type} index would be created.')
            category_indices = self.index_category_doc_type_wise_index.setdefault(index_category, {})
            category_indices[master_key] = self.index
            category_indices[doc_type] = self.index

        elif not category_indices.get(doc_type, None):
            # doc_type has no index yet: the new index becomes the doc_type index
            # (and the master index if there is none).
            logger.info(f'{doc_type} index does not exist for: {index_category}-{doc_type}. A new {doc_type} index would be created.')
            category_indices[doc_type] = self.index

            if not category_indices.get(master_key, None):
                logger.info(f'Master index does not exist for: {index_category}-{doc_type}. A new master index would be created.')
                category_indices[master_key] = self.index

        else:
            # Existing index_category & doc_type: merge the new index into the
            # existing doc_type index.
            if not category_indices.get(master_key, None):
                logger.info(f'Master index does not exist for: {index_category}-{doc_type}. A new master index would be created.')
                category_indices[master_key] = self.index

            # Merge new self.index with existing doc_type index
            category_indices[doc_type].merge_from(self.index)
            # Update self.index to store/overwrite the existing index with the updated index
            self.index = category_indices[doc_type]

        # Store newly created/merged index
        self.store_index(
            index=self.index,
            index_filepath=self.get_index_filepath(
                index_category=index_category,
                doc_type=doc_type
            )
        )

        # Merge and store master index for index_category
        self.merge_store_master_index(
            index_category=index_category
        )

        logger.info(f'Index for: {index_category}-{doc_type} upserted successful!')

    def delete_index(
        self,
        ids: Optional[List[str]] = None,
        # filter: Optional[DocumentMetadataFilter] = None,
        delete_all: Optional[bool] = None,
    ):
        """
        Removes vectors by ids, filter, or everything in the datastore.
        Multiple parameters can be used at once.

        Raises:
            NotImplementedError: always; deletion is not implemented yet
        """
        logger.info('Deleting index')

        # NOTE: a possible implementation is to delete a specific collection
        # (self.index.delete_collection() followed by self.index.persist())
        # or to remove the persist directory (self.index_filepath).
        raise NotImplementedError

    def get_index_category_wise_data_sources(
        self
    ):
        """
        Collect the 'source' metadata of all indexed documents per index_category and doc_type.

        Returns:
            {index_category: {doc_type: set of sources}}; the master index is skipped
        """
        for index_category, doc_type_indices in self.index_category_doc_type_wise_index.items():
            category_sources = self.index_category_doc_type_wise_data_sources.setdefault(index_category, {})

            for dt, index in doc_type_indices.items():
                if dt == 'master':
                    continue

                sources = category_sources.setdefault(dt, set())
                if not index:
                    continue

                # Only stores exposing an in-memory docstore (as the FAISS store
                # does) can be inspected; others are skipped.
                stored_docs = getattr(getattr(index, 'docstore', None), '_dict', None)
                if not stored_docs:
                    continue

                for val in stored_docs.values():
                    if val.metadata.get('source'):
                        sources.add(val.metadata['source'])

        return self.index_category_doc_type_wise_data_sources

    def save_answer_feeback(
        self,
        question_category,
        question,
        answer,
        feedback
    ):
        """
        Append the feedback on an answer to the feedback TSV file of question_category.
        """
        logger.info(f'Question category: {question_category}')
        logger.info(f'Question: {question}')
        logger.info(f'Answer: {answer}')
        logger.info(f'Answer feedback is: {feedback}')

        # Make sure the output directory exists before reading/writing the file
        os.makedirs(constants_utils.OUTPUT_PATH_ANSWER_FEEDBACK, exist_ok=True)
        feedback_filepath = os.path.join(
            constants_utils.OUTPUT_PATH_ANSWER_FEEDBACK,
            f'{constants_utils.OUTPUT_PATH_ANSWER_FEEDBACK_FILE_PREFIX}_{question_category}.tsv'
        )

        if os.path.exists(feedback_filepath):
            df = pd.read_csv(feedback_filepath, sep=constants_utils.OUTPUT_PATH_ANSWER_FEEDBACK_FILE_SAVE_SEPARATOR)
        else:
            df = pd.DataFrame(columns=['question_category', 'question', 'answer', 'feedback', 'timestamp'])

        # Append answer feedback to df
        df.loc[len(df)] = {
            'question_category': question_category,
            'question': question,
            'answer': answer,
            'feedback': feedback,
            # Millisecond precision: drop the last three microsecond digits
            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
        }

        # Save df into TSV format
        df.to_csv(feedback_filepath, sep=constants_utils.OUTPUT_PATH_ANSWER_FEEDBACK_FILE_SAVE_SEPARATOR, index=False, header=True)

    def get_sources_of_relevant_paragraphs(
        self,
        relevant_paragraphs
    ):
        """
        Return the metadata (source, page, ...) of each relevant paragraph.

        Page numbers of PDF sources are converted to 1-based numbering. The
        metadata is copied first so that the documents themselves are never
        modified; incrementing them in place would make page numbers drift on
        repeated queries if the store hands back its own objects.
        """
        sources_relevant_paragraphs = []

        for doc in relevant_paragraphs:
            metadata = dict(doc.metadata)
            if 'source' in metadata and 'page' in metadata and metadata['source'].endswith('.pdf'):
                # Need to add +1 as PyPDFLoader sets page number from 0th-index
                metadata['page'] += 1
            sources_relevant_paragraphs.append(metadata)

        return sources_relevant_paragraphs

    def clean_relevant_paragraphs(
        self,
        relevant_paragraphs
    ):
        """
        Return the page content of each relevant paragraph, passed through utils replace_newlines_and_spaces.
        """
        return [
            self.utils_obj.replace_newlines_and_spaces(doc.page_content)
            for doc in relevant_paragraphs
        ]