import src.constants as constants_utils
import src.data_loader as data_loader_utils
import src.utils as utils
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
import openai
from langchain.vectorstores import Chroma
import chromadb
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.prompts import PromptTemplate
from llama_index import GPTVectorStoreIndex, GPTListIndex
from langchain.vectorstores import FAISS
import pickle
import shutil
from typing import Dict, List, Optional
import pandas as pd
from datetime import datetime
import os
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', '')
import logging
logging.basicConfig(
format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
level=logging.INFO,
datefmt="%Y-%m-%d %H:%M:%S"
)
logger = logging.getLogger(__name__)
import warnings
warnings.filterwarnings('ignore')
class LANGCHAIN_UTILS:
def __init__(self,
index_type=constants_utils.INDEX_TYPE,
load_from_existing_index_store=constants_utils.LOAD_FROM_EXISTING_INDEX_STORE
):
self.index_type = index_type
self.load_from_existing_index_store = load_from_existing_index_store
# Temporary index in the current context for the doc_type in consideration
self.index = None
# Master index that contains data from multiple sources (PDF, online PDF, text files, URLs, etc.). It gets updated on demand when data from new files/URLs is uploaded, without application downtime.
self.master_index = None
# Data source wise index
self.index_category_doc_type_wise_index = dict(
(ic, dict(
(ds, None) for ds in list(constants_utils.DATA_SOURCES.values()))
) for ic in constants_utils.INDEX_CATEGORY)
# Initialize master index for each INDEX_CATEGORY
for ic in constants_utils.INDEX_CATEGORY:
self.index_category_doc_type_wise_index[ic][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = None
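# Illustrative sketch (not executed) of the resulting structure, assuming hypothetical values
# INDEX_CATEGORY = ['crops'], DATA_SOURCES = {'PDF': 'pdf', 'URLs': 'urls'} and
# INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE = 'master':
#   self.index_category_doc_type_wise_index == {
#       'crops': {'pdf': None, 'urls': None, 'master': None}
#   }
# Each None is later replaced by a vector store (e.g. a FAISS index) for that doc_type.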
# Data loaded in Document format in the current context for the doc_type under consideration
self.documents = []
# Instantiate data_loader_utils class object
self.data_loader_utils_obj = data_loader_utils.DATA_LOADER()
# Instantiate UTILS class object
self.utils_obj = utils.UTILS()
# Initialize embeddings (we can also use other embeddings)
self.embeddings = OpenAIEmbeddings(openai_api_key=os.getenv('OPENAI_API_KEY'))
# Initialize LLM model
self.llm = OpenAI(
temperature=0,
max_tokens=constants_utils.LLM_RESPONSE_MAX_TOKENS,
model_name=constants_utils.LLM_BASE_MODEL_NAME
)
# Global history for AgGPT widget
self.global_history = [
{
"role": "assistant",
"content": "Hi, I am a chatbot. I can converse in English. I can answer your questions about farming in India. Ask me anything!"
}
]
# Index category - doc_type wise data sources to display in widget
self.index_category_doc_type_wise_data_sources = {}
def user(
self,
user_message,
history
):
history = history + [[user_message, None]]
self.global_history = self.global_history + [{"role": "user", "content": user_message}]
return "", history
def get_chatgpt_response(
self,
history
):
output = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=history)
history.append({"role": "assistant", "content": output.choices[0].message.content})
return output.choices[0].message.content, history
def bot(
self,
history
):
response, self.global_history = self.get_chatgpt_response(self.global_history)
history[-1][1] = response
return history
def clear_history(
self,
lang="English"
):
self.global_history = [{"role": "assistant", "content": "Hi, I am a chatbot. I can converse in {}. I can answer your questions about farming in India. Ask me anything!".format(lang)}]
return None
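# Typical widget flow (sketch; 'obj' is an instantiated LANGCHAIN_UTILS and the call requires the OpenAI API):
#   _, history = obj.user('How do I control aphids?', history=[])
#   history = obj.bot(history)   # history[-1][1] now holds the assistant's reply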
def generate_prompt_template(
self,
prompt_type,
input_variables
):
prompt_template = ''
if prompt_type == 'summarize':
prompt_template = """Write a concise summary of the following:
{text}
SUMMARIZE IN ENGLISH:"""
elif prompt_type == 'qa':
prompt_template = """You are a helpful AI assistant. Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. If the question is not related to the context, politely respond that you are tuned to only answer questions that are related to the context.
{context}
Question: {question}
Answer in English:"""
# This variant works well but tends to truncate the answer; the final template below (which asks for a comprehensive answer) is the one actually used.
prompt_template = """You are a helpful AI assistant. Use the following pieces of context to answer the question at the end. Start the answer with a short summary, then continue with "Here are some of the key points:" and write each sentence separately with numbering. If you don't know the answer, just say that you don't know, don't try to make up an answer. If the question is not related to the context, politely respond that you are tuned to only answer questions that are related to the context.
{context}
Question: {question}
Answer in English:"""
prompt_template = """You are a helpful AI assistant. Use the following pieces of context to answer the question comprehensively at the end. Start the answer by giving short summary and write the answer starting with Here are some of the key points:. Write each sentence separately with numbering. If you don't know the answer, just say that you don't know, don't try to make up an answer. If the question is not related to the context, politely respond that you are tuned to only answer questions that are related to the context.
{context}
Question: {question}
Answer in English:"""
elif prompt_type == 'weather':
prompt_template = """
What would be the weather based on the below data:
{text}
"""
PROMPT = PromptTemplate(template=prompt_template, input_variables=input_variables)
return PROMPT
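# Minimal usage sketch (hypothetical variable 'obj' is an instantiated LANGCHAIN_UTILS):
#   qa_prompt = obj.generate_prompt_template(prompt_type='qa', input_variables=['context', 'question'])
#   qa_prompt.format(context='...', question='...')  # fills the template placeholders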
def get_textual_summary(
self,
text,
chain_type="stuff",
custom_prompt=True,
prompt_type='summarize'
):
texts = [text]
docs = [Document(page_content=t) for t in texts[:3]]
if custom_prompt:
PROMPT = self.generate_prompt_template(
prompt_type=prompt_type,
input_variables=["text"]
)
chain = load_summarize_chain(self.llm, chain_type=chain_type, prompt=PROMPT)
else:
chain = load_summarize_chain(self.llm, chain_type=chain_type)
text_summary = chain.run(docs)
return text_summary
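# Usage sketch (hypothetical text; 'obj' is an instantiated LANGCHAIN_UTILS):
#   summary = obj.get_textual_summary('Long agronomy article text ...')
# With custom_prompt=True (the default), the 'summarize' template from generate_prompt_template()
# is applied through the "stuff" summarize chain.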
def get_weather_forecast_summary(
self,
text,
chain_type="stuff"
):
text = f"""
What would be the weather based on the below data:
{text}
Give simple response without technical numbers which can be explained to human.
"""
texts = [text]
docs = [Document(page_content=t) for t in texts[:3]]
chain = load_summarize_chain(self.llm, chain_type=chain_type)
text_summary = chain.run(docs)
return text_summary
def get_answer_from_para(
self,
para,
question,
chain_type="stuff",
custom_prompt=True,
prompt_type='qa'
):
# Prepare data (Split paragraph into chunks of small documents)
text_splitter = CharacterTextSplitter(
chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE,
chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP,
separator=constants_utils.TEXT_SPLITTER_SEPARATOR
)
texts = text_splitter.split_text(para)
if self.index_type == 'FAISS':
# Find similar docs that are relevant to the question
docsearch = FAISS.from_texts(
texts, self.embeddings,
metadatas=[{"source": str(i+1)} for i in range(len(texts))]
)
elif self.index_type == 'Chroma':
# Find similar docs that are relevant to the question
docsearch = Chroma.from_texts(
texts, self.embeddings,
metadatas=[{"source": str(i+1)} for i in range(len(texts))]
)
# Search for the similar docs
docs = docsearch.similarity_search(question, k=constants_utils.ANSWER_SIMILARITY_TOP_K)
# Create a Chain for question answering
if custom_prompt:
PROMPT = self.generate_prompt_template(
prompt_type=prompt_type,
input_variables=["context", "question"]
)
chain = load_qa_chain(self.llm, chain_type=chain_type, prompt=PROMPT)
else:
# chain = load_qa_with_sources_chain(self.llm, chain_type=chain_type)
chain = load_qa_chain(self.llm, chain_type=chain_type)
# chain.run(input_documents=docs, question=question)
out_dict = chain({"input_documents": docs, "question": question}, return_only_outputs=True)
return out_dict['output_text']
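# Usage sketch (hypothetical inputs; 'obj' is an instantiated LANGCHAIN_UTILS):
#   answer = obj.get_answer_from_para(
#       para='Long paragraph about drip irrigation ...',
#       question='What is drip irrigation?'
#   )
# The paragraph is chunked, embedded, searched for the top-k relevant chunks, and the chunks
# plus the question are passed to the QA chain to produce the answer text.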
def load_documents(
self,
doc_type,
doc_filepath='',
urls=[]
):
"""
Load data in Document format of the given doc_type from either doc_filepath or list of urls.
It can load multiple files/urls in one shot.
Args:
doc_type: can be any of [pdf, online_pdf, urls, textfile]
doc_filepath: can be a directory or a filepath
urls: list of urls
"""
logger.info(f'Loading {doc_type} data into Documents format')
if doc_type == 'pdf':
# Load data from PDFs stored in local directory
self.documents.extend(
self.data_loader_utils_obj.load_documents_from_pdf(
doc_filepath=doc_filepath,
doc_type=doc_type
))
elif doc_type == 'online_pdf':
# Load data from online PDFs via the given URLs
self.documents.extend(
self.data_loader_utils_obj.load_documents_from_pdf(
urls=urls,
doc_type=doc_type
))
elif doc_type == 'urls':
# Load data from URLs
self.documents.extend(
self.data_loader_utils_obj.load_documents_from_urls(
urls=urls,
doc_type=doc_type
))
elif doc_type == 'textfile':
# Load data from text files & Convert texts into Document format
self.documents.extend(
self.data_loader_utils_obj.load_documents_from_text(
doc_filepath=doc_filepath,
doc_type=doc_type
))
elif doc_type == 'directory':
# Load data from local directory
self.documents.extend(
self.data_loader_utils_obj.load_documents_from_directory(
doc_filepath=doc_filepath,
doc_type=doc_type
))
logger.info(f'{doc_type} data loaded into Documents format successfully!')
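# Usage sketch (hypothetical paths/URLs; 'obj' is an instantiated LANGCHAIN_UTILS):
#   obj.load_documents(doc_type='pdf', doc_filepath='/path/to/pdf_dir')
#   obj.load_documents(doc_type='urls', urls=['https://example.com/page'])
# Loaded documents accumulate in obj.documents and are consumed by create_index().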
def create_index(
self
):
if not self.documents:
logger.warning(f'Empty documents. Index cannot be created!')
return None
logger.info(f'Creating index')
text_splitter = CharacterTextSplitter(
chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE,
chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP,
separator=constants_utils.TEXT_SPLITTER_SEPARATOR
)
self.documents = text_splitter.split_documents(self.documents)
############## Build the Vector store for docs ##############
# Vector store using Facebook AI Similarity Search
if self.index_type == 'FAISS':
self.index = FAISS.from_documents(
self.documents,
self.embeddings
)
# Vector store using Chroma DB
elif self.index_type == 'Chroma':
if not os.path.exists(self.index_filepath):
os.makedirs(self.index_filepath)
self.index = Chroma.from_documents(
self.documents,
self.embeddings,
persist_directory=self.index_filepath
)
# Vector store using GPT vector index
elif self.index_type == 'GPTVectorStoreIndex':
self.index = GPTVectorStoreIndex.from_documents(self.documents)
logger.info(f'Index created successfully!')
return self.index
def get_index_filepath(
self,
index_category,
doc_type
):
if doc_type == 'master':
self.index_filepath = os.path.join(
constants_utils.OUTPUT_PATH, f'index_{index_category}') if self.index_type in ['FAISS', 'Chroma'] else os.path.join(constants_utils.OUTPUT_PATH, f'index_{index_category}.json')
else:
self.index_filepath = os.path.join(
constants_utils.OUTPUT_PATH, f'index_{index_category}', f'index_{doc_type}') if self.index_type in ['FAISS', 'Chroma'] else os.path.join(constants_utils.OUTPUT_PATH, f'index_{index_category}', f'index_{doc_type}.json')
return self.index_filepath
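# Illustrative results (assuming, for illustration only, OUTPUT_PATH = 'output' and index_type = 'FAISS'):
#   get_index_filepath('crops', 'master') -> 'output/index_crops'
#   get_index_filepath('crops', 'pdf')    -> 'output/index_crops/index_pdf'
# For index types other than FAISS/Chroma, '.json' is appended to the final path component.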
def load_master_doctype_indices_for_index_category(
self,
index_category
):
logger.info(f'Loading master and doc_type indices for: {index_category}')
# Set master index of index_category = None
self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = None
for doc_type in self.index_category_doc_type_wise_index[index_category].keys():
self.index = None
self.index_filepath = self.get_index_filepath(
index_category=index_category,
doc_type=doc_type
)
self.load_index()
# Set master/doc_type index
self.index_category_doc_type_wise_index[index_category][doc_type] = self.index
logger.info(f'Master and doc_type indices for: {index_category} loaded successfully!')
def load_create_index(
self
):
logger.info(f'Loading/Creating index for each index_category')
for index_category in constants_utils.INDEX_CATEGORY:
# Load master index_category index if self.load_from_existing_index_store == True
if self.load_from_existing_index_store:
self.load_master_doctype_indices_for_index_category(index_category)
# If, for any reason, the master index is not loaded, create a new index/vector store
if not self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE]:
logger.info(f'Creating a new Vector/Index store for: {index_category}')
doc_filepath = os.path.join(constants_utils.DATA_PATH, index_category)
urls = []
# Build the Vector/Index store
for doc_type in list(constants_utils.DATA_SOURCES.values()):
logger.info(f'Creating a new Vector/Index store for: {index_category} from data source: {doc_type}')
index = None
if doc_type in ['pdf', 'textfile']:
index = self.create_store_index(
doc_type=doc_type,
doc_filepath=doc_filepath,
index_category=index_category
)
else:
# Build the Vector/Index store from web urls
index = self.create_store_index(
doc_type=doc_type,
urls=urls,
index_category=index_category
)
if index:
self.index_category_doc_type_wise_index[index_category][doc_type] = index
logger.info(f'New Vector/Index store for: {index_category} from data source: {doc_type} created successfully!')
logger.info(f'New Vector/Index store for: {index_category} created successfully!')
# Merge index of each doc_type into a single index_category
self.merge_store_master_index(
index_category=index_category
)
logger.info(f'Index for each index_category loaded successfully!')
def create_store_index(
self,
doc_type='pdf',
doc_filepath=constants_utils.DATA_PATH,
urls=[],
index_category=constants_utils.INDEX_CATEGORY[0]
):
logger.info(f'Creating and storing {doc_type} index')
self.documents = []
self.index = None
self.index_filepath = self.get_index_filepath(
index_category=index_category,
doc_type=doc_type
)
# Delete the old index file
shutil.rmtree(self.index_filepath, ignore_errors=True)
logger.info(f'{self.index_filepath} deleted.')
# Load data in Documents format that can be consumed for index creation
self.load_documents(
doc_type,
doc_filepath,
urls
)
# Create the index from documents for search/retrieval
self.index = self.create_index()
# Store index
self.store_index(
index=self.index,
index_filepath=self.index_filepath
)
logger.info(f'{doc_type} index created and stored successfully!')
# Return the index of the given doc_type (an index for a single doc_type). Indices from multiple doc_types are merged later into the master index so that queries can be made against a single index.
return self.index
def store_index(
self,
index,
index_filepath
):
if not index:
logger.warning(f'Cannot write an empty index to: {index_filepath}!')
return
logger.info(f'Saving index to: {index_filepath}')
# Create the index directory if it does not already exist (FAISS/Chroma persist to a directory)
if self.index_type in ('FAISS', 'Chroma') and not os.path.exists(index_filepath):
os.makedirs(index_filepath)
if self.index_type == 'FAISS':
index.save_local(index_filepath)
elif self.index_type == 'Chroma':
index.persist()
elif self.index_type == 'GPTVectorStoreIndex':
index.save_to_disk(index_filepath)
elif self.index_type == 'pickle':
with open(index_filepath, "wb") as f:
pickle.dump(index, f)
logger.info(f'Index saved to: {index_filepath} successfully!')
def load_index(
self
):
logger.info(f'Loading index from: {self.index_filepath}')
if not os.path.exists(self.index_filepath):
logger.warning(f"Cannot load index from {self.index_filepath} as the path doest not exist!")
return
if self.index_type == 'FAISS':
self.index = FAISS.load_local(self.index_filepath, self.embeddings)
elif self.index_type == 'Chroma':
self.index = Chroma(
persist_directory=self.index_filepath,
embedding_function=self.embeddings
)
elif self.index_type == 'GPTVectorStoreIndex':
self.index = GPTVectorStoreIndex.load_from_disk(self.index_filepath)
elif self.index_type == 'pickle':
with open(self.index_filepath, "rb") as f:
self.index = pickle.load(f)
logger.info(f'Index loaded from: {self.index_filepath} successfully!')
def convert_text_to_documents(
self,
text_list=[]
):
"""
Converts the list of text data to Documents format that can be fed to the GPT API to build the vector store
"""
from llama_index import Document
documents = [Document(t) for t in text_list]
return documents
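# Usage sketch (hypothetical texts; 'obj' is an instantiated LANGCHAIN_UTILS):
#   docs = obj.convert_text_to_documents(['First passage ...', 'Second passage ...'])
# The returned llama_index Documents can then be fed to GPTVectorStoreIndex.from_documents().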
def merge_documents_from_different_sources(
self,
doc_documents,
url_documents
):
# Build the Vector store for docs
doc_index = GPTVectorStoreIndex.from_documents(doc_documents)
# Build the Vector store for URLs
url_index = GPTVectorStoreIndex.from_documents(url_documents)
# Set summary of each index
doc_index.set_text("index_from_docs")
url_index.set_text("index_from_urls")
# Merge index of different data sources
index = GPTListIndex([doc_index, url_index])
return index
def merge_store_master_index(
self,
index_category
):
"""
Merge multiple doc_type indices into a single master index. Query/search would be performed on this merged index.
Args:
index_category: index_category (can be any of: [crops, fruits, pest_management, govt_policy, soil, etc.])
"""
logger.info(f'Merging doc_type indices of {index_category} into its master index')
self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = None
doc_type_indices = self.index_category_doc_type_wise_index[index_category]
if self.index_type == 'FAISS':
for doc_type, index in doc_type_indices.items():
if doc_type == constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE:
# Only merge the non-master doc_type_indices
continue
if not index or not isinstance(index, FAISS):
logger.warning(f'{doc_type} index to be merged is not an instance of type langchain.vectorstores.faiss.FAISS')
continue
if not self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE]:
self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = index
else:
self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE].merge_from(index)
elif self.index_type == 'Chroma':
for doc_type, index in doc_type_indices.items():
if not index or not isinstance(index, Chroma):
logger.warning(f'{doc_type} index to be merged is not an instance of type langchain.vectorstores.Chroma')
continue
raise NotImplementedError
elif self.index_type == 'GPTVectorStoreIndex':
for doc_type, index in doc_type_indices.items():
if not index or not isinstance(index, GPTVectorStoreIndex):
logger.warning(f'{doc_type} index to be merged is not an instance of type llama_index.GPTVectorStoreIndex')
continue
raise NotImplementedError
# Store index_category master index
self.store_index(
index=self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE],
index_filepath=self.get_index_filepath(
index_category=index_category,
doc_type=constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE
)
)
logger.info(f'doc_type indices of {index_category} merged into its master index successfully!')
def init_chromadb(self):
logger.info('Initializing Chroma DB')
if not os.path.exists(self.index_filepath):
os.makedirs(self.index_filepath)
client_settings = chromadb.config.Settings(
chroma_db_impl="duckdb+parquet",
persist_directory=self.index_filepath,
anonymized_telemetry=False
)
self.index = Chroma(
collection_name="langchain_store",
embedding_function=self.embeddings,
client_settings=client_settings,
persist_directory=self.index_filepath,
)
logger.info('Chroma DB initialized successfully!')
def query_chromadb(
self,
question,
k=1
):
return self.index.similarity_search(query=question, k=k)
def query(self,
question,
question_category,
mode=constants_utils.MODE,
response_mode=constants_utils.RESPONSE_MODE,
similarity_top_k=constants_utils.SIMILARITY_TOP_K,
required_keywords=[],
exclude_keywords=[],
verbose=False
):
'''
Args:
mode: can be any of [default, embedding]
response_mode: can be any of [default, compact, tree_summarize]
'''
logger.info(f'question category: {question_category}; question: {question}')
response = None
# Get the index of the given question_category
index = self.index_category_doc_type_wise_index[question_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE]
if not index:
logger.error(f'Index for {question_category} not found! That means no PDFs, text files, or URLs have been ingested and indexed for it so far. Ingest new data for {question_category} and then query again.')
return response
if self.index_type == 'FAISS':
response = index.similarity_search(
question,
k=similarity_top_k
)
elif self.index_type == 'Chroma':
response = index.similarity_search(
question,
k=similarity_top_k
)
elif self.index_type == 'GPTVectorStoreIndex':
# Querying the index
response = index.query(
question,
mode=mode,
response_mode=response_mode,
similarity_top_k=similarity_top_k,
required_keywords=required_keywords,
exclude_keywords=exclude_keywords,
verbose=verbose
)
return response
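# Usage sketch (hypothetical question/category; 'obj' is an instantiated LANGCHAIN_UTILS):
#   results = obj.query(
#       question='Which crops grow well in black soil?',
#       question_category='crops'
#   )
# For FAISS/Chroma, 'results' is a list of similar Documents; for GPTVectorStoreIndex it is
# the response object returned by index.query().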
def load_uploaded_documents(
self,
doc_type,
files_or_urls
):
logger.info(f'Loading uploaded documents from: {doc_type}')
if doc_type == 'pdf':
if not isinstance(files_or_urls, list):
files_or_urls = [files_or_urls]
for pdf in files_or_urls:
if not pdf.name.endswith('.pdf'):
logger.warning(f'Found a file that is not in .pdf format. Cannot load {pdf.name}!')
continue
logger.info(f'Loading PDF from: {pdf.name}')
# Load PDF as documents
self.documents.extend(
self.data_loader_utils_obj.load_documents_from_pdf(
doc_filepath=pdf.name,
doc_type=doc_type
)
)
elif doc_type == 'textfile':
if not isinstance(files_or_urls, list):
files_or_urls = [files_or_urls]
for text_file in files_or_urls:
if not text_file.name.endswith('.txt'):
logger.warning(f'Found a file that is not in .txt format. Cannot load {text_file.name}!')
continue
logger.info(f'Loading textfile from: {text_file.name}')
# Load textfile as documents
self.documents.extend(
self.data_loader_utils_obj.load_documents_from_text(
doc_filepath=text_file.name,
doc_type=doc_type
)
)
elif doc_type == 'online_pdf':
files_or_urls = self.utils_obj.split_text(files_or_urls)
# Load online_pdfs as documents
self.documents.extend(
self.data_loader_utils_obj.load_documents_from_pdf(
doc_type=doc_type,
urls=files_or_urls
)
)
elif doc_type == 'urls':
files_or_urls = self.utils_obj.split_text(files_or_urls)
# Load URLs as documents
self.documents.extend(
self.data_loader_utils_obj.load_documents_from_urls(
doc_type=doc_type,
urls=files_or_urls
)
)
logger.info(f'Uploaded documents from: {doc_type} loaded successfully!')
def upload_data(
self,
doc_type,
files_or_urls,
index_category
):
logger.info(f'Uploading data for: {index_category}; from: {doc_type}')
self.documents = []
self.index = None
# Create documents of the uploaded files
self.load_uploaded_documents(
doc_type,
files_or_urls
)
# Create the index from documents for search/retrieval
self.index = self.create_index()
# Update the existing index with the newly uploaded data
self.upsert_index(
doc_type=doc_type,
index_category=index_category
)
logger.info(f'{index_category}-{doc_type} data uploaded successfully!')
def upsert_index(
self,
doc_type,
index_category
):
"""
Updates the index of the given index_category-doc_type, if present.
Creates a new index if index_category-doc_type index is not present.
Also updates the master index for the given index_category.
"""
if not self.index:
return
logger.info(f'Upserting index for: {index_category}-{doc_type}')
if not self.index_category_doc_type_wise_index.get(index_category, None):
"""
If the index_category index does not exist
Steps:
- set index_category index
- set doc_type index
- Store new index_category index as master
- Store new doc_type index
"""
logger.info(f'Master index does not exist for: {index_category}. A new {index_category} master index & {doc_type} index would be created.')
self.index_category_doc_type_wise_index.setdefault(index_category, {})
# Set a master index only if it doesn't exist; otherwise keep its value as-is.
self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = self.index
# Set an index for the given doc_type only if it doesn't exist; otherwise keep its value as-is.
self.index_category_doc_type_wise_index[index_category][doc_type] = self.index
elif not self.index_category_doc_type_wise_index[index_category].get(doc_type, None):
"""
If the doc_type index does not exist
Steps:
- set doc_type index
- if master index does not exist for the index_category - set a master index
- if master index exists - update the master index to merge it with doc_type index
- Store new/updated index_category index as master
- Store new doc_type index
"""
logger.info(f'{doc_type} index does not exist for: {index_category}-{doc_type}. A new {doc_type} index would be created.')
# create doc_type index
self.index_category_doc_type_wise_index[index_category][doc_type] = self.index
# if master index does not exist for the index_category - create a master index
if not self.index_category_doc_type_wise_index[index_category].get(constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE, None):
logger.info(f'Master index does not exist for: {index_category}-{doc_type}. A new master index would be created.')
self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = self.index
else:
"""
If the new document is of the existing index_category & doc_type
Steps:
- if master index does not exist for the index_category - set a master index
- if master index exists - update the master index to merge it with doc_type index
- update the doc_type index
- Store updated index_category index as master
- Store updated doc_type index
"""
# if master index does not exist for the index_category - create a master index
if not self.index_category_doc_type_wise_index[index_category].get(constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE, None):
logger.info(f'Master index does not exist for: {index_category}-{doc_type}. A new master index would be created.')
self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = self.index
# Merge new self.index with existing doc_type index
self.index_category_doc_type_wise_index[index_category][doc_type].merge_from(self.index)
# Update self.index to store/overwrite the existing index with the updated index
self.index = self.index_category_doc_type_wise_index[index_category][doc_type]
# Store newly created/merged index
self.store_index(
index=self.index,
index_filepath=self.get_index_filepath(
index_category=index_category,
doc_type=doc_type
)
)
# Merge and store master index for index_category
self.merge_store_master_index(
index_category=index_category
)
logger.info(f'Index for: {index_category}-{doc_type} upserted successfully!')
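# Usage sketch of the upload flow that ends in upsert_index (hypothetical file object and category):
#   obj.upload_data(doc_type='pdf', files_or_urls=[uploaded_pdf], index_category='crops')
# upload_data() builds obj.index from the uploaded files, and upsert_index() then merges it into
# both the existing doc_type index and the index_category master index before persisting them.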
def delete_index(
self,
ids: Optional[List[str]] = None,
# filter: Optional[DocumentMetadataFilter] = None,
delete_all: Optional[bool] = None,
):
"""
Removes vectors by ids, filter, or everything in the datastore.
Multiple parameters can be used at once.
Returns whether the operation was successful.
"""
logger.info(f'Deleting index')
raise NotImplementedError
# NOTE (unreachable until implemented): a specific Chroma collection could be deleted as follows
self.index.delete_collection()
self.index.persist()
# Or just nuke the persist directory
# !rm -rf self.index_filepath
def get_index_category_wise_data_sources(
self
):
# self.index_category_doc_type_wise_data_sources
for index_category, doc_type in self.index_category_doc_type_wise_index.items():
self.index_category_doc_type_wise_data_sources.setdefault(index_category, {})
for dt in doc_type.keys():
if dt == 'master':
continue
self.index_category_doc_type_wise_data_sources[index_category].setdefault(dt, set())
if doc_type[dt]:
docs = doc_type[dt].docstore._dict
for doc, val in docs.items():
if 'source' in val.metadata and val.metadata['source']:
self.index_category_doc_type_wise_data_sources[index_category][dt].add(val.metadata['source'])
return self.index_category_doc_type_wise_data_sources
def save_answer_feeback(
self,
question_category,
question,
answer,
feedback
):
logger.info(f'Question category: {question_category}')
logger.info(f'Question: {question}')
logger.info(f'Answer: {answer}')
logger.info(f'Answer feedback is: {feedback}')
feedback_filepath = os.path.join(
constants_utils.OUTPUT_PATH_ANSWER_FEEDBACK,
f'{constants_utils.OUTPUT_PATH_ANSWER_FEEDBACK_FILE_PREFIX}_{question_category}.tsv'
)
if os.path.exists(feedback_filepath):
df = pd.read_csv(feedback_filepath, sep=constants_utils.OUTPUT_PATH_ANSWER_FEEDBACK_FILE_SAVE_SEPARATOR)
else:
df = pd.DataFrame(columns=['question_category', 'question', 'answer', 'feedback', 'timestamp'])
# Append answer feedback to df
df.loc[len(df)] = {
'question_category': question_category,
'question': question,
'answer': answer,
'feedback': feedback,
'timestamp': datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S.%f')[:-3]
}
# Save df into TSV format
df.to_csv(feedback_filepath, sep=constants_utils.OUTPUT_PATH_ANSWER_FEEDBACK_FILE_SAVE_SEPARATOR, index=False, header=True)
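# Illustrative row appended to the feedback TSV (all values hypothetical):
#   question_category   question                        answer                     feedback   timestamp
#   crops               Which crops suit black soil?    Crops such as cotton ...   positive   2023-06-01 10:15:30.123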
def get_sources_of_relevant_paragraphs(
self,
relevant_paragraphs
):
sources_relevant_paragraphs = []
# Extract information on Source of relevant_paragraphs
for indx, doc in enumerate(relevant_paragraphs):
if 'source' in doc.metadata and 'page' in doc.metadata and doc.metadata['source'].endswith('.pdf'):
# Add 1 because PyPDFLoader numbers pages starting from 0
relevant_paragraphs[indx].metadata['page'] += 1
sources_relevant_paragraphs = [doc.metadata for doc in relevant_paragraphs]
return sources_relevant_paragraphs
def clean_relevant_paragraphs(
self,
relevant_paragraphs
):
cleaned_relevant_paragraphs = []
for doc in relevant_paragraphs:
cleaned_relevant_paragraphs.append(self.utils_obj.replace_newlines_and_spaces(doc.page_content))
return cleaned_relevant_paragraphs
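# Minimal end-to-end usage sketch (guarded so importing this module stays side-effect free).
# It assumes the constants referenced above (INDEX_TYPE, INDEX_CATEGORY, DATA_PATH, etc.) are
# configured, that the corresponding data exists, that OPENAI_API_KEY is set in the environment,
# and that 'crops' is one of the configured index categories (illustrative only).
if __name__ == '__main__':
    langchain_utils_obj = LANGCHAIN_UTILS()
    # Load existing indices (or build new ones) for every index_category
    langchain_utils_obj.load_create_index()
    # Query the master index of the hypothetical 'crops' category
    print(langchain_utils_obj.query(
        question='Which crops grow well in black soil?',
        question_category='crops'
    ))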