sxandie committed
Commit e899e0f · verified · 1 Parent(s): d0803b5

Upload 22 files

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ data/docs/microsoft-2023.pdf filter=lfs diff=lfs merge=lfs -text
+ data/vectordb/processed/chroma/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,17 @@
+ # Use an official Python runtime as a parent image
+ FROM python:3.9-slim
+
+ # Set the working directory
+ WORKDIR /frontend
+
+ # Copy the current directory contents into the container at /frontend
+ COPY . /frontend
+
+ # Install any needed packages specified in requirements.txt
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Make port 7860 available to the world outside this container
+ EXPOSE 7860
+
+ # Run the Gradio app when the container launches
+ CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,109 @@
+ """
+ This module uses Gradio to create an interactive web application for a chatbot with various features.
+
+ The application interface is organized into three rows:
+ 1. The first row contains a Chatbot component that simulates a conversation with a language model, along with a
+    reference bar that is hidden initially and can be toggled with a button. The chatbot supports feedback in the
+    form of like and dislike icons.
+
+ 2. The second row consists of a Textbox for user input. Users can enter text or upload PDF/doc files.
+
+ 3. The third row includes buttons for submitting text, toggling the reference bar visibility, uploading PDF/doc files,
+    adjusting the temperature of the model responses, selecting the document type, and clearing the input.
+
+ The application processes user interactions:
+ - Uploaded files trigger the processing of the files, updating the input and chatbot components.
+ - Submitting text triggers the chatbot to respond, considering the selected document type and temperature settings.
+   The response is displayed in the Textbox and Chatbot components, and the reference bar may be updated.
+
+ The application can be run as a standalone script, launching the Gradio interface for users to interact with the chatbot.
+
+ Note: This docstring gives an overview of the module's purpose and functionality; comments within the code explain
+ specific components, interactions, and logic.
+ """
+ import gradio as gr
+ from src.upload_file import UploadFile
+ from src.finbot import ChatBot
+ from src.ui_settings import UISettings
+
+
+ with gr.Blocks() as demo:
+     with gr.Tabs():
+         with gr.TabItem("FinGPT"):
+             # First row: reference sidebar (hidden by default) and the chatbot display.
+             with gr.Row() as row_one:
+                 with gr.Column(visible=False) as reference_bar:
+                     ref_output = gr.Markdown()
+
+                 with gr.Column() as chatbot_output:
+                     chatbot = gr.Chatbot(
+                         [],
+                         elem_id="chatbot",
+                         bubble_full_width=False,
+                         height=500,
+                         avatar_images=("images/user.png", "images/chatbot.png"),
+                     )
+                     chatbot.like(UISettings.feedback, None, None)  # like/dislike feedback
+
+             # Second row: user input and model selection.
+             with gr.Row():
+                 input_txt = gr.Textbox(
+                     lines=4,
+                     scale=8,
+                     placeholder="Hi there! Have a question? Ask away! Or, upload your PDFs to find the answers within them.",
+                     container=False,
+                 )
+                 model_choice = gr.Dropdown(
+                     label="Choose model",
+                     choices=["gpt-3.5-turbo", "llama3-70b-8192", "mixtral-8x7b-32768"],
+                     value="llama3-70b-8192")
+
+             # Third row: controls for submitting, references, uploads, temperature, RAG mode, and clearing.
+             with gr.Row() as row_two:
+                 text_submit_btn = gr.Button(value="Ask FinGPT 🤗")
+                 sidebar_state = gr.State(False)
+                 btn_toggle_sidebar = gr.Button(value="References")
+                 btn_toggle_sidebar.click(UISettings.toggle_sidebar,
+                                          [sidebar_state],
+                                          [reference_bar, sidebar_state])
+                 upload_btn = gr.UploadButton(
+                     "Upload your pdf/doc file 📄",
+                     file_types=['.pdf', '.doc'],
+                     file_count="multiple")
+                 temperature_bar = gr.Slider(minimum=0, maximum=1, value=0, step=0.1,
+                                             label="Temperature", info="0: Coherent mode, 1: Creative mode")
+                 rag_with_dropdown = gr.Dropdown(
+                     label="RAG with",
+                     choices=["Existing database", "Upload new data"],
+                     value="Existing database")
+                 clear_button = gr.ClearButton([input_txt, chatbot])
+
+             # Backend process: wire file uploads and text submission to the chatbot.
+             file_msg = upload_btn.upload(fn=UploadFile.process_uploaded_files,
+                                          inputs=[upload_btn, chatbot, rag_with_dropdown, model_choice],
+                                          outputs=[input_txt, chatbot],
+                                          queue=False)
+
+             txt_msg = input_txt.submit(fn=ChatBot.respond,
+                                        inputs=[chatbot, input_txt, rag_with_dropdown, temperature_bar, model_choice],
+                                        outputs=[input_txt, chatbot, ref_output],
+                                        queue=False).then(lambda: gr.Textbox(interactive=True),
+                                                          None, [input_txt], queue=False)
+
+             txt_msg = text_submit_btn.click(fn=ChatBot.respond,
+                                             inputs=[chatbot, input_txt, rag_with_dropdown, temperature_bar, model_choice],
+                                             outputs=[input_txt, chatbot, ref_output],
+                                             queue=False).then(lambda: gr.Textbox(interactive=True),
+                                                               None, [input_txt], queue=False)
+
+
+ if __name__ == "__main__":
+     demo.launch(share=True, server_name="0.0.0.0", server_port=7860)
configs/app_config.yml ADDED
@@ -0,0 +1,54 @@
+ directories:
+   data_directory: data/docs
+   data_directory_2: data/docs_2
+   persist_directory: data/vectordb/processed/chroma/
+   custom_persist_directory: data/vectordb/uploaded/chroma/
+
+ embedding_model_config:
+   engine: "text-embedding-ada-002"
+   # engine: "Alibaba-NLP/gte-base-en-v1.5"
+
+ llm_config:
+   llm_system_role: "You are The Best Financial Research Analyst. \
+     You are an expert in analyzing financial statements, forecasting financial performance, \
+     valuing the company, assessing investment opportunities and preparing research reports. \
+     You will receive a chat history, retrieved content from the vectorDB based on the user's question, and the source. \
+     Your task is to respond to the user's question using the information \
+     from the vectorDB and chat history, without relying on your own knowledge. \
+     Your output should contain only your response, and if you can't find relevant context say that you don't know. \
+     You will receive an input prompt enclosed in triple backticks:
+
+     # Chat history:\n
+     [user query, response]\n\n
+
+     # Retrieved content number:\n
+     Content\n\n
+     Source\n\n
+
+     # User question:\n
+     New question
+     "
+   gpt_model: "gpt-3.5-turbo"
+   llama3_70bmodel: "llama3-70b-8192"
+   temperature: 0.2
+   max_token: 4096
+
+ splitter_config:
+   chunk_size: 1000
+   chunk_overlap: 200
+
+ # how many relevant documents to return
+ retrieval_config:
+   k: 5
+   num_of_final_doc: 3  # for reranking
+
+ serve:
+   port: 8000
+
+ memory:
+   qa_pair_count: 2
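
For illustration only (not part of this commit), the prompt that src/finbot.py assembles at runtime for the system role above looks roughly like the sketch below; the history entry, retrieved text, source, and page number are placeholders rather than real values.

    # Hypothetical sketch of the prompt layout described by llm_system_role, mirroring ChatBot.respond
    chat_history = "Chat history:\n [('previous user question', 'previous answer')]\n\n"
    retrieved_content = (
        "# Retrieved content 1:\n"
        "...text of a retrieved chunk...\n\n"
        "Source: microsoft-2023.pdf | Page number: 12 | [View PDF](http://localhost:8000/microsoft-2023.pdf)\n\n"
    )
    question = "# User new question:\nHow did operating income change year over year?"
    prompt = f"{chat_history}{retrieved_content}{question}"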
data/docs/alphabet-2023.pdf ADDED
The diff for this file is too large to render. See raw diff
 
data/docs/microsoft-2023.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c4a3140732d95b86a1c823487787b849e1ca70117edcde8998ba0e8b702f8fd4
+ size 5859293
data/vectordb/processed/chroma/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4be261a97c4be87df854852f8adaf4195b3a07901338aebcb7687820831d0150
+ size 20860928
data/vectordb/processed/chroma/f926420f-492c-48a8-a50b-448a3a18a87a/data_level0.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7baf11e6838661c73f08811e0be0f61a8c55173dd5f51a585720c3c65cd7fda0
+ size 6284000
data/vectordb/processed/chroma/f926420f-492c-48a8-a50b-448a3a18a87a/header.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b690cabe62b8902c35d848b48b407cd535aa5117621502dd552e3abe932aa2a9
+ size 100
data/vectordb/processed/chroma/f926420f-492c-48a8-a50b-448a3a18a87a/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5761cdbcca93f7d658d69fe10a6df3d102b5f911a28dcd53e62d221c418bbb3e
+ size 55974
data/vectordb/processed/chroma/f926420f-492c-48a8-a50b-448a3a18a87a/length.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6834a5486bb565f823b15118f39ef0d79879f2f1451948091acb278ac86f7079
+ size 4000
data/vectordb/processed/chroma/f926420f-492c-48a8-a50b-448a3a18a87a/link_lists.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:94968e65bad434c804d4290e2ed54697844880fbd44952bf3dcd303259faffd8
+ size 8148
images/chatbot.png ADDED
images/user.png ADDED
requirements.txt ADDED
@@ -0,0 +1,145 @@
+ gradio
+ ragatouille
+ langchain-groq
+ langchain
+ langchain-community
+ langchain-core
+ langchain-openai
+ langchain-text-splitters
+ aiofiles
+ aiohttp
+ aiosignal
+ altair
+ annotated-types
+ anyio
+ asgiref
+ attr
+ bcrypt
+ build
+ cachetools
+ certifi
+ charset-normalizer
+ chroma-hnswlib
+ chromadb
+ click
+ coloredlogs
+ contourpy
+ cycler
+ dataclasses-json
+ Deprecated
+ distro
+ dnspython
+ email_validator
+ fastapi
+ fastapi-cli
+ ffmpy
+ filelock
+ flatbuffers
+ fonttools
+ frozenlist
+ fsspec
+ google-auth
+ googleapis-common-protos
+ gradio_client
+ grpcio
+ h11
+ httpcore
+ httptools
+ httpx
+ huggingface-hub
+ humanfriendly
+ idna
+ importlib_metadata
+ importlib_resources
+ Jinja2
+ jsonpatch
+ jsonpointer
+ jsonschema
+ jsonschema-specifications
+ kiwisolver
+ kubernetes
+ langsmith
+ markdown-it-py
+ MarkupSafe
+ marshmallow
+ matplotlib
+ mdurl
+ mmh3
+ monotonic
+ mpmath
+ multidict
+ mypy-extensions
+ numpy
+ oauthlib
+ onnxruntime
+ openai
+ opentelemetry-api
+ opentelemetry-exporter-otlp-proto-common
+ opentelemetry-exporter-otlp-proto-grpc
+ opentelemetry-instrumentation
+ opentelemetry-instrumentation-asgi
+ opentelemetry-instrumentation-fastapi
+ opentelemetry-proto
+ opentelemetry-sdk
+ opentelemetry-semantic-conventions
+ opentelemetry-util-http
+ orjson
+ overrides
+ packaging
+ pandas
+ pillow
+ posthog
+ protobuf
+ pyasn1
+ pyasn1_modules
+ pydantic
+ pydantic_core
+ pydub
+ Pygments
+ pyparsing
+ pypdf
+ PyPika
+ pyproject_hooks
+ pyprojroot
+ python-dateutil
+ python-dotenv
+ python-multipart
+ pytz
+ PyYAML
+ referencing
+ regex
+ requests
+ requests-oauthlib
+ rich
+ rpds-py
+ rsa
+ ruff
+ semantic-version
+ setuptools
+ shellingham
+ six
+ sniffio
+ SQLAlchemy
+ starlette
+ sympy
+ tenacity
+ tiktoken
+ tokenizers
+ tomlkit
+ toolz
+ tqdm
+ typer
+ typing-inspect
+ typing_extensions
+ tzdata
+ ujson
+ urllib3
+ uvicorn
+ uvloop
+ watchfiles
+ websocket-client
+ websockets
+ wheel
+ wrapt
+ yarl
+ zipp
src/__init__.py ADDED
File without changes
src/finbot.py ADDED
@@ -0,0 +1,154 @@
+ import os
+ import re
+ import ast
+ import html
+ import time
+ import gradio as gr
+ from openai import OpenAI
+ from typing import List, Tuple
+ from src.load_config import LoadConfig
+ from langchain_core.messages import HumanMessage, SystemMessage
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_groq import ChatGroq
+ from langchain.vectorstores import Chroma
+ from uuid import uuid4
+
+ APP_CONFIG = LoadConfig()
+
+ # LangSmith tracing configuration for RAG evaluation
+ unique_id = uuid4().hex[0:8]
+ os.environ["LANGCHAIN_TRACING_V2"] = "true"
+ os.environ["LANGCHAIN_PROJECT"] = "Ragas_RAG_Eval"
+ os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
+
+
+ class ChatBot:
+     """
+     Class representing a chatbot with document retrieval and response generation capabilities.
+
+     This class provides static methods for responding to user queries, handling feedback, and
+     cleaning references from retrieved documents.
+     """
+     vectordb = None
+
+     @staticmethod
+     def respond(chatbot: List, message: str, data_type: str = "Existing database", temperature: float = 0.0,
+                 model_choice: str = APP_CONFIG.llama3_70bmodel) -> Tuple:
+         """
+         Generate a response to a user query using document retrieval and language model completion.
+
+         Parameters:
+             chatbot (List): List representing the chatbot's conversation history.
+             message (str): The user's query.
+             data_type (str): Type of data used for document retrieval ("Existing database" or "Upload new data").
+             temperature (float): Temperature parameter for language model completion.
+             model_choice (str): Name of the language model selected in the UI.
+
+         Returns:
+             Tuple: A tuple containing an empty string, the updated chat history, and references from retrieved documents.
+         """
+         # Check if the vector database needs to be created
+         if ChatBot.vectordb is None:
+             if data_type == "Existing database":
+                 if os.path.exists(APP_CONFIG.persist_directory):
+                     ChatBot.vectordb = Chroma(persist_directory=APP_CONFIG.persist_directory,
+                                               embedding_function=APP_CONFIG.embedding_model)
+                 else:
+                     chatbot.append(
+                         (message, "VectorDB does not exist. Please first execute the 'upload_data_manually.py' module. For further information please visit the README.md of this repository."))
+                     return "", chatbot, None
+
+             elif data_type == "Upload new data":
+                 if os.path.exists(APP_CONFIG.custom_persist_directory):
+                     ChatBot.vectordb = Chroma(persist_directory=APP_CONFIG.custom_persist_directory,
+                                               embedding_function=APP_CONFIG.embedding_model)
+                 else:
+                     chatbot.append(
+                         (message, "No file uploaded. Please first upload your files using the 'upload' button."))
+                     return "", chatbot, None
+
+         # Single step: embed the user query, search the vectordb, and get the retrieved docs
+         docs = ChatBot.vectordb.similarity_search(message, k=APP_CONFIG.k)
+
+         question = "# User new question:\n" + message
+         retrieved_content = ChatBot.clean_references(docs)
+
+         # Memory: previous Q&A pairs
+         chat_history = f"Chat history:\n {str(chatbot[-APP_CONFIG.qa_pair_count:])}\n\n"
+         prompt = f"{chat_history}{retrieved_content}{question}"
+         print("========================")
+         print(prompt)
+
+         if model_choice == "gpt-3.5-turbo":
+             client = OpenAI()
+             response = client.chat.completions.create(model=model_choice,
+                                                       messages=[
+                                                           {"role": "system", "content": APP_CONFIG.llm_system_role},
+                                                           {"role": "user", "content": prompt}
+                                                       ],
+                                                       temperature=temperature)
+             print(f"Running {model_choice}...", response)
+             chatbot.append((message, response.choices[0].message.content))
+
+         else:
+             chat_llm = ChatGroq(
+                 api_key=os.getenv("GROQ_API_KEY"),
+                 model=model_choice,
+                 temperature=temperature
+             )
+             # Prompt template
+             prompt_template = ChatPromptTemplate.from_messages(
+                 [
+                     ("system", APP_CONFIG.llm_system_role),
+                     ("human", prompt)  # directly using the assembled prompt
+                 ]
+             )
+             chain = prompt_template | chat_llm | StrOutputParser()
+             response = chain.invoke({})
+             print(f"Running {model_choice} via Groq...", response)
+             chatbot.append((message, response))
+
+         time.sleep(2)
+         return "", chatbot, retrieved_content
+
+     @staticmethod
+     def extract_content(input_text):
+         """Split a stringified Document into its page_content and metadata parts."""
+         begin_pattern = r"""page_content='"""
+         end_pattern = r"""'\s*metadata="""
+
+         between_pattern = rf'{begin_pattern}(.*?){end_pattern}'
+         from_end_pattern = rf"{end_pattern}(.*)"
+
+         between_match = re.search(between_pattern, input_text, re.DOTALL)
+         from_end_match = re.search(from_end_pattern, input_text, re.DOTALL)
+
+         between_text = between_match.group(1) if between_match else None
+         from_end_text = from_end_match.group(1) if from_end_match else None
+
+         return between_text, from_end_text
+
+     @staticmethod
+     def clean_references(documents: List) -> str:
+         """Convert retrieved documents into markdown with content, source, page number, and a PDF link."""
+         server_url = "http://localhost:8000"
+         documents = [str(x) + "\n\n" for x in documents]
+         markdown_documents = ""
+         counter = 1
+         for doc in documents:
+             content, metadata = re.match(r"page_content=(.*?)( metadata=\{.*\})", doc).groups()
+             metadata = metadata.split('=', 1)[1]
+             metadata_dict = ast.literal_eval(metadata)
+             content = bytes(content, "utf-8").decode("unicode_escape")
+             content = re.sub(r'\\n', '\n', content)
+             content = re.sub(r'\s*<EOS>\s*<pad>\s*', ' ', content)
+             content = re.sub(r'\s+', ' ', content).strip()
+             content = html.unescape(content)
+             content = content.encode('latin1').decode('utf-8', 'ignore')
+
+             pdf_url = f"{server_url}/{os.path.basename(metadata_dict['source'])}"
+             markdown_documents += f"# Retrieved content {counter}:\n" + content + "\n\n" + \
+                 f"Source: {os.path.basename(metadata_dict['source'])}" + " | " + \
+                 f"Page number: {str(metadata_dict['page'])}" + " | " + \
+                 f"[View PDF]({pdf_url})" + "\n\n"
+             counter += 1
+
+         return markdown_documents
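
Illustrative sketch (not part of this commit): clean_references assumes that str(document) for a retrieved LangChain Document renders roughly as "page_content='...' metadata={...}". The snippet below uses a made-up document string to show how the regex and ast.literal_eval recover the text and the metadata fields used to build the source line.

    import ast
    import re

    # Hypothetical stringified Document; the text, source, and page are placeholders
    doc_str = "page_content='Revenue grew year over year.' metadata={'source': 'data/docs/microsoft-2023.pdf', 'page': 31}"
    content, metadata = re.match(r"page_content=(.*?)( metadata=\{.*\})", doc_str).groups()
    metadata_dict = ast.literal_eval(metadata.split('=', 1)[1])
    print(content)                   # 'Revenue grew year over year.' (still wrapped in quotes)
    print(metadata_dict["source"])   # data/docs/microsoft-2023.pdf
    print(metadata_dict["page"])     # 31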
src/load_config.py ADDED
@@ -0,0 +1,121 @@
+ import openai
+ import os
+ from dotenv import load_dotenv
+ import yaml
+ from langchain.embeddings.openai import OpenAIEmbeddings
+ from pyprojroot import here  # resolves paths from the project root without changing the working directory
+ import shutil
+
+ load_dotenv()
+
+
+ class LoadConfig:
+     """
+     A class for loading configuration settings and managing directories.
+
+     This class loads various configuration settings from the 'app_config.yml' file,
+     including LLM configurations, retrieval configurations, and memory configurations.
+     It also performs directory-related operations such as creating and removing directories.
+
+     Attributes:
+         gpt_model : str
+             The OpenAI chat model name specified in the configuration.
+         llama3_70bmodel : str
+             The Groq-hosted Llama 3 model name specified in the configuration.
+         llm_system_role : str
+             The role of the language model system specified in the configuration.
+         persist_directory : str
+             The path to the persist directory where data is stored.
+         custom_persist_directory : str
+             The path to the custom persist directory.
+         embedding_model : OpenAIEmbeddings
+             An instance of the OpenAIEmbeddings class for language model embeddings.
+         data_directory : str
+             The path to the data directory.
+         k : int
+             The value of 'k' specified in the retrieval configuration.
+         embedding_model_engine : str
+             The engine specified in the embedding model configuration.
+         chunk_size : int
+             The chunk size specified in the splitter configuration.
+         chunk_overlap : int
+             The chunk overlap specified in the splitter configuration.
+         temperature : float
+             The temperature specified in the LLM configuration.
+         qa_pair_count : int
+             The number of question-answer pairs specified in the memory configuration.
+
+     Methods:
+         create_directory(directory_path):
+             Create a directory if it does not exist.
+         remove_directory(directory_path):
+             Remove the specified directory.
+     """
+
+     def __init__(self) -> None:
+         with open(here("configs/app_config.yml")) as cfg:
+             app_config = yaml.load(cfg, Loader=yaml.FullLoader)
+
+         # LLM configs
+         self.gpt_model = app_config["llm_config"]["gpt_model"]
+         self.llama3_70bmodel = app_config["llm_config"]["llama3_70bmodel"]
+         self.llm_system_role = app_config["llm_config"]["llm_system_role"]
+         # Converted to strings because the chromadb backend appends paths such as "/chroma.sqlite3"
+         self.persist_directory = str(here(app_config["directories"]["persist_directory"]))
+         self.custom_persist_directory = str(here(app_config["directories"]["custom_persist_directory"]))
+         self.embedding_model = OpenAIEmbeddings()
+
+         # Retrieval configs
+         self.data_directory = app_config["directories"]["data_directory"]
+         self.k = app_config["retrieval_config"]["k"]
+         self.num_of_final_doc = app_config["retrieval_config"]["num_of_final_doc"]
+         self.embedding_model_engine = app_config["embedding_model_config"]["engine"]
+         self.chunk_size = app_config["splitter_config"]["chunk_size"]
+         self.chunk_overlap = app_config["splitter_config"]["chunk_overlap"]
+
+         self.temperature = app_config["llm_config"]["temperature"]
+
+         # Memory
+         self.qa_pair_count = app_config["memory"]["qa_pair_count"]
+
+         # Load OpenAI credentials
+         # self.load_openai_cfg()
+
+         # Ensure the main vectordb directory exists and clean up the uploaded-docs vectordb if it exists
+         self.create_directory(self.persist_directory)
+         self.remove_directory(self.custom_persist_directory)
+
+     def create_directory(self, directory_path: str):
+         """
+         Create a directory if it does not exist.
+
+         Parameters:
+             directory_path (str): The path of the directory to be created.
+         """
+         if not os.path.exists(directory_path):
+             os.makedirs(directory_path)
+
+     def remove_directory(self, directory_path: str):
+         """
+         Remove the specified directory.
+
+         Parameters:
+             directory_path (str): The path of the directory to be removed.
+
+         Raises:
+             OSError: If an error occurs during the directory removal process.
+
+         Returns:
+             None
+         """
+         if os.path.exists(directory_path):
+             try:
+                 shutil.rmtree(directory_path)
+                 print(
+                     f"The directory '{directory_path}' has been successfully removed.")
+             except OSError as e:
+                 print(f"Error: {e}")
+         else:
+             print(f"The directory '{directory_path}' does not exist.")
src/prepare_bgesmall_vectordb.py ADDED
@@ -0,0 +1,123 @@
+ import os
+ from typing import List
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain_community.vectorstores import Chroma
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain.embeddings import HuggingFaceBgeEmbeddings
+
+
+ class PrepareVectorDB:
+     """
+     A class for preparing and saving a VectorDB using HuggingFace BGE embeddings.
+
+     Covers loading documents, chunking them, and creating a VectorDB with BGE
+     embeddings. Contains methods to prepare and save the vectordb.
+
+     Parameters:
+         data_directory (str): Directory or list of directories containing the documents.
+         persist_directory (str): Directory to save the VectorDB.
+         embedding_model_engine (str): The name of the embedding engine.
+         chunk_size (int): The size of the chunks for document processing.
+         chunk_overlap (int): The overlap between chunks.
+     """
+
+     def __init__(
+             self,
+             data_directory: str,
+             persist_directory: str,
+             embedding_model_engine: str,
+             chunk_size: int,
+             chunk_overlap: int) -> None:
+         """
+         Initialize the PrepareVectorDB instance.
+
+         Parameters:
+             data_directory (str): Directory or list of directories containing the documents.
+             persist_directory (str): Directory to save the VectorDB.
+             embedding_model_engine (str): The name of the embedding engine.
+             chunk_size (int): The size of the chunks for document processing.
+             chunk_overlap (int): The overlap between chunks.
+         """
+         self.embedding_model_engine = embedding_model_engine
+         # Other splitter choices: MarkdownHeaderTextSplitter, TokenTextSplitter, etc.
+         self.text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=chunk_size,
+             chunk_overlap=chunk_overlap,
+             separators=[
+                 "\n#{1,6} ",
+                 "```\n",
+                 "\n\\*\\*\\*+\n",
+                 "\n---+\n",
+                 "\n___+\n",
+                 "\n\n",
+                 "\n",
+                 " ",
+                 "",
+             ]
+         )
+         self.data_directory = data_directory
+         self.persist_directory = persist_directory
+         self.embedding = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-small-en-v1.5",
+                                                   model_kwargs={'device': 'cpu'},
+                                                   encode_kwargs={'normalize_embeddings': True})
+
+     def __load_all_documents(self) -> List:
+         """
+         Load all documents from the specified directory or directories and
+         handle the documents obtained live during chat.
+
+         Returns:
+             List: A list of loaded documents (one entry per page).
+         """
+         if isinstance(self.data_directory, list):
+             print("Loading the uploaded documents...")
+             docs = [doc for doc_dir in self.data_directory
+                     for doc in PyPDFLoader(doc_dir).load()]
+             doc_counter = len(self.data_directory)
+         else:
+             print("Loading documents manually...")
+             document_list = os.listdir(self.data_directory)
+             docs = [doc for doc_name in document_list
+                     for doc in PyPDFLoader(os.path.join(
+                         self.data_directory, doc_name)).load()]
+             doc_counter = len(document_list)
+         print(f"Number of loaded documents: {doc_counter}")
+         print(f"Number of pages: {len(docs)}\n\n")
+
+         return docs
+
+     def __chunk_documents(self, docs: List) -> List:
+         """
+         Chunk the loaded documents using the specified text splitter.
+
+         Parameters:
+             docs (List): The list of loaded documents.
+
+         Returns:
+             List: A list of chunked documents.
+         """
+         print("Chunking documents...")
+         chunked_documents = self.text_splitter.split_documents(docs)
+         print("Number of chunks:", len(chunked_documents), "\n\n")
+         return chunked_documents
+
+     def prepare_and_save_vectordb(self):
+         """
+         Load, chunk, and create a VectorDB with BGE embeddings, and save it.
+
+         Returns:
+             Chroma: The created VectorDB.
+         """
+         docs = self.__load_all_documents()
+         chunked_documents = self.__chunk_documents(docs)
+         print("Preparing vectordb...")
+         vectordb = Chroma.from_documents(
+             documents=chunked_documents,
+             embedding=self.embedding,
+             persist_directory=self.persist_directory
+         )
+         print("Vectordb created and saved!")
+         print("Number of vectors in vectordb:", vectordb._collection.count(), "\n\n")
+         return vectordb
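
Hypothetical usage sketch (not part of this commit): building the local Chroma store with the BGE-small variant above, using the paths and chunking values from configs/app_config.yml. src/upload_data_manually.py performs the same steps with the OpenAI-embeddings variant.

    from prepare_bgesmall_vectordb import PrepareVectorDB

    vectordb = PrepareVectorDB(
        data_directory="data/docs",
        persist_directory="data/vectordb/processed/chroma/",
        embedding_model_engine="BAAI/bge-small-en-v1.5",  # stored for reference; the class hard-codes BGE-small
        chunk_size=1000,
        chunk_overlap=200,
    ).prepare_and_save_vectordb()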
src/prepare_openAIEmbeddings_vectordb.py ADDED
@@ -0,0 +1,120 @@
+ import os
+ from typing import List
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain_community.vectorstores import Chroma
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain_openai import OpenAIEmbeddings
+
+
+ class PrepareVectorDB:
+     """
+     A class for preparing and saving a VectorDB using OpenAI embeddings.
+
+     Covers loading documents, chunking them, and creating a VectorDB with OpenAI
+     embeddings. Contains methods to prepare and save the vectordb.
+
+     Parameters:
+         data_directory (str): Directory or list of directories containing the documents.
+         persist_directory (str): Directory to save the VectorDB.
+         embedding_model_engine (str): The engine for OpenAI embeddings.
+         chunk_size (int): The size of the chunks for document processing.
+         chunk_overlap (int): The overlap between chunks.
+     """
+
+     def __init__(
+             self,
+             data_directory: str,
+             persist_directory: str,
+             embedding_model_engine: str,
+             chunk_size: int,
+             chunk_overlap: int) -> None:
+         """
+         Initialize the PrepareVectorDB instance.
+
+         Parameters:
+             data_directory (str): Directory or list of directories containing the documents.
+             persist_directory (str): Directory to save the VectorDB.
+             embedding_model_engine (str): The engine for OpenAI embeddings.
+             chunk_size (int): The size of the chunks for document processing.
+             chunk_overlap (int): The overlap between chunks.
+         """
+         self.embedding_model_engine = embedding_model_engine
+         # Other splitter choices: MarkdownHeaderTextSplitter, TokenTextSplitter, etc.
+         self.text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=chunk_size,
+             chunk_overlap=chunk_overlap,
+             separators=[
+                 "\n#{1,6} ",
+                 "```\n",
+                 "\n\\*\\*\\*+\n",
+                 "\n---+\n",
+                 "\n___+\n",
+                 "\n\n",
+                 "\n",
+                 " ",
+                 "",
+             ]
+         )
+         self.data_directory = data_directory
+         self.persist_directory = persist_directory
+         self.embedding = OpenAIEmbeddings()
+
+     def __load_all_documents(self) -> List:
+         """
+         Load all documents from the specified directory or directories and
+         handle the documents obtained live during chat.
+
+         Returns:
+             List: A list of loaded documents (one entry per page).
+         """
+         if isinstance(self.data_directory, list):
+             print("Loading the uploaded documents...")
+             docs = [doc for doc_dir in self.data_directory
+                     for doc in PyPDFLoader(doc_dir).load()]
+             doc_counter = len(self.data_directory)
+         else:
+             print("Loading documents manually...")
+             document_list = os.listdir(self.data_directory)
+             docs = [doc for doc_name in document_list
+                     for doc in PyPDFLoader(os.path.join(
+                         self.data_directory, doc_name)).load()]
+             doc_counter = len(document_list)
+         print(f"Number of loaded documents: {doc_counter}")
+         print(f"Number of pages: {len(docs)}\n\n")
+
+         return docs
+
+     def __chunk_documents(self, docs: List) -> List:
+         """
+         Chunk the loaded documents using the specified text splitter.
+
+         Parameters:
+             docs (List): The list of loaded documents.
+
+         Returns:
+             List: A list of chunked documents.
+         """
+         print("Chunking documents...")
+         chunked_documents = self.text_splitter.split_documents(docs)
+         print("Number of chunks:", len(chunked_documents), "\n\n")
+         return chunked_documents
+
+     def prepare_and_save_vectordb(self):
+         """
+         Load, chunk, and create a VectorDB with OpenAI embeddings, and save it.
+
+         Returns:
+             Chroma: The created VectorDB.
+         """
+         docs = self.__load_all_documents()
+         chunked_documents = self.__chunk_documents(docs)
+         print("Preparing vectordb...")
+         vectordb = Chroma.from_documents(
+             documents=chunked_documents,
+             embedding=self.embedding,
+             persist_directory=self.persist_directory
+         )
+         print("Vectordb created and saved!")
+         print("Number of vectors in vectordb:", vectordb._collection.count(), "\n\n")
+         return vectordb
src/ui_settings.py ADDED
@@ -0,0 +1,35 @@
+ import gradio as gr
+
+
+ class UISettings:
+     """
+     Utility class for managing UI settings.
+
+     This class provides static methods for toggling UI components, such as the
+     reference sidebar, and for handling user feedback.
+     """
+     @staticmethod
+     def toggle_sidebar(state):
+         """
+         Toggle the visibility state of a UI component.
+
+         Parameters:
+             state: The current state of the UI component.
+
+         Returns:
+             Tuple: A tuple containing the updated UI component state and the new state.
+         """
+         state = not state
+         return gr.update(visible=state), state
+
+     @staticmethod
+     def feedback(data: gr.LikeData):
+         """
+         Process user feedback on the generated response.
+
+         Parameters:
+             data (gr.LikeData): Gradio LikeData object containing user feedback.
+         """
+         if data.liked:
+             print("You upvoted this response: " + data.value)
+         else:
+             print("You downvoted this response: " + data.value)
src/upload_data_manually.py ADDED
@@ -0,0 +1,41 @@
+ import os
+
+ # Swap these imports to use BGE embeddings instead of OpenAI embeddings
+ # from prepare_bgesmall_vectordb import PrepareVectorDB
+ from prepare_openAIEmbeddings_vectordb import PrepareVectorDB
+
+ from load_config import LoadConfig
+
+ CONFIG = LoadConfig()
+
+
+ def upload_data_manually() -> None:
+     """
+     Uploads data manually to the VectorDB.
+
+     This function initializes a PrepareVectorDB instance with configuration parameters
+     such as data_directory, persist_directory, embedding_model_engine, chunk_size,
+     and chunk_overlap. It then checks if the VectorDB already exists in the specified
+     persist_directory. If not, it calls the prepare_and_save_vectordb method to
+     create and save the VectorDB. If the VectorDB already exists, a message is printed
+     indicating its presence.
+
+     Returns:
+         None
+     """
+     prepare_vectordb_instance = PrepareVectorDB(
+         data_directory=CONFIG.data_directory,
+         persist_directory=CONFIG.persist_directory,
+         embedding_model_engine=CONFIG.embedding_model_engine,
+         chunk_size=CONFIG.chunk_size,
+         chunk_overlap=CONFIG.chunk_overlap,
+     )
+     if len(os.listdir(CONFIG.persist_directory)) == 0:
+         prepare_vectordb_instance.prepare_and_save_vectordb()
+     else:
+         print(f"VectorDB already exists in {CONFIG.persist_directory}")
+     return None
+
+
+ if __name__ == "__main__":
+     upload_data_manually()
src/upload_file.py ADDED
@@ -0,0 +1,40 @@
+ from src.prepare_openAIEmbeddings_vectordb import PrepareVectorDB
+ from typing import List, Tuple
+ from src.load_config import LoadConfig
+
+ APP_CONFIG = LoadConfig()
+
+
+ class UploadFile:
+     """
+     Utility class for handling file uploads and processing.
+
+     This class provides static methods for checking directories and processing uploaded files
+     to prepare a VectorDB.
+     """
+
+     @staticmethod
+     def process_uploaded_files(files_dir: List, chatbot: List, rag_with_dropdown: str, model_choice: str = None) -> Tuple:
+         """
+         Prepares and saves a VectorDB from uploaded files.
+
+         Parameters:
+             files_dir (List): List of paths to the uploaded files.
+             chatbot (List): The chatbot's conversation history.
+             rag_with_dropdown (str): The selected RAG mode ("Existing database" or "Upload new data").
+             model_choice (str): The selected model; passed through by the Gradio wiring in app.py but unused here.
+
+         Returns:
+             Tuple: A tuple containing an empty string and the updated chatbot history.
+         """
+         if rag_with_dropdown == "Upload new data":
+             prepare_vectordb_instance = PrepareVectorDB(data_directory=files_dir,
+                                                         persist_directory=APP_CONFIG.custom_persist_directory,
+                                                         embedding_model_engine=APP_CONFIG.embedding_model_engine,
+                                                         chunk_size=APP_CONFIG.chunk_size,
+                                                         chunk_overlap=APP_CONFIG.chunk_overlap)
+             prepare_vectordb_instance.prepare_and_save_vectordb()
+             chatbot.append(
+                 (" ", "Uploaded files are ready for querying."))
+         else:
+             chatbot.append(
+                 (" ", "If you want to upload your own PDF, please select 'Upload new data' from the 'RAG with' dropdown."))
+         return "", chatbot