Spaces:
Sleeping
Sleeping
Upload 22 files
Browse files- .gitattributes +2 -0
- Dockerfile +17 -0
- app.py +109 -0
- configs/app_config.yml +54 -0
- data/docs/alphabet-2023.pdf +0 -0
- data/docs/microsoft-2023.pdf +3 -0
- data/vectordb/processed/chroma/chroma.sqlite3 +3 -0
- data/vectordb/processed/chroma/f926420f-492c-48a8-a50b-448a3a18a87a/data_level0.bin +3 -0
- data/vectordb/processed/chroma/f926420f-492c-48a8-a50b-448a3a18a87a/header.bin +3 -0
- data/vectordb/processed/chroma/f926420f-492c-48a8-a50b-448a3a18a87a/index_metadata.pickle +3 -0
- data/vectordb/processed/chroma/f926420f-492c-48a8-a50b-448a3a18a87a/length.bin +3 -0
- data/vectordb/processed/chroma/f926420f-492c-48a8-a50b-448a3a18a87a/link_lists.bin +3 -0
- images/chatbot.png +0 -0
- images/user.png +0 -0
- requirements.txt +145 -0
- src/__init__.py +0 -0
- src/finbot.py +154 -0
- src/load_config.py +121 -0
- src/prepare_bgesmall_vectordb.py +123 -0
- src/prepare_openAIEmbeddings_vectordb.py +120 -0
- src/ui_settings.py +35 -0
- src/upload_data_manually.py +41 -0
- src/upload_file.py +40 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
data/docs/microsoft-2023.pdf filter=lfs diff=lfs merge=lfs -text
|
37 |
+
data/vectordb/processed/chroma/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Use an official Python runtime as a parent image
FROM python:3.9-slim

# Set the working directory
WORKDIR /frontend

# Copy only the requirements first so the pip layer is cached when
# application code changes but dependencies do not.
COPY requirements.txt .

# Install any needed packages specified in requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the current directory contents into the container at /frontend
# (fixed comment: the original said /app while copying to /frontend)
COPY . /frontend

# Make port 7860 available to the world outside this container
EXPOSE 7860

# Run the Gradio app when the container launches
CMD ["python", "app.py"]
app.py
ADDED
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
Gradio UI for the FinGPT chatbot.

Layout (inside one "FinGPT" tab):
1. First row  - a Chatbot component plus a reference sidebar that starts hidden
   and can be toggled with the "References" button. The chatbot supports
   like/dislike feedback icons.
2. Second row - a Textbox for the user's question and a model-choice dropdown.
3. Third row  - a submit button, the sidebar toggle, a PDF/doc upload button,
   a temperature slider, a RAG-source dropdown, and a clear button.

Interactions:
- Uploaded files are processed by UploadFile.process_uploaded_files, which
  updates the input box and the chat history.
- Submitting text (enter key or button) calls ChatBot.respond with the chosen
  RAG source, temperature, and model; the response updates the Textbox, the
  Chatbot, and the reference sidebar.

Run as a standalone script to launch the Gradio interface.
"""
import gradio as gr
from src.upload_file import UploadFile
from src.finbot import ChatBot
from src.ui_settings import UISettings


with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("FinGPT"):
            # First ROW: chat display plus the (initially hidden) references sidebar.
            with gr.Row() as row_one:
                with gr.Column(visible=False) as reference_bar:
                    ref_output = gr.Markdown()

                with gr.Column() as chatbot_output:
                    chatbot = gr.Chatbot(
                        [],
                        elem_id="chatbot",
                        bubble_full_width=False,
                        height=500,
                        avatar_images=(
                            ("images/user.png"), "images/chatbot.png"),
                    )
                    # Collect like/dislike feedback on individual messages.
                    chatbot.like(UISettings.feedback, None, None)
            # SECOND ROW: user input and model selection.
            with gr.Row():
                input_txt = gr.Textbox(
                    lines=4,
                    scale=8,
                    placeholder="Hi there! Have a question? Ask away! Or, upload your PDFs to find the answers within them.",
                    container=False,
                )
                model_choice = gr.Dropdown(
                    label="Choose model", choices=["gpt-3.5-turbo", "llama3-70b-8192", "mixtral-8x7b-32768"], value="llama3-70b-8192")

            # Third ROW: action buttons and generation settings.
            with gr.Row() as row_two:
                text_submit_btn = gr.Button(value="Ask FinGPT 🤗")
                sidebar_state = gr.State(False)
                btn_toggle_sidebar = gr.Button(value="References")
                btn_toggle_sidebar.click(UISettings.toggle_sidebar,
                                         [sidebar_state],
                                         [reference_bar, sidebar_state])
                upload_btn = gr.UploadButton(
                    # fixed label typo: "Upload you" -> "Upload your"
                    "Upload your pdf/doc file 📄",
                    file_types=[
                        '.pdf',
                        '.doc'
                    ],
                    file_count="multiple")
                temperature_bar = gr.Slider(minimum=0, maximum=1, value=0, step=0.1,
                                            label="Temperature", info="0: Coherent mode, 1: Creative mode")
                rag_with_dropdown = gr.Dropdown(
                    label="RAG with", choices=["Existing database", "Upload new data"], value="Existing database")
                clear_button = gr.ClearButton([input_txt, chatbot])

            # Backend wiring: uploads go through UploadFile; questions go
            # through ChatBot.respond both on Enter and on button click.
            file_msg = upload_btn.upload(fn=UploadFile.process_uploaded_files,
                                         inputs=[upload_btn, chatbot,
                                                 rag_with_dropdown, model_choice],
                                         outputs=[input_txt, chatbot],
                                         queue=False)

            txt_msg = input_txt.submit(fn=ChatBot.respond,
                                       inputs=[chatbot,
                                               input_txt,
                                               rag_with_dropdown,
                                               temperature_bar,
                                               model_choice],
                                       outputs=[input_txt, chatbot, ref_output],
                                       queue=False).then(lambda: gr.Textbox(interactive=True),
                                                         None,
                                                         [input_txt], queue=False)

            # Renamed from txt_msg so the submit-event handle above is not
            # silently overwritten.
            btn_msg = text_submit_btn.click(fn=ChatBot.respond,
                                            inputs=[chatbot,
                                                    input_txt,
                                                    rag_with_dropdown,
                                                    temperature_bar,
                                                    model_choice],
                                            outputs=[input_txt, chatbot, ref_output],
                                            queue=False).then(lambda: gr.Textbox(interactive=True),
                                                              None, [input_txt], queue=False)


if __name__ == "__main__":
    demo.launch(share=True, server_name="0.0.0.0", server_port=7860)
configs/app_config.yml
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
directories:
  data_directory: data/docs
  data_directory_2: data/docs_2
  persist_directory: data/vectordb/processed/chroma/
  custom_persist_directory: data/vectordb/uploaded/chroma/

embedding_model_config:
  engine: "text-embedding-ada-002"
  #engine: "Alibaba-NLP/gte-base-en-v1.5"

llm_config:
  # System prompt for the RAG assistant. Typos fixed from the original:
  # "the The" -> "the", "recieve" -> "receive", "a input" -> "an input",
  # "backtics" -> "backticks".
  llm_system_role: "You are the Best Financial Research Analyst. \
    You are expert in analyzing financial statements, forecasting financial performance, \
    valuing the company, assessing investment opportunities and preparing research reports. \
    You will receive a chat history, retrieved content from the vectorDB based on the user's question, and the source.\
    Your task is to respond to the user's question using the information \
    from the vectorDB and Chat history, without relying on your own knowledge. \
    Your output should contain only your response, and if you can't find relevant context say that you don't know. \
    You will receive an input prompt enclosed in triple backticks:

    # Chat history:\n
    [user query, response]\n\n

    # Retrieved content number:\n
    Content\n\n
    Source\n\n

    # User question:\n
    New question
    "
  gpt_model: "gpt-3.5-turbo"
  llama3_70bmodel: "llama3-70b-8192"
  temperature: 0.2
  max_token: 4096

splitter_config:
  chunk_size: 1000
  chunk_overlap: 200

# how many relevant nodes to return
retrieval_config:
  k: 5
  num_of_final_doc: 3 # for reranking

serve:
  port: 8000

memory:
  qa_pair_count: 2
data/docs/alphabet-2023.pdf
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/docs/microsoft-2023.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c4a3140732d95b86a1c823487787b849e1ca70117edcde8998ba0e8b702f8fd4
|
3 |
+
size 5859293
|
data/vectordb/processed/chroma/chroma.sqlite3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4be261a97c4be87df854852f8adaf4195b3a07901338aebcb7687820831d0150
|
3 |
+
size 20860928
|
data/vectordb/processed/chroma/f926420f-492c-48a8-a50b-448a3a18a87a/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7baf11e6838661c73f08811e0be0f61a8c55173dd5f51a585720c3c65cd7fda0
|
3 |
+
size 6284000
|
data/vectordb/processed/chroma/f926420f-492c-48a8-a50b-448a3a18a87a/header.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b690cabe62b8902c35d848b48b407cd535aa5117621502dd552e3abe932aa2a9
|
3 |
+
size 100
|
data/vectordb/processed/chroma/f926420f-492c-48a8-a50b-448a3a18a87a/index_metadata.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5761cdbcca93f7d658d69fe10a6df3d102b5f911a28dcd53e62d221c418bbb3e
|
3 |
+
size 55974
|
data/vectordb/processed/chroma/f926420f-492c-48a8-a50b-448a3a18a87a/length.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6834a5486bb565f823b15118f39ef0d79879f2f1451948091acb278ac86f7079
|
3 |
+
size 4000
|
data/vectordb/processed/chroma/f926420f-492c-48a8-a50b-448a3a18a87a/link_lists.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:94968e65bad434c804d4290e2ed54697844880fbd44952bf3dcd303259faffd8
|
3 |
+
size 8148
|
images/chatbot.png
ADDED
![]() |
images/user.png
ADDED
![]() |
requirements.txt
ADDED
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio
|
2 |
+
ragatouille
|
3 |
+
langchain-groq
|
4 |
+
langchain
|
5 |
+
langchain-community
|
6 |
+
langchain-core
|
7 |
+
langchain-openai
|
8 |
+
langchain-text-splitters
|
9 |
+
aiofiles
|
10 |
+
aiohttp
|
11 |
+
aiosignal
|
12 |
+
altair
|
13 |
+
annotated-types
|
14 |
+
anyio
|
15 |
+
asgiref
|
16 |
+
attr
|
17 |
+
bcrypt
|
18 |
+
build
|
19 |
+
cachetools
|
20 |
+
certifi
|
21 |
+
charset-normalizer
|
22 |
+
chroma-hnswlib
|
23 |
+
chromadb
|
24 |
+
click
|
25 |
+
coloredlogs
|
26 |
+
contourpy
|
27 |
+
cycler
|
28 |
+
dataclasses-json
|
29 |
+
Deprecated
|
30 |
+
distro
|
31 |
+
dnspython
|
32 |
+
email_validator
|
33 |
+
fastapi
|
34 |
+
fastapi-cli
|
35 |
+
ffmpy
|
36 |
+
filelock
|
37 |
+
flatbuffers
|
38 |
+
fonttools
|
39 |
+
frozenlist
|
40 |
+
fsspec
|
41 |
+
google-auth
|
42 |
+
googleapis-common-protos
|
43 |
+
gradio_client
|
44 |
+
grpcio
|
45 |
+
h11
|
46 |
+
httpcore
|
47 |
+
httptools
|
48 |
+
httpx
|
49 |
+
huggingface-hub
|
50 |
+
humanfriendly
|
51 |
+
idna
|
52 |
+
importlib_metadata
|
53 |
+
importlib_resources
|
54 |
+
Jinja2
|
55 |
+
jsonpatch
|
56 |
+
jsonpointer
|
57 |
+
jsonschema
|
58 |
+
jsonschema-specifications
|
59 |
+
kiwisolver
|
60 |
+
kubernetes
|
61 |
+
langsmith
|
62 |
+
markdown-it-py
|
63 |
+
MarkupSafe
|
64 |
+
marshmallow
|
65 |
+
matplotlib
|
66 |
+
mdurl
|
67 |
+
mmh3
|
68 |
+
monotonic
|
69 |
+
mpmath
|
70 |
+
multidict
|
71 |
+
mypy-extensions
|
72 |
+
numpy
|
73 |
+
oauthlib
|
74 |
+
onnxruntime
|
75 |
+
openai
|
76 |
+
opentelemetry-api
|
77 |
+
opentelemetry-exporter-otlp-proto-common
|
78 |
+
opentelemetry-exporter-otlp-proto-grpc
|
79 |
+
opentelemetry-instrumentation
|
80 |
+
opentelemetry-instrumentation-asgi
|
81 |
+
opentelemetry-instrumentation-fastapi
|
82 |
+
opentelemetry-proto
|
83 |
+
opentelemetry-sdk
|
84 |
+
opentelemetry-semantic-conventions
|
85 |
+
opentelemetry-util-http
|
86 |
+
orjson
|
87 |
+
overrides
|
88 |
+
packaging
|
89 |
+
pandas
|
90 |
+
pillow
|
91 |
+
posthog
|
92 |
+
protobuf
|
93 |
+
pyasn1
|
94 |
+
pyasn1_modules
|
95 |
+
pydantic
|
96 |
+
pydantic_core
|
97 |
+
pydub
|
98 |
+
Pygments
|
99 |
+
pyparsing
|
100 |
+
pypdf
|
101 |
+
PyPika
|
102 |
+
pyproject_hooks
|
103 |
+
pyprojroot
|
104 |
+
python-dateutil
|
105 |
+
python-dotenv
|
106 |
+
python-multipart
|
107 |
+
pytz
|
108 |
+
PyYAML
|
109 |
+
referencing
|
110 |
+
regex
|
111 |
+
requests
|
112 |
+
requests-oauthlib
|
113 |
+
rich
|
114 |
+
rpds-py
|
115 |
+
rsa
|
116 |
+
ruff
|
117 |
+
semantic-version
|
118 |
+
setuptools
|
119 |
+
shellingham
|
120 |
+
six
|
121 |
+
sniffio
|
122 |
+
SQLAlchemy
|
123 |
+
starlette
|
124 |
+
sympy
|
125 |
+
tenacity
|
126 |
+
tiktoken
|
127 |
+
tokenizers
|
128 |
+
tomlkit
|
129 |
+
toolz
|
130 |
+
tqdm
|
131 |
+
typer
|
132 |
+
typing-inspect
|
133 |
+
typing_extensions
|
134 |
+
tzdata
|
135 |
+
ujson
|
136 |
+
urllib3
|
137 |
+
uvicorn
|
138 |
+
uvloop
|
139 |
+
watchfiles
|
140 |
+
websocket-client
|
141 |
+
websockets
|
142 |
+
wheel
|
143 |
+
wrapt
|
144 |
+
yarl
|
145 |
+
zipp
|
src/__init__.py
ADDED
File without changes
|
src/finbot.py
ADDED
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import re
import ast
import html
import time
import gradio as gr
from openai import OpenAI
from typing import List, Tuple
from src.load_config import LoadConfig
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_groq import ChatGroq
from langchain.vectorstores import Chroma

from uuid import uuid4
# (removed a duplicate `import os` -- it is already imported above)

APP_CONFIG = LoadConfig()


# URGENT NOTICE: LangSmith tracing is enabled globally at import time.
# Consider gating this behind an environment flag before deployment.
unique_id = uuid4().hex[0:8]  # NOTE(review): not used in this module -- confirm before removing
os.environ["LANGCHAIN_TRACING_V2"] = "true"
# Fixed: was an f-string with no placeholders.
os.environ["LANGCHAIN_PROJECT"] = "Ragas_RAG_Eval"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
class ChatBot:
    """
    Chatbot with document retrieval and response generation capabilities.

    Provides static methods for responding to user queries (RAG over a Chroma
    vector store), extracting content from stringified documents, and turning
    retrieved documents into a markdown references section.
    """
    # Lazily-initialized Chroma vector store shared across calls.
    vectordb = None

    @staticmethod
    def respond(chatbot: List, message: str, data_type: str = "Existing database", temperature: float = 0.0, model_choice: str = APP_CONFIG.llama3_70bmodel) -> Tuple:
        """
        Generate a response to a user query using document retrieval and LLM completion.

        Parameters:
            chatbot (List): The chatbot's conversation history (list of (user, bot) pairs).
            message (str): The user's query.
            data_type (str): Retrieval source ("Existing database" or "Upload new data").
            temperature (float): Sampling temperature for the chat model.
            model_choice (str): Which chat model to use.

        Returns:
            Tuple: ("", updated chat history, references markdown or None).
        """
        # Open the vector database lazily on first use.
        if ChatBot.vectordb is None:
            if data_type == "Existing database":
                if os.path.exists(APP_CONFIG.persist_directory):
                    ChatBot.vectordb = Chroma(persist_directory=APP_CONFIG.persist_directory,
                                              embedding_function=APP_CONFIG.embedding_model)
                else:
                    chatbot.append(
                        (message, f"VectorDB does not exist. Please first execute the 'upload_data_manually.py' module. For further information please visit README.md of this repository."))
                    return "", chatbot, None

            elif data_type == "Upload new data":
                if os.path.exists(APP_CONFIG.custom_persist_directory):
                    ChatBot.vectordb = Chroma(persist_directory=APP_CONFIG.custom_persist_directory,
                                              embedding_function=APP_CONFIG.embedding_model)
                else:
                    chatbot.append(
                        (message, f"No file uploaded. Please first upload your files using the 'upload' button."))
                    return "", chatbot, None

        # Single step: embed the user query, search the vectordb, and get the
        # retrieved documents.
        docs = ChatBot.vectordb.similarity_search(message, k=APP_CONFIG.k)

        question = "# User new question:\n" + message
        retrieved_content = ChatBot.clean_references(docs)

        # Memory: include the most recent Q-and-A pairs as conversational context.
        chat_history = f"Chat history:\n {str(chatbot[-APP_CONFIG.qa_pair_count:])}\n\n"
        prompt = f"{chat_history}{retrieved_content}{question}"
        print("========================")
        print(prompt)

        if model_choice == "gpt-3.5-turbo":
            client = OpenAI()
            response = client.chat.completions.create(model=model_choice,
                                                      messages=[
                                                          {"role": "system", "content": APP_CONFIG.llm_system_role},
                                                          {"role": "user", "content": prompt}
                                                      ],
                                                      temperature=temperature)
            print(f"Running {model_choice}...", response)
            chatbot.append((message, response.choices[0].message.content))

        else:
            chat_llm = ChatGroq(
                api_key=os.getenv("GROQ_API_KEY"),
                model=model_choice,
                # Bug fix: honor the caller-supplied temperature (UI slider)
                # instead of the static config value, matching the OpenAI branch.
                temperature=temperature
            )
            # NOTE(review): the fully formatted prompt string is fed through a
            # ChatPromptTemplate; literal braces inside retrieved content could
            # be interpreted as template variables -- confirm inputs are brace-free.
            chat_prompt = ChatPromptTemplate.from_messages(
                [
                    ("system", APP_CONFIG.llm_system_role),
                    ("human", prompt)  # directly using the formatted message
                ]
            )
            chain = chat_prompt | chat_llm | StrOutputParser()
            response = chain.invoke({})
            # Bug fix: was a plain string, so "{model_choice}" printed literally.
            print(f"Running {model_choice} via groq...", response)
            chatbot.append((message, response))

        time.sleep(2)
        return "", chatbot, retrieved_content

    @staticmethod
    def extract_content(input_text):
        """
        Split a stringified Document into its page_content and trailing metadata.

        Parameters:
            input_text (str): str(Document)-style text containing
                "page_content='...' metadata=...".

        Returns:
            tuple: (text between the page_content marker and the metadata
            marker, text after the metadata marker); each element is None when
            its pattern does not match.
        """
        begin_pattern = r"""page_content='"""
        end_pattern = r"""'\s*metadata="""

        between_pattern = rf'{begin_pattern}(.*?){end_pattern}'
        from_end_pattern = rf"{end_pattern}(.*)"

        between_match = re.search(between_pattern, input_text, re.DOTALL)
        from_end_match = re.search(from_end_pattern, input_text, re.DOTALL)

        between_text = between_match.group(1) if between_match else None
        from_end_text = from_end_match.group(1) if from_end_match else None

        return between_text, from_end_text

    @staticmethod
    def clean_references(documents: List) -> str:
        """
        Turn retrieved documents into a markdown "references" section.

        Parameters:
            documents (List): Retrieved documents (stringified with str()).

        Returns:
            str: Markdown with one "# Retrieved content N" section per
            document, including the source file name, page number, and a link
            to the PDF served at server_url.
        """
        server_url = "http://localhost:8000"
        documents = [str(x) + "\n\n" for x in documents]
        markdown_documents = ""
        counter = 1
        for doc in documents:
            # Parse the stringified Document back into content and metadata.
            content, metadata = re.match(
                r"page_content=(.*?)( metadata=\{.*\})", doc).groups()
            metadata = metadata.split('=', 1)[1]
            metadata_dict = ast.literal_eval(metadata)
            # Undo the escaping applied by str(Document) and normalize whitespace.
            content = bytes(content, "utf-8").decode("unicode_escape")
            content = re.sub(r'\\n', '\n', content)
            content = re.sub(r'\s*<EOS>\s*<pad>\s*', ' ', content)
            content = re.sub(r'\s+', ' ', content).strip()
            content = html.unescape(content)
            content = content.encode('latin1').decode('utf-8', 'ignore')

            pdf_url = f"{server_url}/{os.path.basename(metadata_dict['source'])}"
            markdown_documents += f"# Retrieved content {counter}:\n" + content + "\n\n" + \
                f"Source: {os.path.basename(metadata_dict['source'])}" + " | " + \
                f"Page number: {str(metadata_dict['page'])}" + " | " + \
                f"[View PDF]({pdf_url})" "\n\n"
            counter += 1

        return markdown_documents
src/load_config.py
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import openai
|
3 |
+
import os
|
4 |
+
from dotenv import load_dotenv
|
5 |
+
import yaml
|
6 |
+
from langchain.embeddings.openai import OpenAIEmbeddings
|
7 |
+
from pyprojroot import here # for creating top-level directories in project without changing setwd()
|
8 |
+
import shutil
|
9 |
+
|
10 |
+
load_dotenv()
|
11 |
+
|
12 |
+
|
13 |
+
class LoadConfig:
    """
    Loads configuration settings from 'configs/app_config.yml' and manages
    the vector-database directories.

    Reads LLM, retrieval, splitter, and memory configurations, then ensures
    the prebuilt vector-DB directory exists and removes any leftover
    uploaded-data vector DB from a previous session.

    Attributes:
        gpt_model (str): OpenAI chat model name.
        llama3_70bmodel (str): Groq-served Llama-3 model name.
        llm_system_role (str): System prompt for the chat model.
        persist_directory (str): Absolute path of the prebuilt vector DB.
        custom_persist_directory (str): Absolute path of the upload-session vector DB.
        embedding_model (OpenAIEmbeddings): Embedding model instance.
        data_directory (str): Directory containing the source documents.
        k (int): Number of documents to retrieve.
        num_of_final_doc (int): Number of documents kept after reranking.
        embedding_model_engine (str): Embedding engine name.
        chunk_size (int): Splitter chunk size.
        chunk_overlap (int): Splitter chunk overlap.
        temperature (float): Default LLM temperature.
        qa_pair_count (int): Number of Q-A pairs kept as chat memory.

    Methods:
        create_directory(directory_path):
            Create a directory if it does not exist.
        remove_directory(directory_path):
            Removes the specified directory.
    """

    def __init__(self) -> None:
        # safe_load: the config file is plain data, so avoid the FullLoader's
        # ability to construct arbitrary Python objects from YAML tags.
        with open(here("configs/app_config.yml")) as cfg:
            app_config = yaml.safe_load(cfg)

        # llm configs
        self.gpt_model = app_config["llm_config"]["gpt_model"]
        self.llama3_70bmodel = app_config["llm_config"]["llama3_70bmodel"]
        self.llm_system_role = app_config["llm_config"]["llm_system_role"]
        # Converted to string because the chromadb backend appends
        # "/chroma.sqlite3" to this path internally.
        self.persist_directory = str(here(app_config["directories"]["persist_directory"]))
        self.custom_persist_directory = str(here(app_config["directories"]["custom_persist_directory"]))
        self.embedding_model = OpenAIEmbeddings()

        # Retrieval configs
        self.data_directory = app_config["directories"]["data_directory"]
        self.k = app_config["retrieval_config"]["k"]
        self.num_of_final_doc = app_config["retrieval_config"]["num_of_final_doc"]
        self.embedding_model_engine = app_config["embedding_model_config"]["engine"]
        self.chunk_size = app_config["splitter_config"]["chunk_size"]
        self.chunk_overlap = app_config["splitter_config"]["chunk_overlap"]

        self.temperature = app_config["llm_config"]["temperature"]

        # Memory
        self.qa_pair_count = app_config["memory"]["qa_pair_count"]

        # Ensure the main vector-DB directory exists and clear any stale
        # uploaded-data vector DB left over from a previous session.
        self.create_directory(self.persist_directory)
        self.remove_directory(self.custom_persist_directory)

    def create_directory(self, directory_path: str):
        """
        Create a directory (including parents) if it does not exist.

        Parameters:
            directory_path (str): The path of the directory to be created.
        """
        if not os.path.exists(directory_path):
            os.makedirs(directory_path)

    def remove_directory(self, directory_path: str):
        """
        Removes the specified directory tree if it exists.

        Parameters:
            directory_path (str): The path of the directory to be removed.

        Raises:
            OSError: caught internally; an error message is printed instead.

        Returns:
            None
        """
        if os.path.exists(directory_path):
            try:
                shutil.rmtree(directory_path)
                print(
                    f"The directory '{directory_path}' has been successfully removed.")
            except OSError as e:
                print(f"Error: {e}")
        else:
            print(f"The directory '{directory_path}' does not exist.")
src/prepare_bgesmall_vectordb.py
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from typing import List
|
3 |
+
from langchain_community.document_loaders import PyPDFLoader
|
4 |
+
from langchain_community.vectorstores import Chroma
|
5 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
6 |
+
from langchain.embeddings import HuggingFaceBgeEmbeddings
|
7 |
+
|
8 |
+
|
9 |
+
|
10 |
+
class PrepareVectorDB:
|
11 |
+
"""
|
12 |
+
A class for preparing and saving a VectorDB using OpenAI embeddings.
|
13 |
+
|
14 |
+
Involves process of loading documents, chunking them, and creating a VectorDB
|
15 |
+
with OpenAI embeddings. contains methods to prepare & save the vecotordb.
|
16 |
+
|
17 |
+
Parameters:
|
18 |
+
data_directory (str): Directory or list of directories containing the documents.
|
19 |
+
persist_directory (str): Directory to save the VectorDB.
|
20 |
+
embedding_model_engine (str): The engine for OpenAI embeddings.
|
21 |
+
chunk_size (int): The size of the chunks for document processing.
|
22 |
+
chunk_overlap (int): The overlap between chunks.
|
23 |
+
"""
|
24 |
+
|
25 |
+
def __init__(
|
26 |
+
self,
|
27 |
+
data_directory: str,
|
28 |
+
persist_directory: str,
|
29 |
+
embedding_model_engine: str,
|
30 |
+
chunk_size: int,
|
31 |
+
chunk_overlap: int) -> None:
|
32 |
+
|
33 |
+
"""
|
34 |
+
Initializing the PrepareVectorDB instance.
|
35 |
+
|
36 |
+
Parameters:
|
37 |
+
data_directory (str): Directory or list of directories containing the documents.
|
38 |
+
persist_directory (str): Directory to save the VectorDB.
|
39 |
+
embedding_model_engine (str): The engine for OpenAI embeddings.
|
40 |
+
chunk_size (int): The size of the chunks for document processing.
|
41 |
+
chunk_overlap (int): The overlap between chunks.
|
42 |
+
"""
|
43 |
+
|
44 |
+
self.embedding_model_engine = embedding_model_engine
|
45 |
+
self.text_splitter = RecursiveCharacterTextSplitter(
|
46 |
+
chunk_size=chunk_size,
|
47 |
+
chunk_overlap=chunk_overlap,
|
48 |
+
separators=[
|
49 |
+
"\n#{1,6} ",
|
50 |
+
"```\n",
|
51 |
+
"\n\\*\\*\\*+\n",
|
52 |
+
"\n---+\n",
|
53 |
+
"\n___+\n",
|
54 |
+
"\n\n",
|
55 |
+
"\n",
|
56 |
+
" ",
|
57 |
+
"",
|
58 |
+
]
|
59 |
+
)
|
60 |
+
"""choices: MarkdownHeaderTextSplitter,TokenTextSplitter, etc."""
|
61 |
+
self.data_directory = data_directory
|
62 |
+
self.persist_directory = persist_directory
|
63 |
+
self.embedding = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-small-en-v1.5",
|
64 |
+
model_kwargs={'device': 'cpu'},
|
65 |
+
encode_kwargs={'normalize_embeddings': True})
|
66 |
+
|
67 |
+
def __load_all_documents(self) -> List:
|
68 |
+
"""
|
69 |
+
Load all documents from the specified directory or directories and
|
70 |
+
handles the documents obtained live during chat.
|
71 |
+
|
72 |
+
Returns:
|
73 |
+
List: A list of loaded documents.
|
74 |
+
"""
|
75 |
+
doc_counter = 0
|
76 |
+
if isinstance(self.data_directory, list):
|
77 |
+
print("Loading the uploaded documents...")
|
78 |
+
docs = [doc for doc_dir in self.data_directory
|
79 |
+
for doc in PyPDFLoader(doc_dir).load()]
|
80 |
+
else:
|
81 |
+
print("Loading documents manually...")
|
82 |
+
document_list = os.listdir(self.data_directory)
|
83 |
+
docs = [doc for doc_name in document_list
|
84 |
+
for doc in PyPDFLoader(os.path.join(
|
85 |
+
self.data_directory, doc_name)).load()]
|
86 |
+
doc_counter = len(docs)
|
87 |
+
print(f"Number of loaded documents: {doc_counter}")
|
88 |
+
print(f"Number of pages: {len(docs)}\n\n")
|
89 |
+
|
90 |
+
return docs
|
91 |
+
|
92 |
+
def __chunk_documents(self, docs: List) -> List:
|
93 |
+
"""
|
94 |
+
Chunk the loaded documents using the specified text splitter.
|
95 |
+
Parameters:
|
96 |
+
docs (List): The list of loaded documents.
|
97 |
+
Returns:
|
98 |
+
List: A list of chunked documents.
|
99 |
+
"""
|
100 |
+
print("Chunking documents...")
|
101 |
+
chunked_documents = self.text_splitter.split_documents(docs)
|
102 |
+
print("Number of chunks:", len(chunked_documents), "\n\n")
|
103 |
+
return chunked_documents
|
104 |
+
|
105 |
+
def prepare_and_save_vectordb(self):
|
106 |
+
"""
|
107 |
+
Load, chunk, and create a VectorDB with OpenAI embeddings, and save it.
|
108 |
+
|
109 |
+
Returns:
|
110 |
+
Chroma: The created VectorDB.
|
111 |
+
"""
|
112 |
+
docs = self.__load_all_documents()
|
113 |
+
chunked_documents = self.__chunk_documents(docs)
|
114 |
+
print("Preparing vectordb...")
|
115 |
+
vectordb = Chroma.from_documents(
|
116 |
+
documents=chunked_documents,
|
117 |
+
embedding=self.embedding,
|
118 |
+
persist_directory=self.persist_directory
|
119 |
+
)
|
120 |
+
print("Vectordb created and saved!")
|
121 |
+
print("Number of vectors in vectordb:", vectordb._collection.count(), "\n\n")
|
122 |
+
return vectordb
|
123 |
+
|
src/prepare_openAIEmbeddings_vectordb.py
ADDED
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from typing import List
|
3 |
+
from langchain_community.document_loaders import PyPDFLoader
|
4 |
+
from langchain_community.vectorstores import Chroma
|
5 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
6 |
+
from langchain_openai import OpenAIEmbeddings
|
7 |
+
|
8 |
+
|
9 |
+
class PrepareVectorDB:
    """
    Prepare and persist a Chroma VectorDB using OpenAI embeddings.

    Loads PDF documents, chunks them with a markdown-aware recursive
    splitter, embeds the chunks with OpenAI, and saves the resulting
    VectorDB to disk.

    Parameters:
        data_directory (str | List[str]): Directory (or list of file paths) containing the documents.
        persist_directory (str): Directory to save the VectorDB.
        embedding_model_engine (str): The engine for OpenAI embeddings.
        chunk_size (int): The size of the chunks for document processing.
        chunk_overlap (int): The overlap between chunks.
    """

    def __init__(
            self,
            data_directory: str,
            persist_directory: str,
            embedding_model_engine: str,
            chunk_size: int,
            chunk_overlap: int) -> None:
        """
        Initialize the PrepareVectorDB instance.

        Parameters:
            data_directory (str | List[str]): Directory (or list of file paths) containing the documents.
            persist_directory (str): Directory to save the VectorDB.
            embedding_model_engine (str): The engine for OpenAI embeddings.
            chunk_size (int): The size of the chunks for document processing.
            chunk_overlap (int): The overlap between chunks.
        """
        self.embedding_model_engine = embedding_model_engine
        # Markdown-aware separators: headings, code fences, horizontal rules,
        # then progressively smaller text units (paragraph, line, word, char).
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=[
                "\n#{1,6} ",
                "```\n",
                "\n\\*\\*\\*+\n",
                "\n---+\n",
                "\n___+\n",
                "\n\n",
                "\n",
                " ",
                "",
            ]
        )
        # Alternative splitters: MarkdownHeaderTextSplitter, TokenTextSplitter, etc.
        self.data_directory = data_directory
        self.persist_directory = persist_directory
        # Reads the OpenAI API key from the environment.
        self.embedding = OpenAIEmbeddings()

    def __load_all_documents(self) -> List:
        """
        Load all documents from the configured source.

        If ``data_directory`` is a list, each entry is treated as the path of
        an uploaded PDF (live chat upload); otherwise every file inside the
        directory is loaded.

        Returns:
            List: One entry per loaded PDF page.
        """
        if isinstance(self.data_directory, list):
            print("Loading the uploaded documents...")
            # Fix: count source documents, not pages, for the report below.
            doc_counter = len(self.data_directory)
            docs = [page for doc_path in self.data_directory
                    for page in PyPDFLoader(doc_path).load()]
        else:
            print("Loading documents manually...")
            document_list = os.listdir(self.data_directory)
            doc_counter = len(document_list)
            docs = [page for doc_name in document_list
                    for page in PyPDFLoader(os.path.join(
                        self.data_directory, doc_name)).load()]
        # PyPDFLoader yields one Document per page, so len(docs) is the page count.
        print(f"Number of loaded documents: {doc_counter}")
        print(f"Number of pages: {len(docs)}\n\n")

        return docs

    def __chunk_documents(self, docs: List) -> List:
        """
        Split the loaded documents into chunks with the configured splitter.

        Parameters:
            docs (List): The loaded documents (one entry per page).

        Returns:
            List: The resulting document chunks.
        """
        print("Chunking documents...")
        chunked_documents = self.text_splitter.split_documents(docs)
        print("Number of chunks:", len(chunked_documents), "\n\n")
        return chunked_documents

    def prepare_and_save_vectordb(self):
        """
        Load, chunk, and create a VectorDB with OpenAI embeddings, and save it.

        Returns:
            Chroma: The created VectorDB.
        """
        docs = self.__load_all_documents()
        chunked_documents = self.__chunk_documents(docs)
        print("Preparing vectordb...")
        vectordb = Chroma.from_documents(
            documents=chunked_documents,
            embedding=self.embedding,
            persist_directory=self.persist_directory
        )
        print("Vectordb created and saved!")
        print("Number of vectors in vectordb:", vectordb._collection.count(), "\n\n")
        return vectordb
|
120 |
+
|
src/ui_settings.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
|
3 |
+
|
4 |
+
class UISettings:
    """
    Static helpers for managing UI state in the Gradio app.

    Provides toggling of UI component visibility and logging of
    like/dislike feedback on generated responses.
    """

    @staticmethod
    def toggle_sidebar(state):
        """
        Flip the visibility of a UI component.

        Parameters:
            state: Current visibility flag of the component.

        Returns:
            Tuple: (gr.update carrying the new visibility, the new flag).
        """
        new_state = not state
        return gr.update(visible=new_state), new_state

    @staticmethod
    def feedback(data: gr.LikeData):
        """
        Log the user's feedback on a generated response.

        Parameters:
            data (gr.LikeData): Gradio LikeData object containing user feedback.
        """
        if data.liked:
            print("You upvoted this response: " + data.value)
        else:
            print("You downvoted this response: " + data.value)
|
src/upload_data_manually.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import os
|
3 |
+
|
4 |
+
# Swap the two imports below to use BGE-small embeddings instead of OpenAI embeddings.
|
5 |
+
#from prepare_bgesmall_vectordb import PrepareVectorDB
|
6 |
+
from prepare_openAIEmbeddings_vectordb import PrepareVectorDB
|
7 |
+
|
8 |
+
from load_config import LoadConfig
|
9 |
+
CONFIG = LoadConfig()
|
10 |
+
|
11 |
+
|
12 |
+
def upload_data_manually() -> None:
    """
    Build and persist the VectorDB from the configured data directory.

    Initializes a PrepareVectorDB instance with configuration parameters
    (data_directory, persist_directory, embedding_model_engine, chunk_size,
    chunk_overlap). If the persist directory is empty, the VectorDB is
    created and saved there; otherwise a message is printed indicating that
    it already exists.

    Returns:
        None
    """
    prepare_vectordb_instance = PrepareVectorDB(
        data_directory=CONFIG.data_directory,
        persist_directory=CONFIG.persist_directory,
        embedding_model_engine=CONFIG.embedding_model_engine,
        chunk_size=CONFIG.chunk_size,
        chunk_overlap=CONFIG.chunk_overlap,
    )
    # Equivalent to the original `not len(os.listdir(...)) != 0`:
    # build only when the persist directory is empty.
    if not os.listdir(CONFIG.persist_directory):
        prepare_vectordb_instance.prepare_and_save_vectordb()
    else:
        print(f"VectorDB already exists in {CONFIG.persist_directory}")
    return None


if __name__ == "__main__":
    upload_data_manually()
|
src/upload_file.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.prepare_openAIEmbeddings_vectordb import PrepareVectorDB
|
2 |
+
from typing import List, Tuple
|
3 |
+
from src.load_config import LoadConfig
|
4 |
+
|
5 |
+
APP_CONFIG = LoadConfig()
|
6 |
+
|
7 |
+
|
8 |
+
class UploadFile:
    """
    Utility class for handling file uploads and processing.

    Provides static helpers that turn user-uploaded PDFs into a VectorDB
    ready for querying.
    """

    @staticmethod
    def process_uploaded_files(files_dir: List, chatbot: List, rag_with_dropdown: str) -> Tuple:
        """
        Prepare and save a VectorDB from uploaded files.

        Parameters:
            files_dir (List): Paths of the uploaded files.
            chatbot (List): Chat history; a status message is appended to it.
            rag_with_dropdown (str): Current value of the RAG-mode dropdown;
                files are only processed when it equals
                "Upload docs to chat with:".

        Returns:
            Tuple: An empty string (clears the input box) and the updated
            chat history.
        """
        if rag_with_dropdown == "Upload docs to chat with:":
            vectordb_builder = PrepareVectorDB(
                data_directory=files_dir,
                persist_directory=APP_CONFIG.custom_persist_directory,
                embedding_model_engine=APP_CONFIG.embedding_model_engine,
                chunk_size=APP_CONFIG.chunk_size,
                chunk_overlap=APP_CONFIG.chunk_overlap)
            vectordb_builder.prepare_and_save_vectordb()
            chatbot.append(
                (" ", "Uploaded files are ready for querying."))
        else:
            chatbot.append(
                (" ", "If you want to upload your own PDF, please select 'rag_with' from the dropdown."))
        return "", chatbot
|