yinong333 committed on
Commit
cca3a35
·
1 Parent(s): e79acb8

update midterm

Browse files
Files changed (3) hide show
  1. .chainlit/config.toml +84 -0
  2. .gitignore +160 -0
  3. app.py +22 -33
.chainlit/config.toml ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ # Whether to enable telemetry (default: true). No personal data is collected.
3
+ enable_telemetry = true
4
+
5
+ # List of environment variables to be provided by each user to use the app.
6
+ user_env = []
7
+
8
+ # Duration (in seconds) during which the session is saved when the connection is lost
9
+ session_timeout = 3600
10
+
11
+ # Enable third-party caching (e.g. the LangChain cache)
12
+ cache = false
13
+
14
+ # Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317)
15
+ # follow_symlink = false
16
+
17
+ [features]
18
+ # Show the prompt playground
19
+ prompt_playground = true
20
+
21
+ # Process and display HTML in messages. This can be a security risk (see https://stackoverflow.com/questions/19603097/why-is-it-dangerous-to-render-user-generated-html-or-javascript)
22
+ unsafe_allow_html = false
23
+
24
+ # Process and display mathematical expressions. This can clash with "$" characters in messages.
25
+ latex = false
26
+
27
+ # Authorize users to upload files with messages
28
+ multi_modal = true
29
+
30
+ # Allow users to use speech-to-text
31
+ [features.speech_to_text]
32
+ enabled = false
33
+ # See all languages here https://github.com/JamesBrill/react-speech-recognition/blob/HEAD/docs/API.md#language-string
34
+ # language = "en-US"
35
+
36
+ [UI]
37
+ # Name of the app and chatbot.
38
+ name = "Chatbot"
39
+
40
+ # Show the readme while the conversation is empty.
41
+ show_readme_as_default = true
42
+
43
+ # Description of the app and chatbot. This is used for HTML tags.
44
+ # description = ""
45
+
46
+ # Large content is collapsed by default for a cleaner UI
47
+ default_collapse_content = true
48
+
49
+ # The default value for the expand messages setting.
50
+ default_expand_messages = false
51
+
52
+ # Hide the chain of thought details from the user in the UI.
53
+ hide_cot = false
54
+
55
+ # Link to your github repo. This will add a github button in the UI's header.
56
+ # github = ""
57
+
58
+ # Specify a CSS file that can be used to customize the user interface.
59
+ # The CSS file can be served from the public directory or via an external link.
60
+ # custom_css = "/public/test.css"
61
+
62
+ # Override default MUI light theme. (Check theme.ts)
63
+ [UI.theme.light]
64
+ #background = "#FAFAFA"
65
+ #paper = "#FFFFFF"
66
+
67
+ [UI.theme.light.primary]
68
+ #main = "#F80061"
69
+ #dark = "#980039"
70
+ #light = "#FFE7EB"
71
+
72
+ # Override default MUI dark theme. (Check theme.ts)
73
+ [UI.theme.dark]
74
+ #background = "#FAFAFA"
75
+ #paper = "#FFFFFF"
76
+
77
+ [UI.theme.dark.primary]
78
+ #main = "#F80061"
79
+ #dark = "#980039"
80
+ #light = "#FFE7EB"
81
+
82
+
83
+ [meta]
84
+ generated_by = "0.7.700"
.gitignore ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
app.py CHANGED
@@ -44,35 +44,27 @@ prompt = ChatPromptTemplate.from_messages(messages)
44
  chain_type_kwargs = {"prompt": prompt}
45
 
46
 
47
- def generate_vdb(chunks=None):
48
  EMBEDDING_MODEL = "text-embedding-3-small"
49
  embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)
50
- PERSIST_PATH = "./qdrant_vector_db" # Directory to store Qdrant collection
51
- COLLECTION_NAME = "legal_data"
52
  VECTOR_SIZE = 1536
53
 
54
- # Check if the vector database already exists
55
- if os.path.exists(PERSIST_PATH):
56
- print(f"Loading existing Qdrant database from {PERSIST_PATH}")
57
- qdrant_client = QdrantClient(path=PERSIST_PATH) # Load the existing DB
58
- qdrant_vector_store = QdrantVectorStore(
59
- client=qdrant_client,
60
- collection_name=COLLECTION_NAME,
61
- embedding=embeddings,
62
- )
63
- else:
64
- print(f"Creating new Qdrant database at {PERSIST_PATH}")
65
- qdrant_client = QdrantClient(path=PERSIST_PATH) # Create a new DB
66
- qdrant_client.create_collection(
67
- collection_name=COLLECTION_NAME,
68
- vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE),
69
- )
70
- qdrant_vector_store = QdrantVectorStore(
71
- client=qdrant_client,
72
- collection_name=COLLECTION_NAME,
73
- embedding=embeddings,
74
- )
75
- qdrant_vector_store.add_documents(chunks)
76
  return qdrant_vector_store
77
 
78
 
@@ -87,12 +79,11 @@ async def on_chat_start():
87
  "https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf",
88
  "https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf"]
89
 
90
- if not os.path.exists("./qdrant_vector_db"):
91
- documents = []
92
- for pdf_link in pdf_links:
93
- loader = PyMuPDFLoader(pdf_link)
94
- loaded_docs = loader.load()
95
- documents.extend(loaded_docs)
96
 
97
  CHUNK_SIZE = 1000
98
  CHUNK_OVERLAP = 200
@@ -105,8 +96,6 @@ async def on_chat_start():
105
  split_chunks = text_splitter.split_documents(documents)
106
 
107
  docsearch = generate_vdb(split_chunks)
108
- else:
109
- docsearch = generate_vdb()
110
 
111
  # Let the user know that the system is ready
112
  msg = cl.Message(
 
44
  chain_type_kwargs = {"prompt": prompt}
45
 
46
 
47
+ def generate_vdb(chunks):
48
  EMBEDDING_MODEL = "text-embedding-3-small"
49
  embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)
50
+ LOCATION = ":memory:"
51
+ COLLECTION_NAME = "legal data"
52
  VECTOR_SIZE = 1536
53
 
54
+ qdrant_client = QdrantClient(LOCATION)
55
+
56
+ qdrant_client.create_collection(
57
+ collection_name=COLLECTION_NAME,
58
+ vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE),
59
+ )
60
+
61
+ qdrant_vector_store = QdrantVectorStore(
62
+ client=qdrant_client,
63
+ collection_name=COLLECTION_NAME,
64
+ embedding=embeddings,
65
+ )
66
+
67
+ qdrant_vector_store.add_documents(chunks)
 
 
 
 
 
 
 
 
68
  return qdrant_vector_store
69
 
70
 
 
79
  "https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf",
80
  "https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf"]
81
 
82
+ documents = []
83
+ for pdf_link in pdf_links:
84
+ loader = PyMuPDFLoader(pdf_link)
85
+ loaded_docs = loader.load()
86
+ documents.extend(loaded_docs)
 
87
 
88
  CHUNK_SIZE = 1000
89
  CHUNK_OVERLAP = 200
 
96
  split_chunks = text_splitter.split_documents(documents)
97
 
98
  docsearch = generate_vdb(split_chunks)
 
 
99
 
100
  # Let the user know that the system is ready
101
  msg = cl.Message(