Stéphanie Kamgnia Wonkap commited on
Commit
a6e92fe
·
1 Parent(s): 8efbea2

initial commit

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ #env files
2
+ .env
3
+ #virtual env
4
+ venv
README.md CHANGED
@@ -7,7 +7,7 @@ sdk: streamlit
7
  sdk_version: 1.39.0
8
  app_file: app.py
9
  pinned: false
10
- short_description: UN rag pour explorer le livre le collège de pediatrie
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
7
  sdk_version: 1.39.0
8
  app_file: app.py
9
  pinned: false
10
+ short_description: Un rag pour explorer le livre le collège de pediatrie
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Databricks notebook source
2
+ import streamlit as st
3
+ import os
4
+ import yaml
5
+ from dotenv import load_dotenv
6
+ from src.generator import answer_with_rag
7
+ from ragatouille import RAGPretrainedModel
8
+ from src.data_preparation import split_documents
9
+ from transformers import pipeline
10
+ from langchain_community.document_loaders import PyPDFLoader
11
+ from langchain.embeddings import HuggingFaceEmbeddings
12
+ from src.retriever import init_vectorDB_from_doc, retriever
13
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
14
+ from langchain_community.vectorstores import FAISS
15
+ import faiss
16
def load_config(path="./src/config.yml"):
    """Load the application settings from a YAML file.

    Args:
        path: Location of the YAML configuration file. Defaults to the
            path the application has always used.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file content.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        yaml.YAMLError: If the file is not valid YAML; the error is logged
            and then re-raised unchanged.
    """
    # Local import: the file header does not import `logging`, and the
    # previous code referenced an undefined `logger` in the error path.
    import logging

    with open(path, "r", encoding="utf-8") as file_object:
        try:
            return yaml.safe_load(file_object)
        except yaml.YAMLError:
            logging.getLogger(__name__).exception(
                "Could not parse configuration file %s", path
            )
            raise
26
+
27
cfg = load_config()
load_dotenv("./src/.env")

# Settings read from the YAML configuration.
EMBEDDING_MODEL_NAME = cfg["EMBEDDING_MODEL_NAME"]
DATA_FILE_PATH = cfg["DATA_FILE_PATH"]
READER_MODEL_NAME = cfg["READER_MODEL_NAME"]
RERANKER_MODEL_NAME = cfg["RERANKER_MODEL_NAME"]
VECTORDB_PATH = cfg["VECTORDB_PATH"]

if __name__ == "__main__":
    # These two names are used below but were never imported by the file
    # header, so they are brought into scope here.
    import torch
    from src.embeddings import init_embedding_model

    st.title("RAG App to query le College de Pédiatrie")

    user_query = st.text_input("Entrez votre question:")

    # Initialize the embedding model and the vector store.
    embedding_model = init_embedding_model(EMBEDDING_MODEL_NAME)

    if os.path.exists(VECTORDB_PATH):
        # NOTE(security): FAISS.load_local unpickles data from disk. This is
        # only acceptable because the index is written by this application
        # itself (see the `else` branch); never point it at untrusted files.
        KNOWLEDGE_VECTOR_DATABASE = FAISS.load_local(
            VECTORDB_PATH,
            embedding_model,
            allow_dangerous_deserialization=True,
        )
    else:
        # The PDF is only parsed and chunked when no saved index exists;
        # nothing else in the script needs the raw documents.
        loader = PyPDFLoader(DATA_FILE_PATH)
        raw_document_base = loader.load()
        MARKDOWN_SEPARATORS = [
            "\n#{1,6} ",
            "```\n",
            "\n\\*\\*\\*+\n",
            "\n---+\n",
            "\n___+\n",
            "\n\n",
            "\n",
            " ",
            "",
        ]
        docs_processed = split_documents(
            512,  # We choose a chunk size adapted to our model
            raw_document_base,
            tokenizer_name=EMBEDDING_MODEL_NAME,
            separator=MARKDOWN_SEPARATORS,
        )
        KNOWLEDGE_VECTOR_DATABASE = init_vectorDB_from_doc(docs_processed, embedding_model)
        KNOWLEDGE_VECTOR_DATABASE.save_local(VECTORDB_PATH)

    if st.button("Get Answer"):
        # Do not load the reader model for an empty question.
        if not user_query.strip():
            st.warning("Veuillez entrer une question.")
            st.stop()

        # Load the reader model with 4-bit quantization.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        model = AutoModelForCausalLM.from_pretrained(
            READER_MODEL_NAME, quantization_config=bnb_config
        )
        tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)

        READER_LLM = pipeline(
            model=model,
            tokenizer=tokenizer,
            task="text-generation",
            do_sample=True,
            temperature=0.2,
            repetition_penalty=1.1,
            return_full_text=False,
            max_new_tokens=500,
        )
        RERANKER = RAGPretrainedModel.from_pretrained(RERANKER_MODEL_NAME)
        num_doc_before_rerank = 15
        num_final_releveant_docs = 5

        # Get the answer and the passages it was based on.
        answer, relevant_docs = answer_with_rag(
            query=user_query,
            READER_MODEL_NAME=READER_MODEL_NAME,
            embedding_model=embedding_model,
            vectorDB=KNOWLEDGE_VECTOR_DATABASE,
            reranker=RERANKER,
            llm=READER_LLM,
            num_doc_before_rerank=num_doc_before_rerank,
            num_final_relevant_docs=num_final_releveant_docs,
            rerank=True,
        )

        # Display the answer
        st.write("### Answer:")
        st.write(answer)

        # Display the relevant documents. `retriever` returns plain strings
        # (page contents / reranked "content" values), so they are written
        # directly rather than through a `.text` attribute.
        st.write("### Relevant Documents:")
        for i, doc in enumerate(relevant_docs):
            st.write(f"Document {i}:\n{doc}")
config.yml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ EMBEDDING_MODEL_NAME: "OrdalieTech/Solon-embeddings-large-0.1"
2
+ READER_MODEL_NAME: "mistralai/Mistral-7B-Instruct-v0.3"
3
+ RERANKER_MODEL_NAME: "colbert-ir/colbertv2.0"
4
+ VECTORDB_PATH: "./vectorDB/KNOWLEDGE_VECTOR_DATABASE_index"
5
+ DATA_FILE_PATH: "./data/college_pediatrie_2024.pdf"
data/college_pediatrie_2024.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aaa5b6383d120dd1eda9048b230211deeabee2cdba3803caf7e4e40e21774c30
3
+ size 141324090
requirements.txt ADDED
@@ -0,0 +1,586 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==1.4.0
2
+ accelerate==0.34.2
3
+ aiohappyeyeballs==2.4.3
4
+ aiohttp==3.10.10
5
+ aiosignal==1.3.1
6
+ alabaster==0.7.16
7
+ albucore==0.0.19
8
+ albumentations==1.4.20
9
+ altair==4.2.2
10
+ annotated-types==0.7.0
11
+ annoy==1.17.3
12
+ anyio==3.7.1
13
+ argon2-cffi==23.1.0
14
+ argon2-cffi-bindings==21.2.0
15
+ array_record==0.5.1
16
+ arviz==0.20.0
17
+ astropy==6.1.4
18
+ astropy-iers-data==0.2024.10.28.0.34.7
19
+ astunparse==1.6.3
20
+ async-timeout==4.0.3
21
+ atpublic==4.1.0
22
+ attrs==24.2.0
23
+ audioread==3.0.1
24
+ autograd==1.7.0
25
+ babel==2.16.0
26
+ backcall==0.2.0
27
+ beautifulsoup4==4.12.3
28
+ bigframes==1.25.0
29
+ bigquery-magics==0.4.0
30
+ bitarray==3.0.0
31
+ bitsandbytes==0.44.1
32
+ bleach==6.2.0
33
+ blinker==1.4
34
+ blis==0.7.11
35
+ blosc2==2.0.0
36
+ bokeh==3.4.3
37
+ Bottleneck==1.4.2
38
+ bqplot==0.12.43
39
+ branca==0.8.0
40
+ CacheControl==0.14.0
41
+ cachetools==5.5.0
42
+ catalogue==2.0.10
43
+ certifi==2024.8.30
44
+ cffi==1.17.1
45
+ chardet==5.2.0
46
+ charset-normalizer==3.4.0
47
+ chex==0.1.87
48
+ clarabel==0.9.0
49
+ click==8.1.7
50
+ cloudpathlib==0.20.0
51
+ cloudpickle==3.1.0
52
+ cmake==3.30.5
53
+ cmdstanpy==1.2.4
54
+ colbert-ai==0.2.19
55
+ colorcet==3.1.0
56
+ colorlover==0.3.0
57
+ colour==0.1.5
58
+ community==1.0.0b1
59
+ confection==0.1.5
60
+ cons==0.4.6
61
+ contourpy==1.3.0
62
+ cryptography==43.0.3
63
+ cuda-python==12.2.1
64
+ cudf-cu12 @ https://pypi.nvidia.com/cudf-cu12/cudf_cu12-24.10.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
65
+ cufflinks==0.17.3
66
+ cupy-cuda12x==12.2.0
67
+ cvxopt==1.3.2
68
+ cvxpy==1.5.3
69
+ cycler==0.12.1
70
+ cymem==2.0.8
71
+ Cython==3.0.11
72
+ dask==2024.10.0
73
+ dataclasses-json==0.6.7
74
+ datascience==0.17.6
75
+ datasets==3.1.0
76
+ db-dtypes==1.3.0
77
+ dbus-python==1.2.18
78
+ debugpy==1.6.6
79
+ decorator==4.4.2
80
+ defusedxml==0.7.1
81
+ Deprecated==1.2.14
82
+ diffusers==0.30.3
83
+ dill==0.3.8
84
+ dirtyjson==1.0.8
85
+ distro==1.9.0
86
+ dlib==19.24.2
87
+ dm-tree==0.1.8
88
+ docker-pycreds==0.4.0
89
+ docstring_parser==0.16
90
+ docutils==0.18.1
91
+ dopamine_rl==4.0.9
92
+ duckdb==1.1.2
93
+ earthengine-api==1.2.0
94
+ easydict==1.13
95
+ ecos==2.0.14
96
+ editdistance==0.8.1
97
+ eerepr==0.0.4
98
+ einops==0.8.0
99
+ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
100
+ entrypoints==0.4
101
+ et_xmlfile==2.0.0
102
+ etils==1.10.0
103
+ etuples==0.3.9
104
+ eval_type_backport==0.2.0
105
+ exceptiongroup==1.2.2
106
+ faiss-cpu==1.9.0
107
+ faiss-gpu==1.7.2
108
+ fast-pytorch-kmeans==0.2.0.1
109
+ fastai==2.7.18
110
+ fastcore==1.7.19
111
+ fastdownload==0.0.7
112
+ fastjsonschema==2.20.0
113
+ fastprogress==1.0.3
114
+ fastrlock==0.8.2
115
+ filelock==3.16.1
116
+ firebase-admin==6.5.0
117
+ Flask==2.2.5
118
+ flatbuffers==24.3.25
119
+ flax==0.8.5
120
+ folium==0.18.0
121
+ fonttools==4.54.1
122
+ frozendict==2.4.6
123
+ frozenlist==1.5.0
124
+ fsspec==2024.9.0
125
+ future==1.0.0
126
+ gast==0.6.0
127
+ gcsfs==2024.10.0
128
+ GDAL==3.6.4
129
+ gdown==5.2.0
130
+ geemap==0.35.0
131
+ gensim==4.3.3
132
+ geocoder==1.38.1
133
+ geographiclib==2.0
134
+ geopandas==1.0.1
135
+ geopy==2.4.1
136
+ gin-config==0.5.0
137
+ git-python==1.0.3
138
+ gitdb==4.0.11
139
+ GitPython==3.1.43
140
+ glob2==0.7
141
+ google==2.0.3
142
+ google-ai-generativelanguage==0.6.10
143
+ google-api-core==2.19.2
144
+ google-api-python-client==2.137.0
145
+ google-auth==2.27.0
146
+ google-auth-httplib2==0.2.0
147
+ google-auth-oauthlib==1.2.1
148
+ google-cloud-aiplatform==1.70.0
149
+ google-cloud-bigquery==3.25.0
150
+ google-cloud-bigquery-connection==1.15.5
151
+ google-cloud-bigquery-storage==2.27.0
152
+ google-cloud-bigtable==2.26.0
153
+ google-cloud-core==2.4.1
154
+ google-cloud-datastore==2.19.0
155
+ google-cloud-firestore==2.16.1
156
+ google-cloud-functions==1.16.5
157
+ google-cloud-iam==2.16.0
158
+ google-cloud-language==2.13.4
159
+ google-cloud-pubsub==2.25.0
160
+ google-cloud-resource-manager==1.13.0
161
+ google-cloud-storage==2.8.0
162
+ google-cloud-translate==3.15.5
163
+ google-colab @ file:///colabtools/dist/google_colab-1.0.0.tar.gz
164
+ google-crc32c==1.6.0
165
+ google-generativeai==0.8.3
166
+ google-pasta==0.2.0
167
+ google-resumable-media==2.7.2
168
+ googleapis-common-protos==1.65.0
169
+ googledrivedownloader==0.4
170
+ graphviz==0.20.3
171
+ greenlet==3.1.1
172
+ grpc-google-iam-v1==0.13.1
173
+ grpcio==1.64.1
174
+ grpcio-status==1.48.2
175
+ gspread==6.0.2
176
+ gspread-dataframe==3.3.1
177
+ gym==0.25.2
178
+ gym-notices==0.0.8
179
+ h11==0.14.0
180
+ h5netcdf==1.4.0
181
+ h5py==3.12.1
182
+ holidays==0.59
183
+ holoviews==1.19.1
184
+ html5lib==1.1
185
+ httpcore==1.0.6
186
+ httpimport==1.4.0
187
+ httplib2==0.22.0
188
+ httpx==0.27.2
189
+ httpx-sse==0.4.0
190
+ huggingface-hub==0.24.7
191
+ humanize==4.11.0
192
+ hyperopt==0.2.7
193
+ ibis-framework==9.2.0
194
+ idna==3.10
195
+ imageio==2.36.0
196
+ imageio-ffmpeg==0.5.1
197
+ imagesize==1.4.1
198
+ imbalanced-learn==0.12.4
199
+ imgaug==0.4.0
200
+ immutabledict==4.2.0
201
+ importlib_metadata==8.5.0
202
+ importlib_resources==6.4.5
203
+ imutils==0.5.4
204
+ inflect==7.4.0
205
+ iniconfig==2.0.0
206
+ intel-cmplr-lib-ur==2025.0.0
207
+ intel-openmp==2025.0.0
208
+ ipyevents==2.0.2
209
+ ipyfilechooser==0.6.0
210
+ ipykernel==5.5.6
211
+ ipyleaflet==0.19.2
212
+ ipyparallel==8.8.0
213
+ ipython==7.34.0
214
+ ipython-genutils==0.2.0
215
+ ipython-sql==0.5.0
216
+ ipytree==0.2.2
217
+ ipywidgets==7.7.1
218
+ itsdangerous==2.2.0
219
+ jax==0.4.33
220
+ jax-cuda12-pjrt==0.4.33
221
+ jax-cuda12-plugin==0.4.33
222
+ jaxlib==0.4.33
223
+ jeepney==0.7.1
224
+ jellyfish==1.1.0
225
+ jieba==0.42.1
226
+ Jinja2==3.1.4
227
+ jiter==0.6.1
228
+ joblib==1.4.2
229
+ jsonpatch==1.33
230
+ jsonpickle==3.3.0
231
+ jsonpointer==3.0.0
232
+ jsonschema==4.23.0
233
+ jsonschema-specifications==2024.10.1
234
+ jupyter-client==6.1.12
235
+ jupyter-console==6.1.0
236
+ jupyter-leaflet==0.19.2
237
+ jupyter-server==1.24.0
238
+ jupyter_core==5.7.2
239
+ jupyterlab_pygments==0.3.0
240
+ jupyterlab_widgets==3.0.13
241
+ kaggle==1.6.17
242
+ kagglehub==0.3.3
243
+ keras==3.4.1
244
+ keyring==23.5.0
245
+ kiwisolver==1.4.7
246
+ langchain==0.3.7
247
+ langchain-community==0.3.5
248
+ langchain-core==0.3.15
249
+ langchain-huggingface==0.1.2
250
+ langchain-openai==0.2.6
251
+ langchain-text-splitters==0.3.0
252
+ langcodes==3.4.1
253
+ langsmith==0.1.137
254
+ language_data==1.2.0
255
+ launchpadlib==1.10.16
256
+ lazr.restfulclient==0.14.4
257
+ lazr.uri==1.0.6
258
+ lazy_loader==0.4
259
+ libclang==18.1.1
260
+ libcudf-cu12 @ https://pypi.nvidia.com/libcudf-cu12/libcudf_cu12-24.10.1-py3-none-manylinux_2_28_x86_64.whl
261
+ librosa==0.10.2.post1
262
+ lightgbm==4.5.0
263
+ linkify-it-py==2.0.3
264
+ llama-cloud==0.1.4
265
+ llama-index==0.11.22
266
+ llama-index-agent-openai==0.3.4
267
+ llama-index-cli==0.3.1
268
+ llama-index-core==0.11.22
269
+ llama-index-embeddings-openai==0.2.5
270
+ llama-index-indices-managed-llama-cloud==0.4.0
271
+ llama-index-legacy==0.9.48.post3
272
+ llama-index-llms-openai==0.2.16
273
+ llama-index-multi-modal-llms-openai==0.2.3
274
+ llama-index-program-openai==0.2.0
275
+ llama-index-question-gen-openai==0.2.0
276
+ llama-index-readers-file==0.2.2
277
+ llama-index-readers-llama-parse==0.3.0
278
+ llama-parse==0.5.13
279
+ llvmlite==0.43.0
280
+ locket==1.0.0
281
+ logical-unification==0.4.6
282
+ lxml==5.3.0
283
+ marisa-trie==1.2.1
284
+ Markdown==3.7
285
+ markdown-it-py==3.0.0
286
+ MarkupSafe==3.0.2
287
+ marshmallow==3.23.1
288
+ matplotlib==3.8.0
289
+ matplotlib-inline==0.1.7
290
+ matplotlib-venn==1.1.1
291
+ mdit-py-plugins==0.4.2
292
+ mdurl==0.1.2
293
+ miniKanren==1.0.3
294
+ missingno==0.5.2
295
+ mistune==3.0.2
296
+ mizani==0.13.0
297
+ mkl==2024.2.2
298
+ ml-dtypes==0.4.1
299
+ mlxtend==0.23.1
300
+ more-itertools==10.5.0
301
+ moviepy==1.0.3
302
+ mpmath==1.3.0
303
+ msgpack==1.1.0
304
+ multidict==6.1.0
305
+ multipledispatch==1.0.0
306
+ multiprocess==0.70.16
307
+ multitasking==0.0.11
308
+ murmurhash==1.0.10
309
+ music21==9.1.0
310
+ mypy-extensions==1.0.0
311
+ namex==0.0.8
312
+ natsort==8.4.0
313
+ nbclassic==1.1.0
314
+ nbclient==0.10.0
315
+ nbconvert==7.16.4
316
+ nbformat==5.10.4
317
+ nest-asyncio==1.6.0
318
+ networkx==3.4.2
319
+ nibabel==5.3.2
320
+ ninja==1.11.1.1
321
+ nltk==3.9.1
322
+ notebook==6.5.5
323
+ notebook_shim==0.2.4
324
+ numba==0.60.0
325
+ numexpr==2.10.1
326
+ numpy==1.26.4
327
+ nvidia-cublas-cu12==12.6.3.3
328
+ nvidia-cuda-cupti-cu12==12.6.80
329
+ nvidia-cuda-nvcc-cu12==12.6.77
330
+ nvidia-cuda-runtime-cu12==12.6.77
331
+ nvidia-cudnn-cu12==9.5.1.17
332
+ nvidia-cufft-cu12==11.3.0.4
333
+ nvidia-curand-cu12==10.3.7.77
334
+ nvidia-cusolver-cu12==11.7.1.2
335
+ nvidia-cusparse-cu12==12.5.4.2
336
+ nvidia-nccl-cu12==2.23.4
337
+ nvidia-nvjitlink-cu12==12.6.77
338
+ nvtx==0.2.10
339
+ nx-cugraph-cu12 @ https://pypi.nvidia.com/nx-cugraph-cu12/nx_cugraph_cu12-24.10.0-py3-none-any.whl
340
+ oauth2client==4.1.3
341
+ oauthlib==3.2.2
342
+ onnx==1.17.0
343
+ openai==1.54.1
344
+ opencv-contrib-python==4.10.0.84
345
+ opencv-python==4.10.0.84
346
+ opencv-python-headless==4.10.0.84
347
+ openpyxl==3.1.5
348
+ opentelemetry-api==1.16.0
349
+ opentelemetry-sdk==1.16.0
350
+ opentelemetry-semantic-conventions==0.37b0
351
+ opt_einsum==3.4.0
352
+ optax==0.2.3
353
+ optree==0.13.0
354
+ orbax-checkpoint==0.6.4
355
+ orjson==3.10.10
356
+ osqp==0.6.7.post3
357
+ packaging==24.1
358
+ pacmap==0.7.3
359
+ pandas==2.2.2
360
+ pandas-datareader==0.10.0
361
+ pandas-gbq==0.24.0
362
+ pandas-stubs==2.2.2.240909
363
+ pandocfilters==1.5.1
364
+ panel==1.4.5
365
+ param==2.1.1
366
+ parso==0.8.4
367
+ parsy==2.1
368
+ partd==1.4.2
369
+ pathlib==1.0.1
370
+ patsy==0.5.6
371
+ peewee==3.17.7
372
+ peft==0.13.2
373
+ pexpect==4.9.0
374
+ pickleshare==0.7.5
375
+ pillow==10.4.0
376
+ platformdirs==4.3.6
377
+ plotly==5.24.1
378
+ plotnine==0.14.0
379
+ pluggy==1.5.0
380
+ polars==1.9.0
381
+ pooch==1.8.2
382
+ portpicker==1.5.2
383
+ preshed==3.0.9
384
+ prettytable==3.11.0
385
+ proglog==0.1.10
386
+ progressbar2==4.5.0
387
+ prometheus_client==0.21.0
388
+ promise==2.3
389
+ prompt_toolkit==3.0.48
390
+ propcache==0.2.0
391
+ prophet==1.1.6
392
+ proto-plus==1.25.0
393
+ protobuf==3.20.3
394
+ psutil==5.9.5
395
+ psycopg2==2.9.10
396
+ ptyprocess==0.7.0
397
+ py-cpuinfo==9.0.0
398
+ py4j==0.10.9.7
399
+ pyarrow==17.0.0
400
+ pyarrow-hotfix==0.6
401
+ pyasn1==0.6.1
402
+ pyasn1_modules==0.4.1
403
+ pycocotools==2.0.8
404
+ pycparser==2.22
405
+ pydantic==2.9.2
406
+ pydantic-settings==2.6.1
407
+ pydantic_core==2.23.4
408
+ pydata-google-auth==1.8.2
409
+ pydot==3.0.2
410
+ pydotplus==2.0.2
411
+ PyDrive==1.3.1
412
+ PyDrive2==1.20.0
413
+ pyerfa==2.0.1.4
414
+ pygame==2.6.1
415
+ pygit2==1.16.0
416
+ Pygments==2.18.0
417
+ PyGObject==3.42.1
418
+ PyJWT==2.9.0
419
+ pylibcudf-cu12 @ https://pypi.nvidia.com/pylibcudf-cu12/pylibcudf_cu12-24.10.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
420
+ pylibcugraph-cu12==24.10.0
421
+ pylibraft-cu12==24.10.0
422
+ pymc==5.17.0
423
+ pymystem3==0.2.0
424
+ pynvjitlink-cu12==0.4.0
425
+ pynvml==11.5.3
426
+ pyogrio==0.10.0
427
+ PyOpenGL==3.1.7
428
+ pyOpenSSL==24.2.1
429
+ pyparsing==3.2.0
430
+ pypdf==5.1.0
431
+ pyperclip==1.9.0
432
+ pyproj==3.7.0
433
+ pyshp==2.3.1
434
+ PySocks==1.7.1
435
+ pyspark==3.5.3
436
+ pytensor==2.25.5
437
+ pytest==7.4.4
438
+ python-apt==0.0.0
439
+ python-box==7.2.0
440
+ python-dateutil==2.8.2
441
+ python-dotenv==1.0.1
442
+ python-louvain==0.16
443
+ python-slugify==8.0.4
444
+ python-utils==3.9.0
445
+ pytz==2024.2
446
+ pyviz_comms==3.0.3
447
+ PyYAML==6.0.2
448
+ pyzmq==24.0.1
449
+ qdldl==0.1.7.post4
450
+ RAGatouille==0.0.8.post4
451
+ ratelim==0.1.6
452
+ referencing==0.35.1
453
+ regex==2024.9.11
454
+ requests==2.32.3
455
+ requests-oauthlib==1.3.1
456
+ requests-toolbelt==1.0.0
457
+ requirements-parser==0.9.0
458
+ rich==13.9.3
459
+ rmm-cu12==24.10.0
460
+ rpds-py==0.20.0
461
+ rpy2==3.4.2
462
+ rsa==4.9
463
+ safetensors==0.4.5
464
+ scikit-image==0.24.0
465
+ scikit-learn==1.5.2
466
+ scipy==1.13.1
467
+ scooby==0.10.0
468
+ scs==3.2.7
469
+ seaborn==0.13.2
470
+ SecretStorage==3.3.1
471
+ Send2Trash==1.8.3
472
+ sentence-transformers==2.7.0
473
+ sentencepiece==0.2.0
474
+ sentry-sdk==2.17.0
475
+ setproctitle==1.3.3
476
+ shap==0.46.0
477
+ shapely==2.0.6
478
+ shellingham==1.5.4
479
+ simple-parsing==0.1.6
480
+ six==1.16.0
481
+ sklearn-pandas==2.2.0
482
+ slicer==0.0.8
483
+ smart-open==7.0.5
484
+ smmap==5.0.1
485
+ sniffio==1.3.1
486
+ snowballstemmer==2.2.0
487
+ soundfile==0.12.1
488
+ soupsieve==2.6
489
+ soxr==0.5.0.post1
490
+ spacy==3.7.5
491
+ spacy-legacy==3.0.12
492
+ spacy-loggers==1.0.5
493
+ Sphinx==5.0.2
494
+ sphinxcontrib-applehelp==2.0.0
495
+ sphinxcontrib-devhelp==2.0.0
496
+ sphinxcontrib-htmlhelp==2.1.0
497
+ sphinxcontrib-jsmath==1.0.1
498
+ sphinxcontrib-qthelp==2.0.0
499
+ sphinxcontrib-serializinghtml==2.0.0
500
+ SQLAlchemy==2.0.35
501
+ sqlglot==25.1.0
502
+ sqlparse==0.5.1
503
+ srsly==2.4.8
504
+ stanio==0.5.1
505
+ statsmodels==0.14.4
506
+ StrEnum==0.4.15
507
+ stringzilla==3.10.6
508
+ striprtf==0.0.26
509
+ sympy==1.13.1
510
+ tables==3.8.0
511
+ tabulate==0.9.0
512
+ tbb==2021.13.1
513
+ tcmlib==1.2.0
514
+ tenacity==8.5.0
515
+ tensorboard==2.17.0
516
+ tensorboard-data-server==0.7.2
517
+ tensorflow==2.17.0
518
+ tensorflow-datasets==4.9.6
519
+ tensorflow-hub==0.16.1
520
+ tensorflow-io-gcs-filesystem==0.37.1
521
+ tensorflow-metadata==1.16.1
522
+ tensorflow-probability==0.24.0
523
+ tensorstore==0.1.67
524
+ termcolor==2.5.0
525
+ terminado==0.18.1
526
+ text-unidecode==1.3
527
+ textblob==0.17.1
528
+ tf-slim==1.1.0
529
+ tf_keras==2.17.0
530
+ thinc==8.2.5
531
+ threadpoolctl==3.5.0
532
+ tifffile==2024.9.20
533
+ tiktoken==0.8.0
534
+ timm==1.0.11
535
+ tinycss2==1.4.0
536
+ tokenizers==0.19.1
537
+ toml==0.10.2
538
+ tomli==2.0.2
539
+ toolz==0.12.1
540
+ torch @ https://download.pytorch.org/whl/cu121_full/torch-2.5.0%2Bcu121-cp310-cp310-linux_x86_64.whl
541
+ torchaudio @ https://download.pytorch.org/whl/cu121_full/torchaudio-2.5.0%2Bcu121-cp310-cp310-linux_x86_64.whl
542
+ torchsummary==1.5.1
543
+ torchvision @ https://download.pytorch.org/whl/cu121_full/torchvision-0.20.0%2Bcu121-cp310-cp310-linux_x86_64.whl
544
+ tornado==6.3.3
545
+ tqdm==4.66.6
546
+ traitlets==5.7.1
547
+ traittypes==0.2.1
548
+ transformers==4.44.2
549
+ tweepy==4.14.0
550
+ typeguard==4.4.0
551
+ typer==0.12.5
552
+ types-pytz==2024.2.0.20241003
553
+ types-setuptools==75.2.0.20241025
554
+ typing-inspect==0.9.0
555
+ typing_extensions==4.12.2
556
+ tzdata==2024.2
557
+ tzlocal==5.2
558
+ uc-micro-py==1.0.3
559
+ ujson==5.10.0
560
+ umf==0.9.0
561
+ uritemplate==4.1.1
562
+ urllib3==2.2.3
563
+ vega-datasets==0.9.0
564
+ voyager==2.0.9
565
+ wadllib==1.3.6
566
+ wandb==0.18.5
567
+ wasabi==1.1.3
568
+ wcwidth==0.2.13
569
+ weasel==0.4.1
570
+ webcolors==24.8.0
571
+ webencodings==0.5.1
572
+ websocket-client==1.8.0
573
+ Werkzeug==3.0.6
574
+ widgetsnbextension==3.6.10
575
+ wordcloud==1.9.3
576
+ wrapt==1.16.0
577
+ xarray==2024.10.0
578
+ xarray-einstats==0.8.0
579
+ xgboost==2.1.2
580
+ xlrd==2.0.1
581
+ xxhash==3.5.0
582
+ xyzservices==2024.9.0
583
+ yarl==1.17.0
584
+ yellowbrick==1.5
585
+ yfinance==0.2.48
586
+ zipp==3.20.2
src/__init__.py ADDED
File without changes
src/app.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Databricks notebook source
2
+ import streamlit as st
3
+ import os
4
+ import yaml
5
+ from dotenv import load_dotenv
6
+ from src.generator import answer_with_rag
7
+ from ragatouille import RAGPretrainedModel
8
+ from src.data_preparation import split_documents
9
+ from transformers import pipeline
10
+ from langchain_community.document_loaders import PyPDFLoader
11
+ from langchain.embeddings import HuggingFaceEmbeddings
12
+ from src.retriever import init_vectorDB_from_doc, retriever
13
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
14
+ from langchain_community.vectorstores import FAISS
15
+ import faiss
16
def load_config(path="./src/config.yml"):
    """Load the application settings from a YAML file.

    Args:
        path: Location of the YAML configuration file. Defaults to the
            path the application has always used.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file content.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        yaml.YAMLError: If the file is not valid YAML; the error is logged
            and then re-raised unchanged.
    """
    # Local import: the file header does not import `logging`, and the
    # previous code referenced an undefined `logger` in the error path.
    import logging

    with open(path, "r", encoding="utf-8") as file_object:
        try:
            return yaml.safe_load(file_object)
        except yaml.YAMLError:
            logging.getLogger(__name__).exception(
                "Could not parse configuration file %s", path
            )
            raise
26
+
27
cfg = load_config()
load_dotenv("./src/.env")

# Settings read from the YAML configuration.
EMBEDDING_MODEL_NAME = cfg["EMBEDDING_MODEL_NAME"]
DATA_FILE_PATH = cfg["DATA_FILE_PATH"]
READER_MODEL_NAME = cfg["READER_MODEL_NAME"]
RERANKER_MODEL_NAME = cfg["RERANKER_MODEL_NAME"]
VECTORDB_PATH = cfg["VECTORDB_PATH"]

if __name__ == "__main__":
    # These two names are used below but were never imported by the file
    # header, so they are brought into scope here.
    import torch
    from src.embeddings import init_embedding_model

    st.title("RAG App to query le College de Pédiatrie")

    user_query = st.text_input("Entrez votre question:")

    # Initialize the embedding model and the vector store.
    embedding_model = init_embedding_model(EMBEDDING_MODEL_NAME)

    if os.path.exists(VECTORDB_PATH):
        # NOTE(security): FAISS.load_local unpickles data from disk. This is
        # only acceptable because the index is written by this application
        # itself (see the `else` branch); never point it at untrusted files.
        KNOWLEDGE_VECTOR_DATABASE = FAISS.load_local(
            VECTORDB_PATH,
            embedding_model,
            allow_dangerous_deserialization=True,
        )
    else:
        # The PDF is only parsed and chunked when no saved index exists;
        # nothing else in the script needs the raw documents.
        loader = PyPDFLoader(DATA_FILE_PATH)
        raw_document_base = loader.load()
        MARKDOWN_SEPARATORS = [
            "\n#{1,6} ",
            "```\n",
            "\n\\*\\*\\*+\n",
            "\n---+\n",
            "\n___+\n",
            "\n\n",
            "\n",
            " ",
            "",
        ]
        docs_processed = split_documents(
            512,  # We choose a chunk size adapted to our model
            raw_document_base,
            tokenizer_name=EMBEDDING_MODEL_NAME,
            separator=MARKDOWN_SEPARATORS,
        )
        KNOWLEDGE_VECTOR_DATABASE = init_vectorDB_from_doc(docs_processed, embedding_model)
        KNOWLEDGE_VECTOR_DATABASE.save_local(VECTORDB_PATH)

    if st.button("Get Answer"):
        # Do not load the reader model for an empty question.
        if not user_query.strip():
            st.warning("Veuillez entrer une question.")
            st.stop()

        # Load the reader model with 4-bit quantization.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        model = AutoModelForCausalLM.from_pretrained(
            READER_MODEL_NAME, quantization_config=bnb_config
        )
        tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)

        READER_LLM = pipeline(
            model=model,
            tokenizer=tokenizer,
            task="text-generation",
            do_sample=True,
            temperature=0.2,
            repetition_penalty=1.1,
            return_full_text=False,
            max_new_tokens=500,
        )
        RERANKER = RAGPretrainedModel.from_pretrained(RERANKER_MODEL_NAME)
        num_doc_before_rerank = 15
        num_final_releveant_docs = 5

        # Get the answer and the passages it was based on.
        answer, relevant_docs = answer_with_rag(
            query=user_query,
            READER_MODEL_NAME=READER_MODEL_NAME,
            embedding_model=embedding_model,
            vectorDB=KNOWLEDGE_VECTOR_DATABASE,
            reranker=RERANKER,
            llm=READER_LLM,
            num_doc_before_rerank=num_doc_before_rerank,
            num_final_relevant_docs=num_final_releveant_docs,
            rerank=True,
        )

        # Display the answer
        st.write("### Answer:")
        st.write(answer)

        # Display the relevant documents. `retriever` returns plain strings
        # (page contents / reranked "content" values), so they are written
        # directly rather than through a `.text` attribute.
        st.write("### Relevant Documents:")
        for i, doc in enumerate(relevant_docs):
            st.write(f"Document {i}:\n{doc}")
src/config.yml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ EMBEDDING_MODEL_NAME: "OrdalieTech/Solon-embeddings-large-0.1"
2
+ READER_MODEL_NAME: "mistralai/Mistral-7B-Instruct-v0.3"
3
+ RERANKER_MODEL_NAME: "colbert-ir/colbertv2.0"
4
+ VECTORDB_PATH: "./vectorDB/KNOWLEDGE_VECTOR_DATABASE_index"
5
+ DATA_FILE_PATH: "./data/college_pediatrie_2024.pdf"
src/data_preparation.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Databricks notebook source
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from transformers import AutoTokenizer
4
+ from sentence_transformers import SentenceTransformer
5
+ from langchain_community.document_loaders import PyPDFLoader
6
+ from langchain.embeddings import HuggingFaceEmbeddings
7
+ from langchain.vectorstores import FAISS
8
+ from transformers import AutoTokenizer, AutoModelForQuestionAnswering,pipeline
9
+ from transformers import AutoTokenizer, pipeline
10
+ from langchain.docstore.document import Document as LangchainDocument
11
+ from typing import List, Optional
12
+ #from langchain import HuggingFacePipeline
13
+ #from langchain.chains import RetrievalQA
14
+
15
EMBEDDING_MODEL_NAME = "OrdalieTech/Solon-embeddings-large-0.1"


def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
    separator: List[str] = None,
) -> List[LangchainDocument]:
    """Chunk every document of `knowledge_base` and drop repeated chunks.

    Chunk length is measured with the Hugging Face tokenizer named by
    `tokenizer_name`; consecutive chunks overlap by a tenth of `chunk_size`.

    Args:
        chunk_size: Maximum chunk size handed to the text splitter.
        knowledge_base: Documents to split.
        tokenizer_name: Name of the tokenizer passed to
            ``AutoTokenizer.from_pretrained``.
        separator: Separator list forwarded to the splitter as ``separators``.

    Returns:
        The chunks in order of first appearance, keeping only the first
        chunk for any given ``page_content``.
    """
    hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        hf_tokenizer,
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=separator,
    )

    # Split one source document at a time, collecting all chunks in order.
    chunks = []
    for source_doc in knowledge_base:
        chunks.extend(splitter.split_documents([source_doc]))

    # Keep only the first chunk seen for each distinct text.
    seen_texts = set()
    unique_chunks = []
    for chunk in chunks:
        text = chunk.page_content
        if text in seen_texts:
            continue
        seen_texts.add(text)
        unique_chunks.append(chunk)

    return unique_chunks
src/embeddings.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Databricks notebook source
2
+ from langchain_huggingface import HuggingFaceEmbeddings
3
+ from langchain_community.vectorstores.utils import DistanceStrategy
4
+
5
+
6
def init_embedding_model(EMBEDDING_MODEL_NAME: str, device=None):
    """Create the HuggingFace embedding model used for indexing and querying.

    Args:
        EMBEDDING_MODEL_NAME: Model name passed to ``HuggingFaceEmbeddings``.
        device: Device string placed in ``model_kwargs`` (for example
            ``"cuda"`` or ``"cpu"``). When omitted, ``"cuda"`` is used if
            PyTorch reports an available GPU and ``"cpu"`` otherwise, so the
            function no longer fails on CPU-only hosts.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with
        ``multi_process=True`` and normalized embeddings.
    """
    if device is None:
        # Local import: this module does not import torch at file level.
        import torch

        device = "cuda" if torch.cuda.is_available() else "cpu"

    embedding_model = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        multi_process=True,
        model_kwargs={"device": device},
        # Set `True` for cosine similarity
        encode_kwargs={"normalize_embeddings": True},
    )
    return embedding_model
src/generator.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Databricks notebook source
2
+ from src.retriever import init_vectorDB_from_doc, retriever
3
+
4
+ from transformers import AutoTokenizer, pipeline
5
+ from typing import List,Optional, Tuple # import the Tuple type
6
+ from langchain.docstore.document import Document as LangchainDocument
7
def promt_template(query: str, READER_MODEL_NAME: str, context: str):
    """Build the RAG prompt template in the reader model's chat format.

    The returned string still contains the literal ``{context}`` and
    ``{query}`` placeholders; the caller fills them in with ``str.format``.

    Args:
        query: Not used here; the question is substituted later by the caller.
        READER_MODEL_NAME: Model whose tokenizer supplies the chat template.
        context: Not used here; the context is substituted later by the caller.

    Returns:
        The chat-formatted prompt template as text (``tokenize=False``),
        ending with the generation prompt.
    """
    # Typos in the system instruction ("nswer", missing space after
    # "relevant.") are corrected so the model receives a clean instruction.
    prompt_in_chat_format = [
        {
            "role": "system",
            "content": """Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant. If the answer cannot be deduced from the context, do not give an answer. Please answer in french""",
        },
        {
            "role": "user",
            "content": """Context:
{context}

---
Now here is the question you need to answer.

Question: {query}""",
        },
    ]
    tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)
    RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
        prompt_in_chat_format, tokenize=False, add_generation_prompt=True
    )
    return RAG_PROMPT_TEMPLATE
31
+
32
def answer_with_rag(
    query: str,
    embedding_model,
    vectorDB: "FAISS",
    READER_MODEL_NAME: str,
    reranker,
    llm: pipeline,
    num_doc_before_rerank: int = 5,
    num_final_relevant_docs: int = 5,
    rerank: bool = True,
) -> Tuple[str, List[str]]:
    """Retrieve supporting passages for `query` and generate an answer.

    Args:
        query: The user's question.
        embedding_model: Accepted for interface compatibility; not used in
            this function.
        vectorDB: Vector store handed to ``retriever``. The annotation is a
            string because ``FAISS`` is not imported in this module, and an
            unquoted annotation would raise NameError when the module loads.
        READER_MODEL_NAME: Model name used to build the chat prompt template.
        reranker: Reranker forwarded to ``retriever``.
        llm: Callable text-generation pipeline; called with the final prompt.
        num_doc_before_rerank: Number of passages fetched before reranking.
        num_final_relevant_docs: Number of passages kept after reranking.
        rerank: Whether reranking is requested.

    Returns:
        A tuple ``(answer, relevant_docs)`` where ``relevant_docs`` is the
        list of passage texts returned by ``retriever``.
    """
    # Gather the passages and concatenate them into the prompt context.
    relevant_docs = retriever(
        query,
        vectorDB,
        reranker,
        num_doc_before_rerank,
        num_final_relevant_docs,
        rerank,
    )
    context = "\nExtracted documents:\n"
    context += "".join(
        [f"Document {str(i)}:::\n" + doc for i, doc in enumerate(relevant_docs)]
    )

    # Build the final prompt by filling the template's placeholders.
    RAG_PROMPT_TEMPLATE = promt_template(query, READER_MODEL_NAME, context)
    final_prompt = RAG_PROMPT_TEMPLATE.format(
        query=query, context=context, READER_MODEL_NAME=READER_MODEL_NAME
    )
    print("=> Final prompt:")

    # Generate the answer.
    print("=> Generating answer...")
    answer = llm(final_prompt)[0]["generated_text"]

    return answer, relevant_docs
src/retriever.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Databricks notebook source
2
+ from typing import List,Optional
3
+ from langchain.vectorstores import FAISS
4
+ from langchain.embeddings.base import Embeddings
5
+ from langchain_community.vectorstores.utils import DistanceStrategy
6
+ from transformers import RagRetriever
7
+ from langchain.docstore.document import Document as LangchainDocument
8
+
9
def init_vectorDB_from_doc(documents: List[LangchainDocument], embedding_model: Embeddings) -> FAISS:
    """Build a FAISS vector store from `documents` using `embedding_model`.

    The store is created with the cosine distance strategy.
    """
    return FAISS.from_documents(
        documents,
        embedding_model,
        distance_strategy=DistanceStrategy.COSINE,
    )
14
+
15
+
16
+
17
+
18
+
19
def retriever(
    user_query: str,
    vectorDB: FAISS,
    reranker = None,
    num_doc_before_rerank: int = 5,
    num_final_relevant_docs: int = 5,
    rerank: bool = True
) -> List[str]:
    """Return the passage texts most relevant to `user_query`.

    Fetches `num_doc_before_rerank` hits from `vectorDB` via
    ``similarity_search`` and keeps only their ``page_content``. When
    `rerank` is true and a `reranker` is given, the texts are passed to
    ``reranker.rerank`` with ``k=num_final_relevant_docs`` and the
    ``"content"`` of each result is returned; otherwise the similarity-search
    texts are returned as they are.
    """
    hits = vectorDB.similarity_search(query=user_query, k=num_doc_before_rerank)
    candidate_texts = [hit.page_content for hit in hits]  # text only
    print("=> Relevant documents:")
    print(candidate_texts)

    use_reranker = bool(rerank and reranker)
    if use_reranker:
        reranked = reranker.rerank(user_query, candidate_texts, k=num_final_relevant_docs)
        selected_texts = [entry["content"] for entry in reranked]
        print("=> Reranked documents:")
        print(selected_texts)
    else:
        selected_texts = candidate_texts

    # NOTE(review): the source lost its indentation; these two log lines are
    # assumed to run after both branches — confirm against the repository.
    print("=> Final relevant documents:")
    print(selected_texts)
    return selected_texts