Upload 15 files
Browse files- .github/workflows/update_space.yml +28 -0
- .gitignore +167 -0
- .gradio/certificate.pem +31 -0
- README.md +42 -12
- app.py +86 -0
- chunker.py +32 -0
- docs/A Boon or Bane for Students.txt +33 -0
- docs/Black Friday.txt +15 -0
- docs/Computer and its Uses for School Students and Children.txt +28 -0
- docs/How LED lights can save sea turtles’ lives.txt +9 -0
- docs/Modern-day slavery.txt +20 -0
- docs/Underwater search for lost love.txt +13 -0
- rag.py +49 -0
- requirements.txt +0 -0
- retriever.py +69 -0
.github/workflows/update_space.yml
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Deploys the Gradio app to Hugging Face Spaces on every push to `main`.
name: Run Python script

on:
  push:
    branches:
      - main

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      # v2 of the checkout/setup-python actions targets a retired Node runtime;
      # use the currently maintained major versions.
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.9'

      - name: Install Gradio
        run: python -m pip install gradio

      - name: Log in to Hugging Face
        # Hand the secret over through the environment rather than pasting it
        # into the script text, so characters in the token cannot break (or
        # inject into) the inline Python command.
        env:
          HF_TOKEN: ${{ secrets.hf_token }}
        run: python -c 'import os, huggingface_hub; huggingface_hub.login(token=os.environ["HF_TOKEN"])'

      - name: Deploy to Spaces
        run: gradio deploy
|
.gitignore
ADDED
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
# C extensions
|
6 |
+
*.so
|
7 |
+
|
8 |
+
# Distribution / packaging
|
9 |
+
.Python
|
10 |
+
build/
|
11 |
+
develop-eggs/
|
12 |
+
dist/
|
13 |
+
downloads/
|
14 |
+
eggs/
|
15 |
+
.eggs/
|
16 |
+
lib/
|
17 |
+
lib64/
|
18 |
+
parts/
|
19 |
+
sdist/
|
20 |
+
var/
|
21 |
+
wheels/
|
22 |
+
share/python-wheels/
|
23 |
+
*.egg-info/
|
24 |
+
.installed.cfg
|
25 |
+
*.egg
|
26 |
+
MANIFEST
|
27 |
+
|
28 |
+
# PyInstaller
|
29 |
+
# Usually these files are written by a python script from a template
|
30 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
31 |
+
*.manifest
|
32 |
+
*.spec
|
33 |
+
|
34 |
+
# Installer logs
|
35 |
+
pip-log.txt
|
36 |
+
pip-delete-this-directory.txt
|
37 |
+
|
38 |
+
# Unit test / coverage reports
|
39 |
+
htmlcov/
|
40 |
+
.tox/
|
41 |
+
.nox/
|
42 |
+
.coverage
|
43 |
+
.coverage.*
|
44 |
+
.cache
|
45 |
+
nosetests.xml
|
46 |
+
coverage.xml
|
47 |
+
*.cover
|
48 |
+
*.py,cover
|
49 |
+
.hypothesis/
|
50 |
+
.pytest_cache/
|
51 |
+
cover/
|
52 |
+
|
53 |
+
# Translations
|
54 |
+
*.mo
|
55 |
+
*.pot
|
56 |
+
|
57 |
+
# Django stuff:
|
58 |
+
*.log
|
59 |
+
local_settings.py
|
60 |
+
db.sqlite3
|
61 |
+
db.sqlite3-journal
|
62 |
+
|
63 |
+
# Flask stuff:
|
64 |
+
instance/
|
65 |
+
.webassets-cache
|
66 |
+
|
67 |
+
# Scrapy stuff:
|
68 |
+
.scrapy
|
69 |
+
|
70 |
+
# Sphinx documentation
|
71 |
+
docs/_build/
|
72 |
+
|
73 |
+
# PyBuilder
|
74 |
+
.pybuilder/
|
75 |
+
target/
|
76 |
+
|
77 |
+
# Jupyter Notebook
|
78 |
+
.ipynb_checkpoints
|
79 |
+
|
80 |
+
# IPython
|
81 |
+
profile_default/
|
82 |
+
ipython_config.py
|
83 |
+
|
84 |
+
# pyenv
|
85 |
+
# For a library or package, you might want to ignore these files since the code is
|
86 |
+
# intended to run in multiple environments; otherwise, check them in:
|
87 |
+
# .python-version
|
88 |
+
|
89 |
+
# pipenv
|
90 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
91 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
92 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
93 |
+
# install all needed dependencies.
|
94 |
+
#Pipfile.lock
|
95 |
+
|
96 |
+
# UV
|
97 |
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
98 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
99 |
+
# commonly ignored for libraries.
|
100 |
+
#uv.lock
|
101 |
+
|
102 |
+
# poetry
|
103 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
104 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
105 |
+
# commonly ignored for libraries.
|
106 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
107 |
+
#poetry.lock
|
108 |
+
|
109 |
+
# pdm
|
110 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
111 |
+
#pdm.lock
|
112 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
113 |
+
# in version control.
|
114 |
+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
115 |
+
.pdm.toml
|
116 |
+
.pdm-python
|
117 |
+
.pdm-build/
|
118 |
+
|
119 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
120 |
+
__pypackages__/
|
121 |
+
|
122 |
+
# Celery stuff
|
123 |
+
celerybeat-schedule
|
124 |
+
celerybeat.pid
|
125 |
+
|
126 |
+
# SageMath parsed files
|
127 |
+
*.sage.py
|
128 |
+
|
129 |
+
# Environments
|
130 |
+
.env
|
131 |
+
.venv
|
132 |
+
env/
|
133 |
+
venv/
|
134 |
+
ENV/
|
135 |
+
env.bak/
|
136 |
+
venv.bak/
|
137 |
+
|
138 |
+
# Spyder project settings
|
139 |
+
.spyderproject
|
140 |
+
.spyproject
|
141 |
+
|
142 |
+
# Rope project settings
|
143 |
+
.ropeproject
|
144 |
+
|
145 |
+
# mkdocs documentation
|
146 |
+
/site
|
147 |
+
|
148 |
+
# mypy
|
149 |
+
.mypy_cache/
|
150 |
+
.dmypy.json
|
151 |
+
dmypy.json
|
152 |
+
|
153 |
+
# Pyre type checker
|
154 |
+
.pyre/
|
155 |
+
|
156 |
+
# pytype static type analyzer
|
157 |
+
.pytype/
|
158 |
+
|
159 |
+
# Cython debug symbols
|
160 |
+
cython_debug/
|
161 |
+
|
162 |
+
# PyCharm
|
163 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
164 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
165 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
166 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
167 |
+
#.idea/
|
.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
-----BEGIN CERTIFICATE-----
|
2 |
+
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
|
3 |
+
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
|
4 |
+
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
|
5 |
+
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
|
6 |
+
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
|
7 |
+
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
|
8 |
+
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
|
9 |
+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
|
10 |
+
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
|
11 |
+
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
|
12 |
+
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
|
13 |
+
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
|
14 |
+
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
|
15 |
+
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
|
16 |
+
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
|
17 |
+
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
|
18 |
+
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
|
19 |
+
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
|
20 |
+
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
|
21 |
+
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
|
22 |
+
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
|
23 |
+
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
|
24 |
+
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
|
25 |
+
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
|
26 |
+
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
|
27 |
+
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
|
28 |
+
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
|
29 |
+
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
|
30 |
+
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
|
31 |
+
-----END CERTIFICATE-----
|
README.md
CHANGED
@@ -1,12 +1,42 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# RAG. Question answering bot.
|
2 |
+

|
3 |
+
|
4 |
+
### Topics
|
5 |
+
- [Data source](#data-source) ✔️
|
6 |
+
- [Chunking](#chunking) ✔️
|
7 |
+
- [LLM](#llm) ✔️
|
8 |
+
- [Retriever](#retriever) ✔️
|
9 |
+
- [Reranker](#reranker) ✔️
|
10 |
+
- [Citation](#citation) ❌
|
11 |
+
- [Web UI and deployment](#web-ui-and-deployment) ✔️
|
12 |
+
|
13 |
+
|
14 |
+
## Data source
|
15 |
+
|
16 |
+
I used documents found on the Internet. You can take a look at them in the **docs** directory and ask questions based on that context. You can also upload your own .txt files and use them as context.
|
17 |
+
|
18 |
+
## Chunking
|
19 |
+
Chunking was performed using the same method explained in the live-coding session. No other libraries were involved.
|
20 |
+
|
21 |
+
## LLM
|
22 |
+
As the LLM, I used the pretrained model [llama3-70b-8192](https://huggingface.co/Groq/Llama-3-Groq-70B-Tool-Use).
|
23 |
+
|
24 |
+
## Retriever
|
25 |
+
Retrieving can be performed in three different ways. You can either use BM25 retriever or a dense retriever by calculating semantic scores. Using both of them in hybrid approach is also an option.
|
26 |
+
|
27 |
+
Dense retriever used in this lab - sentence-transformers/all-distilroberta-v1.
|
28 |
+
|
29 |
+
#### Here's an example when dense retriever works better than BM25:
|
30 |
+

|
31 |
+

|
32 |
+
|
33 |
+
|
34 |
+
## Reranker
|
35 |
+
|
36 |
+
The reranker is a cross-encoder, cross-encoder/stsb-roberta-base. It may not be worthwhile in my case: because the number of documents is quite small, reranking takes extra time to process the data but does not improve the extraction of context.
|
37 |
+
## Citation
|
38 |
+
|
39 |
+
Not implemented yet.
|
40 |
+
## Web UI and deployment
|
41 |
+
I used gradio lib for demo and hosting.
|
42 |
+
|
app.py
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from rag import QuestionAnsweringBot
|
3 |
+
from rag import read_docs, dir_path
|
4 |
+
from typing import List
|
5 |
+
|
6 |
+
|
7 |
+
def upload_file(files) -> List[str]:
    """Return one path-like entry for every uploaded item.

    Args:
        files: Iterable of uploaded items, or ``None`` when nothing was
            uploaded. Items that expose a ``name`` attribute contribute that
            attribute; any other item (for example a plain path string) is
            passed through unchanged.

    Returns:
        A list with one entry per uploaded item; empty when ``files`` is
        ``None`` or empty.
    """
    # NOTE(review): presumably ``name`` holds the temp-file path assigned by
    # Gradio - confirm against the Gradio version in use.
    return [getattr(file, "name", file) for file in files or []]
|
10 |
+
|
11 |
+
|
12 |
+
def read_uploaded_docs(uploaded_docs: List[str]) -> List[str]:
    """Load the full text of every uploaded document.

    Args:
        uploaded_docs: Paths of the files to read.

    Returns:
        The contents of each file, decoded as UTF-8, in the same order as
        ``uploaded_docs``.
    """
    def _read_text(location: str) -> str:
        # The context manager closes the handle even if decoding fails.
        with open(location, 'r', encoding='utf-8') as handle:
            return handle.read()

    return [_read_text(location) for location in uploaded_docs]
|
19 |
+
|
20 |
+
|
21 |
+
def answer_question(docs, query: str, score: str, api_key):
    """Answer ``query`` using the uploaded documents or the bundled ones.

    Args:
        docs: Uploaded document paths; when falsy, the documents under
            ``dir_path`` are read instead.
        query: The user's question.
        score: Retrieval method selected in the UI: ``'BM25'``, ``'Dense'``
            or ``'Both'``.
        api_key: Key forwarded to ``QuestionAnsweringBot``.

    Returns:
        The bot's answer, or a human-readable message when the API key is
        missing or ``score`` is not one of the supported methods.
    """
    if not api_key:
        return "API key needed to proceed."

    # Integer code passed as the bot's second positional argument; what each
    # code means is defined by QuestionAnsweringBot, not here.
    retriever_modes = {'BM25': 0, 'Dense': 1, 'Both': 2}

    # Validate before touching any documents: an unknown (or unset) score
    # previously left `bot` unassigned and crashed with UnboundLocalError.
    if score not in retriever_modes:
        return f"Unknown scoring method: {score!r}. Expected one of: BM25, Dense, Both."

    docs = read_uploaded_docs(docs) if docs else read_docs(dir_path=dir_path)

    bot = QuestionAnsweringBot(docs, retriever_modes[score], api_key)
    return bot.answer_question(question=query)
|
34 |
+
|
35 |
+
|
36 |
+
# Build the web UI: an intro panel, the inputs (documents, API key, query,
# scoring method) and a submit button wired to `answer_question`.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # Question Answering Bot

        This bot uses default documents or those you provided to answer questions based on their content.
        You can select from the following scoring methods for retrieving the context:
        - **BM25 scores**
        - **Dense retriever**
        - **Hybrid approach** (both BM25 and dense retriever combined).

        ## Instructions
        - Enter your **Groq API Key** in the textbox below.
        - The API key can be generated using [this link](https://console.groq.com/keys).
        - Input your query and select the scoring method to receive an answer.
        - Ask questions directly based on files given in **docs** directory in my [github repository](https://github.com/olenkapyrih/RAG/tree/master)
        - Or upload your files and use them as context. Just remember that the only allowed format is **.txt**
        """
    )

    # Optional context files; when none are uploaded the handler falls back
    # to the bundled documents.
    uploaded_docs = gr.File(
        label="Upload Documents",
        file_types=[".txt"],
        file_count="multiple"
    )

    # Masked input so the key is not shown on screen.
    api_key = gr.Textbox(
        label='Groq API Key',
        placeholder="Enter your Groq API Key securely here.",
        type="password"
    )

    query = gr.Textbox(
        label='Query',
        # Adjacent literals instead of a backslash continuation, so source
        # indentation can never leak into the placeholder text.
        placeholder=(
            "Ask a question. "
            "Ex: Does a slavery still exist? Tell me about it."
        )
    )

    # The choice strings are the values `answer_question` receives as `score`.
    score = gr.Radio(
        choices=["BM25", "Dense", "Both"],
        label="Select Scoring Method",
        value="Both"
    )

    outp = gr.Textbox(label='Answer', lines=6)
    button = gr.Button(value='Submit', variant='primary', key='enter')
    # Input order must match the parameter order of `answer_question`.
    button.click(answer_question, inputs=[uploaded_docs, query, score, api_key], outputs=outp, show_progress=True)


demo.launch(share=True)
|
chunker.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Generator, List
|
2 |
+
|
3 |
+
|
4 |
+
def chunk_document(
    doc: str,
    desired_chunk_size: int,
    max_chunk_size: int
) -> Generator[str, None, None]:
    """Split ``doc`` into line-aligned chunks.

    Lines are accumulated (each terminated with ``'\\n'``) until the buffer
    reaches ``desired_chunk_size`` characters, at which point it is emitted.
    A buffer longer than ``max_chunk_size`` is emitted as several consecutive
    pieces of at most ``max_chunk_size`` characters, so no text is discarded.

    Args:
        doc: The document text.
        desired_chunk_size: Buffer length (in characters) that triggers
            emitting a chunk.
        max_chunk_size: Hard upper bound on the length of any yielded chunk;
            must be at least 1.

    Yields:
        The chunks, in document order.

    Raises:
        ValueError: If ``max_chunk_size`` is smaller than 1 (raised when the
            generator is first advanced).
    """
    if max_chunk_size < 1:
        raise ValueError("max_chunk_size must be a positive integer")

    def _capped(text: str) -> Generator[str, None, None]:
        # Slice instead of truncating: the part beyond the cap used to be
        # silently dropped and could never be retrieved.
        for start in range(0, len(text), max_chunk_size):
            yield text[start:start + max_chunk_size]

    chunk = ''
    for line in doc.splitlines():
        chunk += line + '\n'
        if len(chunk) >= desired_chunk_size:
            yield from _capped(chunk)
            chunk = ''
    # Whatever is left never reached the desired size; emit it as the tail.
    if chunk:
        yield from _capped(chunk)
|
17 |
+
|
18 |
+
|
19 |
+
def chunk_documents(
    docs: List[str],
    desired_chunk_size: int = 500,
    max_chunk_size: int = 3000
) -> List[str]:
    """Chunk every document and return all chunks as one flat list.

    Args:
        docs: The document texts.
        desired_chunk_size: Forwarded to ``chunk_document``.
        max_chunk_size: Forwarded to ``chunk_document``.

    Returns:
        The chunks of all documents, in document order.
    """
    return [
        piece
        for text in docs
        for piece in chunk_document(
            doc=text,
            desired_chunk_size=desired_chunk_size,
            max_chunk_size=max_chunk_size
        )
    ]
|
docs/A Boon or Bane for Students.txt
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
In this essay on technology, we are going to discuss what technology is, what are its uses, and also what technology can do? First of all, technology refers to the use of technical and scientific knowledge to create, monitor, and design machinery. Also, technology helps in making other goods that aid mankind.
|
2 |
+
|
3 |
+
Essay on Technology – A Boon or Bane?
|
4 |
+
Experts are debating on this topic for years. Also, the technology covered a long way to make human life easier but the negative aspect of it can’t be ignored. Over the years technological advancement has caused a severe rise in pollution. Also, pollution has become a major cause of many health issues. Besides, it has cut off people from society rather than connecting them. Above all, it has taken away many jobs from the workers class.
|
5 |
+
Familiarity between Technology and Science
|
6 |
+
As they are completely different fields but they are interdependent on each other. Also, it is due to science contribution we can create new innovation and build new technological tools. Apart from that, the research conducted in laboratories contributes a lot to the development of technologies. On the other hand, technology extends the agenda of science.
|
7 |
+
|
8 |
+
Vital Part of our Life
|
9 |
+
Regularly evolving technology has become an important part of our lives. Also, newer technologies are taking the market by storm and the people are getting used to them in no time. Above all, technological advancement has led to the growth and development of nations.
|
10 |
+
|
11 |
+
Negative Aspect of Technology
|
12 |
+
Although technology is a good thing, everything has two sides. Technology also has two sides one is good and the other is bad. Here are some negative aspects of technology that we are going to discuss.
|
13 |
+
Pollution
|
14 |
+
With new technology the industrialization increases which give birth to many pollutions like air, water, soil, and noise. Also, they cause many health-related issues in animals, birds, and human beings.
|
15 |
+
|
16 |
+
Exhaustion of Natural Resources
|
17 |
+
New technology requires new resources for which the balance is disturbed. Eventually, this will lead to over-exploitation of natural resources which ultimately disturbs the balance of nature.
|
18 |
+
|
19 |
+
Unemployment
|
20 |
+
A single machine can replace many workers. Also, machines can do work at a constant pace for several hours or days without stopping. Due to this, many workers lost their job which ultimately increases unemployment.
|
21 |
+
|
22 |
+
Types of Technology
|
23 |
+
Generally, we judge technology on the same scale but in reality, technology is divided into various types. This includes information technology, industrial technology, architectural technology, creative technology and many more. Let’s discuss these technologies in brief.
|
24 |
+
|
25 |
+
Industrial Technology
|
26 |
+
This technology organizes engineering and manufacturing technology for the manufacturing of machines. Also, this makes the production process easier and convenient.
|
27 |
+
|
28 |
+
Creative Technology
|
29 |
+
This process includes art, advertising, and product design which are made with the help of software. Also, it comprises of 3D printers, virtual reality, computer graphics, and other wearable technologies.
|
30 |
+
|
31 |
+
Information Technology
|
32 |
+
This technology involves the use of telecommunication and computer to send, receive and store information. Internet is the best example of Information technology.
|
33 |
+
Today, everything we use in our daily life is a gift of technology and without which we cannot imagine our lives. Also, we cannot refuse the facts that it has caused severe damage to our surroundings.
|
docs/Black Friday.txt
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Every year in November, people look for bargains on Black Friday. But did you know that the same day is also Buy Nothing Day?
|
2 |
+
What is Black Friday?
|
3 |
+
Black Friday is the day after the American holiday of Thanksgiving, which is celebrated on the fourth Thursday of November. Because it is a holiday in the United States, it has long been a popular day for consumers to start shopping for Christmas. Over the last 20 years big retailers have started to offer discounts and bargains on this day, and it has become more and more popular. Last year, people in the USA spent an estimated $54.7 billion between Black Friday and Cyber Monday (the Monday after Thanksgiving, when people often buy more online). The idea of Black Friday has also spread around the world. For example, in 2017, people in the UK spent the equivalent of $10.3 billion, in Germany $7.6 billion and in France $6.2 billion.
|
4 |
+
|
5 |
+
Is Black Friday out of control?
|
6 |
+
Many of us love to get a bargain, but some feel that events like Black Friday encourage people to buy things that they don’t really need and can’t afford. Many people seem to completely lose control of both their spending and their tempers. It is easy to find video online of customers physically fighting each other over bargains. It is also argued that Black Friday is bad for small shopkeepers, who cannot afford to offer the kinds of price cuts that the big companies can.
|
7 |
+
|
8 |
+
What’s the alternative to Black Friday?
|
9 |
+
Instead of taking the opportunity to buy as much as possible on Black Friday, you could do the opposite and buy absolutely nothing. Since 1997, Buy Nothing Day has been held on the same day as Black Friday. The rules are simple. Just don’t buy anything at all for 24 hours. Many people are surprised how difficult this actually is. The aim is to make people think more about their spending and to make better decisions about what they buy and where they buy it from.
|
10 |
+
|
11 |
+
Ethical spending
|
12 |
+
As well as spending less and not buying unnecessary items, Buy Nothing Day aims to raise awareness of how to be a more ethical consumer. For example, you can avoid buying ‘fast fashion’, that is, very cheap clothes that are worn a few times before being thrown away. Or you could decide not to automatically upgrade your mobile at the end of a contract. These kinds of decisions can help to protect the environment as well as saving you money.
|
13 |
+
|
14 |
+
What else can you do on Buy Nothing Day?
|
15 |
+
Some people carry out protests at shopping centres. Others avoid the shops completely and go for a walk in nature instead. Another alternative, the Buy Nothing Coat Exchange, is an idea which is spreading. People donate winter coats throughout November and anyone who needs one can come and take one on Buy Nothing Day.
|
docs/Computer and its Uses for School Students and Children.txt
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
In this essay on computer, we are going to discuss some useful things about computers. The modern-day computer has become an important part of our daily life. Also, their usage has increased much fold during the last decade. Nowadays, they use the computer in every office whether private or government. Mankind is using computers for over many decades now. Also, they are used in many fields like agriculture, designing, machinery making, defense and many more. Above all, they have revolutionized the whole world.
|
2 |
+
|
3 |
+
History of Computers
|
4 |
+
It is very difficult to find the exact origin of computers. But according to some experts, computers existed at the time of World War II. Also, at that time they were used for keeping data. But, it was for only government use and not for public use. Above all, in the beginning, the computer was a very large and heavy machine.
|
5 |
+
|
6 |
+
Working of a Computer
|
7 |
+
The computer runs on a three-step cycle namely input, process, and output. Also, the computer follows this cycle in every process it was asked to do. In simple words, the process can be explained in this way. The data which we feed into the computer is input, the work CPU do is process and the result which the computer give is output.
|
8 |
+
|
9 |
+
Components and Types of Computer
|
10 |
+
The simple computer basically consists of CPU, monitor, mouse, and keyboard. Also, there are hundreds of other computer parts that can be attached to it. These other parts include a printer, laser pen, scanner, etc.
|
11 |
+
|
12 |
+
The computer is categorized into many different types like supercomputers, mainframes, personal computers (desktop), PDAs, laptop, etc. The mobile phone is also a type of computer because it fulfills all the criteria of being a computer.
|
13 |
+
Uses of Computer in Various Fields
|
14 |
+
As the usage of computer increased it became a necessity for almost every field to use computers for their operations. Also, they have made working and sorting things easier. Below we are mentioning some of the important fields that use a computer in their daily operation.
|
15 |
+
|
16 |
+
Medical Field
|
17 |
+
They use computers to diagnose diseases, run tests and for finding the cure for deadly diseases. Also, they are able to find a cure for many diseases because of computers.
|
18 |
+
|
19 |
+
Research
|
20 |
+
Whether it’s scientific research, space research or any social research computers help in all of them. Also, due to them, we are able to keep a check on the environment, space, and society. Space research helped us to explore the galaxies. While scientific research has helped us to locate resources and various other useful resources from the earth.
|
21 |
+
|
22 |
+
Defense
|
23 |
+
For any country, its defence is most important for the safety and security of its people. Also, computers in this field help the country’s security agencies to detect a threat which can be harmful in the future. Above all, the defense industry uses them to keep surveillance on our enemy.
|
24 |
+
|
25 |
+
Threats from a Computer
|
26 |
+
Computers have become a necessity also, they have become a threat too. This is due to hackers who steal your private data and leak them on internet. Also, anyone can access this data. Apart from that, there are other threats like viruses, spams, bug and many other problems.
|
27 |
+
|
28 |
+
The computer is a very important machine that has become a useful part of our life. Also, the computers have twin-faces on one side it’s a boon and on the other side, it’s a bane. Its uses completely depend upon you. Apart from that, a day in the future will come when human civilization won’t be able to survive without computers as we depend on them too much. Till now it is a great discovery of mankind that has helped in saving thousands and millions of lives.
|
docs/How LED lights can save sea turtles’ lives.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Sea turtles are amazing animals that have lived in our oceans for millions of years. However, today, they face many dangers. One of these comes from non-natural light. When baby sea turtles, also called hatchlings, come out of their eggs, they need to find the ocean quickly. They usually do this by following the natural light of the Moon and stars reflecting off the water. But in many places, artificial lights – from streets, buildings, and homes – are much brighter than the Moon. These lights can confuse baby sea turtles and make it hard for them to find their way to the ocean. This causes them to lose their way and head toward the land instead of the sea. When this happens, the baby turtles can get lost, dehydrated, or even be eaten by other animals.
|
2 |
+
|
3 |
+
Although artificial light is usually a problem for sea turtles, we don’t have to live in the dark to protect them. Research shows that using special artificial lights, placed low to the ground and slightly covered so they can’t be seen from the beach, reduces the chances of sea turtles getting confused. These turtle-friendly lights are also better for people as they can improve visibility while driving by reducing the shine on the car windows. If you live near the coast or are visiting a beach where sea turtles live, you can help. Use lights that are not bright and low to the ground. Close curtains at night to reduce the amount of light coming from inside buildings.
|
4 |
+
|
5 |
+
Sea turtles have another enemy. Every year, hundreds of thousands of these sea animals are caught by large fishing boats in their fishing nets by accident – this means six to eight turtles daily for each boat in Mexico alone. Surprisingly, in this case, artificial lights can be helpful for sea turtles.
|
6 |
+
|
7 |
+
Studies show that turtles use their sight to find food, but when swimming underwater at night, it’s hard to see the fishing net. So, different organisations have developed fishing nets with LED lights. By adding lights to fishing nets, scientists have found a way to prevent turtles and other animals from getting caught in the nets, reducing the number of unwanted catches by 60% to 95% without lowering the amount of fish caught.
|
8 |
+
|
9 |
+
Using LED lights on fishing nets doesn’t just help sea turtles. When sea animals get stuck in the net by mistake, they might damage it. It costs time and money to remove sea turtles from their nets and to fix or replace the broken nets. In addition, LED lights are energy-efficient and last a long time. This new design of fishing nets reduces costs, making it a less expensive option.
|
docs/Modern-day slavery.txt
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
December 2 is the United Nation's International Day for the Abolition of Slavery. Did you know that slavery still exists today and is probably much closer to you than you realise?
|
2 |
+
Blood and Earth
|
3 |
+
In his book Blood and Earth, Kevin Bales speaks with Ibrahim, a 23-year-old slave who has worked in a gold mine since he was nine. He is dying. His lungs are filled with liquid caused by the dust and bacteria in the mine. As their conversation ends, Ibrahim turns to Kevin Bales and says, 'I want to be remembered. When my story is written and your book is ready, will you send me a copy? I want to show it to others, to show them that I am not completely useless. I just want to show that something good can come out of my life.'
|
4 |
+
|
5 |
+
So what's the connection to you? As you read this article, you are probably using a smartphone, tablet, or laptop. Each device requires minerals – including gold. Perhaps the gold in your electronic device was mined by slaves.
|
6 |
+
Slavery today
|
7 |
+
According to the Global Slavery Index 2018, over 40 million people are victims of modern slavery, and of these, 15 million are in forced marriage. Slavery involves violence, physical or psychological, and control – often in the form of threats in order to generate profit. To quote Kevin Bales, 'Slavery is when one person controls another, uses violence to maintain that control and exploits them economically.' This violence may be physical and/or psychological, and the control may be verbal threats – but at the heart of slavery is exploitation and 'ownership' of another human being for profit. Forms of modern slavery include forced labour, human trafficking, commercial sexual exploitation, domestic servitude and forced marriage.
|
8 |
+
|
9 |
+
You might be surprised to see forced marriage included above. Sadly, forced marriage involves the same lack of choice, power imbalance, coercion and labour exploitation as other forms of slavery. This also includes forced child marriage – usually of girls, of 17 years or younger.
|
10 |
+
|
11 |
+
Slavery behind closed doors
|
12 |
+
Another form of slavery is domestic servitude. Across the globe, domestic workers, mostly women, migrate abroad to support their families back home. Employment agents in their country of origin promise a generous salary and good working conditions with a caring host family. This, however, may be far from reality. Domestic workers are sometimes forced to work long hours and their passports and mobile phones are taken away. In extreme cases, behind closed doors of private homes, they are locked up, starved, deprived of sleep and often physically and sexually abused. They are trapped, scared and unfamiliar with their new surroundings. Domestic servitude happens globally, including in the UK.
|
13 |
+
|
14 |
+
The power of consumer choice
|
15 |
+
Every item we buy has a back story. From electronics to textiles, from handmade carpets to coffee, tea and chocolate, each of these products might include child or adult slavery. Consider a product as innocent as chocolate. While the chocolate bar itself may have been produced in your country, the cocoa in the chocolate probably came from West Africa, where 60 per cent of the world's cocoa is produced. As you read this, thousands of children and adults live in slave-like conditions on cocoa farms. Unknowingly, your purchase might support slavery. However, consumer demand for ethically-sourced products and services can send a powerful message to producers. Imagine if we all refused to purchase goods that have a back story of slavery. Company sales, and therefore profits, would fall. Look around at items in your home and workplace and ask yourself the simple question, 'Where did this come from and who made it?'
|
16 |
+
|
17 |
+
Why didn't I learn about modern slavery at school?
|
18 |
+
Did you ever learn about modern slavery at school? History lessons may have included the horrific practice of slavery, however, it was probably considered something that was very much 'in the past'. But slavery still exists and it is the everyday reality for millions of people. It takes brave educators to raise awareness of the difficult, upsetting and invisible reality of modern slavery.
|
19 |
+
|
20 |
+
The good news is that thousands of individuals and anti-slavery organisations are taking action. One such organisation is The NO Project, which focuses specifically on the education of youth and young adults. 'Youth are the next generation of corporate leaders, policy makers and consumers,' says the founder of The NO Project. 'How we choose to spend our money says a lot about who we are. So, the question is – who are we? And remember, another time, in another place, that enslaved human being could be you.'
|
docs/Underwater search for lost love.txt
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Yasuo Takamatsu met Yuko in 1988. Yasuo was a soldier and Yuko worked at a bank in Onagawa, Japan. They quickly fell in love. Yuko was gentle, modest, and enjoyed classical music and painting. On Friday, March 11, 2011, Yasuo drove Yuko to the bank. Later that day, a powerful earthquake hit the city, followed by a tsunami warning.
|
2 |
+
|
3 |
+
Yasuo was at a hospital with his mother when the earthquake happened. The main roads to Onagawa were blocked. He was driving back on the small back roads when Yuko texted, “Are you O.K.? I want to go home.” The tsunami reached Onagawa at 3:20 p.m., destroying buildings and killing people. The next morning, soldiers arrived to search for bodies. Yasuo searched for Yuko every day from morning till evening until June when he started a new job. Then he searched on weekends, always hoping not to find Yuko’s body.
|
4 |
+
|
5 |
+
A month after the tsunami, Yuko’s pink flip phone was found in the bank’s parking lot. An unsent text from 3:25 p.m. read, “So much tsunami.” Yasuo knew she had been alive until then. Other bank employees’ bodies were found later. One was found six weeks after the tsunami, another in September 2011, but Yasuo still searched for Yuko.
|
6 |
+
|
7 |
+
By September 2013, after two and a half years of searching on land, Yasuo decided to search the sea. He contacted a dive shop to learn how to dive. The instructor, Masayoshi Takahashi, organised dives to clean tsunami rubbish. Yasuo believed Takahashi could help find Yuko. Yasuo told him, “I want to learn to dive to find my wife.” On his first dive, the water was freezing. Yasuo was scared. He could have hurt himself or got caught by a rope, but that didn’t worry him. The water wasn’t clear, and that was the real danger. Takahashi told him not to touch the bottom to avoid moving the sand.
|
8 |
+
|
9 |
+
One day, Yasuo visited Masaaki Narita, who had lost his daughter, Emi, in the tsunami. Emi worked with Yuko at the bank. The women had gone to the bank’s roof but were swept away by the massive wave. Yasuo felt sorry for Narita and offered to look for Emi too. But Narita decided to dive himself. In February 2014, Yasuo introduced Narita to Takahashi.
|
10 |
+
|
11 |
+
In January 2016, Narita prepared for a dive. His wife, Hiromi, watched because she worried about him. The ocean was dangerous, and she didn’t want to lose him too. Narita said, “If I die, throw my ashes in the sea.” He dove, and after 35 minutes, resurfaced safely. Hiromi walked to her car and drove off. It was time to deliver rice balls and deep-fried chicken.
|
12 |
+
|
13 |
+
Despite all these efforts, Yasuo continued his search for Yuko, holding on to hope.
|
rag.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from groq import Client
|
2 |
+
from retriever import Retriever
|
3 |
+
from typing import List
|
4 |
+
import glob
|
5 |
+
|
6 |
+
# Directory name for the source documents. Nothing in this module reads it
# directly; presumably callers pass it to read_docs() — confirm in app.py.
dir_path = 'docs'
|
7 |
+
|
8 |
+
class QuestionAnsweringBot:
    """Answer questions with a Groq chat model grounded in retrieved text.

    A ``Retriever`` selects the document chunks most relevant to the
    question; those chunks are sent to the chat model as context together
    with the system prompt in ``PROMPT``.
    """

    # System prompt sent verbatim as the first chat message.
    PROMPT = """
You are a helpful assistant that answers the questions.

Rules:
- Reply with answer only and nothing but answer.
- Say 'I don`t know' if you don`t know the answer.
- Use the provided context.
"""

    def __init__(self, docs: List[str], score: int, api_key: str) -> None:
        """Build the retriever over ``docs`` and create the Groq client.

        Args:
            docs: Raw document texts to index.
            score: Scoring mode forwarded to ``Retriever``.
            api_key: Groq API key used to create the client.
        """
        self.retriever = Retriever(docs=docs, score=score)
        self.client = Client(api_key=api_key)

    def answer_question(self, question: str) -> str:
        """Return the model's answer to ``question``.

        Args:
            question: The user's question; also used as the retrieval query.

        Returns:
            The content of the first completion choice.
        """
        # Pass the configured scoring mode explicitly: without it get_docs()
        # falls back to its own default and the mode chosen at construction
        # time has no effect.
        chunks = self.retriever.get_docs(
            query=question, score=self.retriever.score
        )
        # Join the chunks into plain text; interpolating the list itself
        # would put a Python repr (brackets, quotes, escaped newlines) in
        # the prompt.
        context = "\n\n".join(chunks)
        messages = [
            {
                "role": "system",
                "content": self.PROMPT
            },
            {
                "role": "user",
                "content": f"Context: {context}\nQuestion: {question}"
            }
        ]
        chat_completion = self.client.chat.completions.create(
            messages=messages,
            model="llama3-70b-8192"
        )

        return chat_completion.choices[0].message.content
|
41 |
+
|
42 |
+
|
43 |
+
def read_docs(dir_path) -> List[str]:
    """Read every ``*.txt`` file directly inside ``dir_path``.

    Args:
        dir_path: Directory to scan (not recursive).

    Returns:
        The UTF-8 decoded contents of each file, ordered by file path.
        An empty list if no ``.txt`` files are found.
    """
    docs = []
    # glob.glob() yields paths in arbitrary, filesystem-dependent order;
    # sorting keeps the document order stable between runs and machines.
    for path in sorted(glob.glob(f'{dir_path}/*.txt')):
        with open(path, 'r', encoding='utf-8') as file:
            docs.append(file.read())
    return docs
|
requirements.txt
ADDED
Binary file (2.48 kB). View file
|
|
retriever.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import numpy as np
|
3 |
+
|
4 |
+
from typing import List
|
5 |
+
from rank_bm25 import BM25Okapi
|
6 |
+
from sentence_transformers import SentenceTransformer, CrossEncoder
|
7 |
+
|
8 |
+
from chunker import chunk_documents
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
class Retriever:
|
13 |
+
def __init__(self, docs: List[str], score: int) -> None:
|
14 |
+
self.docs = chunk_documents(docs=docs)
|
15 |
+
self.score = score
|
16 |
+
tokenized_docs = [doc.lower().split(" ") for doc in self.docs]
|
17 |
+
self.bm25 = BM25Okapi(tokenized_docs)
|
18 |
+
self.sbert = SentenceTransformer(
|
19 |
+
'sentence-transformers/all-distilroberta-v1'
|
20 |
+
)
|
21 |
+
self.doc_embeddings = self.sbert.encode(
|
22 |
+
self.docs, show_progress_bar=True
|
23 |
+
)
|
24 |
+
self.cross_encoder = CrossEncoder("cross-encoder/stsb-roberta-base")
|
25 |
+
|
26 |
+
|
27 |
+
def get_docs(self, query: str, n: int = 5, score: int = 2) -> List[str]:
|
28 |
+
match score:
|
29 |
+
case 0:
|
30 |
+
bm25_scores = self._get_bm25_scores(query=query)
|
31 |
+
sorted_indices = torch.Tensor.tolist(
|
32 |
+
np.argsort(bm25_scores)
|
33 |
+
)[::-1]
|
34 |
+
case 1:
|
35 |
+
semantic_scores = self._get_semantic_scores(query=query)
|
36 |
+
sorted_indices = torch.Tensor.tolist(
|
37 |
+
np.argsort(semantic_scores)
|
38 |
+
)[::-1]
|
39 |
+
case 2:
|
40 |
+
bm25_scores = self._get_bm25_scores(query=query)
|
41 |
+
semantic_scores = self._get_semantic_scores(query=query)
|
42 |
+
scores = torch.tensor(0.3 * bm25_scores) + 0.7 * semantic_scores
|
43 |
+
sorted_indices = torch.Tensor.tolist(np.argsort(scores))[::-1]
|
44 |
+
|
45 |
+
preselected_docs = [self.docs[i] for i in sorted_indices][:n]
|
46 |
+
result = self.rerank(query=query, docs=preselected_docs)
|
47 |
+
|
48 |
+
return result
|
49 |
+
|
50 |
+
def _get_bm25_scores(self, query: str) -> np.ndarray[float]:
|
51 |
+
tokenized_query = query.lower().split(" ")
|
52 |
+
bm25_scores = self.bm25.get_scores(tokenized_query)
|
53 |
+
|
54 |
+
return bm25_scores
|
55 |
+
|
56 |
+
def _get_semantic_scores(self, query: str) -> torch.Tensor:
|
57 |
+
query_embeddings = self.sbert.encode(query)
|
58 |
+
semantic_scores = self.sbert.similarity(
|
59 |
+
query_embeddings, self.doc_embeddings
|
60 |
+
)
|
61 |
+
|
62 |
+
return semantic_scores[0]
|
63 |
+
|
64 |
+
def rerank(self, query: str, docs: List[str]) -> List[str]:
|
65 |
+
pairs = [(query, doc) for doc in docs]
|
66 |
+
rerank_scores = self.cross_encoder.predict(pairs)
|
67 |
+
reranked_docs = [doc for _, doc in sorted(zip(rerank_scores, docs), reverse=True)]
|
68 |
+
|
69 |
+
return reranked_docs
|