Upload 9 files

- .gitattributes +35 -35
- .gitignore +160 -0
- .streamlit/config.toml +5 -0
- README.md +9 -12
- app.py +133 -57
- assets/logo.png +0 -0
- assets/qasports-logo.png +0 -0
- requirements.txt +11 -3
- utils.py +28 -0
.gitattributes
CHANGED
@@ -1,35 +1,35 @@
All 35 lines are removed and re-added with identical text (presumably a whitespace or line-ending change); the resulting file:

*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
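A side note on what these patterns do: files matching them are routed to Git LFS instead of regular git storage. As a rough illustration only (not part of the commit, and fnmatch does not reproduce every gitattributes rule, e.g. "**"), shell-style matching in Python approximates how a filename is tested against a subset of the patterns:

import fnmatch

# Hypothetical subset of the LFS patterns above
lfs_patterns = ["*.safetensors", "*.parquet", "*.zip"]

for name in ["model.safetensors", "data/train.parquet", "app.py"]:
    routed = any(fnmatch.fnmatch(name, pat) for pat in lfs_patterns)
    print(name, "->", "LFS" if routed else "regular git")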
.gitignore
ADDED
@@ -0,0 +1,160 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# poetry
#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.streamlit/config.toml
ADDED
@@ -0,0 +1,5 @@
[theme]
primaryColor="#afbac2"
backgroundColor="#3d4850"
secondaryBackgroundColor="#081310"
textColor="#f5eff8"
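Streamlit picks this theme up automatically from .streamlit/config.toml at startup; app.py never reads it. A minimal sketch to sanity-check the values, assuming Python 3.11+ for the stdlib tomllib parser:

import tomllib

# Parse the [theme] section of the Space's Streamlit config
with open(".streamlit/config.toml", "rb") as f:
    theme = tomllib.load(f)["theme"]
print(theme["primaryColor"], theme["backgroundColor"], theme["textColor"])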
README.md
CHANGED
@@ -1,12 +1,9 @@
Removed (12 lines): the auto-generated Spaces README — a YAML front-matter block whose field lines did not survive extraction, its closing "---", and the line:

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

Added (9 lines):

  This repository presents a collection of files to create a question answering website using "QASports", a sports question answering dataset. To run it on your machine, follow these steps:

  First, open Git Bash and clone the repository.

  Second, still in the terminal, install the required libraries with the command "pip install -r requirements.txt".

  Third, enter the folder where the repository is and execute the command "streamlit run app.py".

  By following these steps you will be able to run this website on your machine.
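The three README steps can also be scripted. A minimal sketch, with the repository URL left as a placeholder since the README does not spell it out:

import subprocess

REPO_URL = "<repository-url>"  # placeholder; substitute the actual clone URL
subprocess.run(["git", "clone", REPO_URL, "qasports-website"], check=True)
subprocess.run(["pip", "install", "-r", "requirements.txt"], cwd="qasports-website", check=True)
subprocess.run(["streamlit", "run", "app.py"], cwd="qasports-website", check=True)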
app.py
CHANGED
@@ -1,57 +1,133 @@
Removed (57 lines): the previous version of app.py; only the start of its import block ("import streamlit as st" and several haystack imports) survives in the extraction.

Added (133 lines):

import streamlit as st
from datasets import load_dataset
from haystack import Pipeline
from haystack.components.readers import ExtractiveReader
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack.document_stores.in_memory import InMemoryDocumentStore

from utils import get_unique_docs


# Load the dataset
@st.cache_data(show_spinner=False)
def load_documents():
    """
    Load the documents from the dataset considering only unique documents.
    Returns:
    - documents: list of haystack.Document.
    """
    unique_docs = set()
    dataset_name = "PedroCJardim/QASports"
    dataset_split = "basketball"
    st.caption(f'Fetching "{dataset_name}" dataset')
    # Build the corpus from the validation, train and test splits
    dataset = load_dataset(dataset_name, dataset_split)
    docs_validation = get_unique_docs(dataset["validation"], unique_docs)
    docs_train = get_unique_docs(dataset["train"], unique_docs)
    docs_test = get_unique_docs(dataset["test"], unique_docs)
    documents = docs_validation + docs_train + docs_test
    return documents


@st.cache_resource(show_spinner=False)
def get_document_store(documents):
    """
    Index the documents in the document store.
    Args:
    - documents: list of haystack.Document to index.
    """
    # Create the in-memory database
    st.caption("Building the Document Store")
    document_store = InMemoryDocumentStore()
    document_store.write_documents(documents=documents)
    return document_store


@st.cache_resource(show_spinner=False)
def get_question_pipeline(_doc_store):
    """
    Create the pipeline with the retriever and reader components.
    Args:
    - _doc_store: instance of the document store.
    Returns:
    - pipe: instance of the pipeline.
    """
    st.caption("Building the Question Answering pipeline")
    # Create the retriever and reader
    retriever = InMemoryBM25Retriever(document_store=_doc_store)
    reader = ExtractiveReader(model="deepset/roberta-base-squad2")
    reader.warm_up()
    # Create the pipeline and wire the retriever's documents into the reader
    pipe = Pipeline()
    pipe.add_component(instance=retriever, name="retriever")
    pipe.add_component(instance=reader, name="reader")
    pipe.connect("retriever.documents", "reader.documents")
    return pipe


def search(pipeline, question: str):
    """
    Search for the answer to a question in the documents.
    Args:
    - pipeline: instance of the pipeline.
    - question: string with the question.
    Returns:
    - answers: list with the top extracted answers.
    """
    # Get the answers
    top_k = 3
    answer = pipeline.run(
        data={
            "retriever": {"query": question, "top_k": 10},
            "reader": {"query": question, "top_k": top_k},
        }
    )
    max_k = min(top_k, len(answer["reader"]["answers"]))
    return answer["reader"]["answers"][0:max_k]


# Streamlit interface
_, centering_column, _ = st.columns(3)
with centering_column:
    st.image("assets/qasports-logo.png", use_column_width=True)

# Loading status
with st.status(
    "Downloading dataset...", expanded=st.session_state.get("expanded", True)
) as status:
    documents = load_documents()
    status.update(label="Indexing documents...")
    doc_store = get_document_store(documents)
    status.update(label="Creating pipeline...")
    pipe = get_question_pipeline(doc_store)
    status.update(
        label="Download and indexing complete!", state="complete", expanded=False
    )
    st.session_state["expanded"] = False

st.subheader("🏀 Basketball", divider="rainbow")
st.caption(
    """This website presents a collection of documents from the dataset named "QASports", the first large sports question answering dataset for open questions. QASports contains real data of players, teams and matches from the sports soccer, basketball and American football. It counts over 1.5 million questions and answers about 54k preprocessed, cleaned and organized documents from Wikipedia-like sources."""
)

if user_query := st.text_input(
    label="Ask a question about Basketball! 🏀",
    placeholder="How many field goals did Kobe Bryant score?",
):
    # Get the answers
    with st.spinner("Waiting"):
        try:
            answer = search(pipe, user_query)
            for idx, ans in enumerate(answer):
                st.info(
                    f"""
                    Answer {idx+1}: "{ans.data}" | Score: {ans.score:0.4f}
                    Document: "{ans.document.meta["title"]}"
                    URL: {ans.document.meta["url"]}
                    """
                )
                with st.expander("See details", expanded=False):
                    st.write(ans)
                st.divider()
        except Exception:
            st.error("We do not have an answer for your question")
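The retriever-reader wiring above can be exercised outside Streamlit as well. A minimal sketch using the same Haystack 2.x calls as app.py, with two invented documents standing in for the QASports corpus:

from haystack import Document, Pipeline
from haystack.components.readers import ExtractiveReader
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack.document_stores.in_memory import InMemoryDocumentStore

# Index two toy documents (invented for this sketch)
store = InMemoryDocumentStore()
store.write_documents([
    Document(content="Kobe Bryant played 20 seasons for the Los Angeles Lakers.",
             meta={"title": "Kobe Bryant", "url": "https://example.org/kobe"}),
    Document(content="The Boston Celtics have won 17 NBA championships.",
             meta={"title": "Boston Celtics", "url": "https://example.org/celtics"}),
])

# Same components and wiring as get_question_pipeline() in app.py
retriever = InMemoryBM25Retriever(document_store=store)
reader = ExtractiveReader(model="deepset/roberta-base-squad2")
reader.warm_up()

pipe = Pipeline()
pipe.add_component(instance=retriever, name="retriever")
pipe.add_component(instance=reader, name="reader")
pipe.connect("retriever.documents", "reader.documents")

query = "How many seasons did Kobe Bryant play?"
result = pipe.run(data={
    "retriever": {"query": query, "top_k": 2},
    "reader": {"query": query, "top_k": 1},
})
for ans in result["reader"]["answers"]:
    print(ans.data, ans.score)  # extracted span and its confidence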
assets/logo.png
ADDED
(binary image file)
assets/qasports-logo.png
ADDED
(binary image file)
requirements.txt
CHANGED
@@ -1,3 +1,11 @@
Removed (3 lines): the previous contents; the lines are blank in the extraction and not recoverable.

Added (11 lines):

# main
# streamlit  # commented out to avoid a mis-installation

# HuggingFace
datasets==2.18.0
# Question Answering
haystack-ai==2.0.1
accelerate==0.29.2
sentence-transformers==2.7.0
# Extra
mmh3==4.1.0
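Because streamlit itself is deliberately unpinned (presumably because the Space image already provides it), a quick way to see which versions actually resolved in the environment is the stdlib importlib.metadata; a minimal sketch:

from importlib.metadata import PackageNotFoundError, version

for pkg in ("streamlit", "datasets", "haystack-ai", "accelerate",
            "sentence-transformers", "mmh3"):
    try:
        print(pkg, version(pkg))
    except PackageNotFoundError:
        print(pkg, "not installed")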
utils.py
ADDED
@@ -0,0 +1,28 @@
"""This module contains utility functions for the project"""

import mmh3
from haystack import Document


def get_unique_docs(dataset, unique_docs: set):
    """Get unique documents from dataset
    Args:
        dataset: list of dictionaries with the dataset rows
        unique_docs: set of context ids already seen (updated in place)
    Returns:
        docs: list of haystack.Document
    """
    docs = list()
    for doc in dataset:
        if doc["context"] is not None and doc["context_id"] not in unique_docs:
            unique_docs.add(doc["context_id"])
            document = Document(
                content=doc["context"],
                meta={
                    "title": doc["context_title"],
                    "context_id": doc["context_id"],
                    "url": doc["url"],
                    "source": "QASports",
                },
            )
            docs.append(document)
    return docs
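get_unique_docs can be tried without downloading QASports; a minimal sketch with invented rows shaped like the dataset columns used above (context, context_id, context_title, url):

from utils import get_unique_docs

rows = [
    {"context": "Michael Jordan won six NBA titles.", "context_id": "c1",
     "context_title": "Michael Jordan", "url": "https://example.org/mj"},
    # Same context_id again: deduplicated via the shared `unique_docs` set
    {"context": "Michael Jordan won six NBA titles.", "context_id": "c1",
     "context_title": "Michael Jordan", "url": "https://example.org/mj"},
    # Null context: filtered out
    {"context": None, "context_id": "c2",
     "context_title": "Empty", "url": "https://example.org/empty"},
]

seen = set()
docs = get_unique_docs(rows, seen)
print(len(docs), docs[0].meta["title"])  # 1 Michael Jordan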