Ono-Enzo committed · verified
Commit 9577e45 · 1 Parent(s): 9175d1c

Upload 9 files

Files changed (9)
  1. .gitattributes +35 -35
  2. .gitignore +160 -0
  3. .streamlit/config.toml +5 -0
  4. README.md +9 -12
  5. app.py +133 -57
  6. assets/logo.png +0 -0
  7. assets/qasports-logo.png +0 -0
  8. requirements.txt +11 -3
  9. utils.py +28 -0
.gitattributes CHANGED
@@ -1,35 +1,35 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,160 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
.streamlit/config.toml ADDED
@@ -0,0 +1,5 @@
+ [theme]
+ primaryColor="#afbac2"
+ backgroundColor="#3d4850"
+ secondaryBackgroundColor="#081310"
+ textColor="#f5eff8"
README.md CHANGED
@@ -1,12 +1,9 @@
- ---
- title: Test Space
- emoji: 📈
- colorFrom: gray
- colorTo: gray
- sdk: streamlit
- sdk_version: 1.26.0
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ This repository contains the files for a question answering website built on the sports dataset "QASports". To run it on your machine, follow these steps:
+
+ 1. Open Git Bash and clone the repository.
+
+ 2. Still in the terminal, install the required libraries with "pip install -r requirements.txt".
+
+ 3. Enter the repository folder and run "streamlit run app.py".
+
+ After these steps, the website will be running on your machine.
app.py CHANGED
@@ -1,57 +1,133 @@
- import streamlit as st
- from PIL import Image
- from haystack.document_stores import InMemoryDocumentStore
- from haystack.utils import fetch_archive_from_http
- import os
- from haystack.pipelines.standard_pipelines import TextIndexingPipeline
- from haystack.nodes import BM25Retriever
- from transformers import pipeline
-
- # Create the DocumentStore object
- document_store = InMemoryDocumentStore(use_bm25=True)
-
- # Fetch the required Game of Thrones data
- doc_dir = "data/build_your_first_question_answering_system"
- fetch_archive_from_http(
-     url="https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt1.zip",
-     output_dir=doc_dir,
- )
-
- files_to_index = [doc_dir + "/" + f for f in os.listdir(doc_dir)]
- indexing_pipeline = TextIndexingPipeline(document_store)
- indexing_pipeline.run_batch(file_paths=files_to_index)
-
- retriever = BM25Retriever(document_store=document_store)
-
- # Use a pipeline from the Transformers library
- pipe = pipeline("question-answering", model="deepset/roberta-base-squad2", tokenizer="deepset/roberta-base-squad2")
-
- image = Image.open('comida.jpg')
- st.image(image)
- st.text("""QASports is a question answering system, the first large multi-domain
- question answering dataset about sports for open questions""")
-
- st.subheader('QASports', divider='rainbow')
-
- user_input = None
- if not user_input:
-     user_input = st.text_input("Please type a question.")
-
- if user_input:
-     res = retriever.retrieve(user_input, top_k=5)  # Retrieve the top 5 relevant documents
-     if res:
-         st.write(f"Found {len(res)} relevant documents.")
-         for document in res:
-             prediction = pipe(question=user_input, context=document.content)
-             context = document.content
-             confidence = prediction["score"]
-             answer = prediction["answer"]
-             st.write("Question:", user_input)
-             st.write("Answer:", answer)
-             st.write("Confidence:", confidence)
-             st.write("Context:", context)
-             st.write("-" * 50)
-
-     if st.button('Search Answer'):
-         st.write(prediction["answer"])
-
+ import streamlit as st
+ from datasets import load_dataset
+ from haystack import Pipeline
+ from haystack.components.readers import ExtractiveReader
+ from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
+ from haystack.document_stores.in_memory import InMemoryDocumentStore
+
+ from utils import get_unique_docs
+
+
+ # Load the dataset
+ @st.cache_data(show_spinner=False)
+ def load_documents():
+     """
+     Load the documents from the dataset considering only unique documents.
+     Returns:
+     - documents: list of dictionaries with the documents.
+     """
+     unique_docs = set()
+     dataset_name = "PedroCJardim/QASports"
+     dataset_split = "basketball"
+     st.caption(f'Fetching "{dataset_name}" dataset')
+     # build the dataset
+     dataset = load_dataset(dataset_name, dataset_split)
+     docs_validation = get_unique_docs(dataset["validation"], unique_docs)
+     docs_train = get_unique_docs(dataset["train"], unique_docs)
+     docs_test = get_unique_docs(dataset["test"], unique_docs)
+     documents = docs_validation + docs_train + docs_test
+     return documents
+
+
+ @st.cache_resource(show_spinner=False)
+ def get_document_store(documents):
+     """
+     Index the files in the document store.
+     Args:
+     - files: list of dictionaries with the documents.
+     """
+     # Create in memory database
+     st.caption(f"Building the Document Store")
+     document_store = InMemoryDocumentStore()
+     document_store.write_documents(documents=documents)
+     return document_store
+
+
+ @st.cache_resource(show_spinner=False)
+ def get_question_pipeline(_doc_store):
+     """
+     Create the pipeline with the retriever and reader components.
+     Args:
+     - doc_store: instance of the document store.
+     Returns:
+     - pipe: instance of the pipeline.
+     """
+     st.caption(f"Building the Question Answering pipeline")
+     # Create the retriever and reader
+     retriever = InMemoryBM25Retriever(document_store=_doc_store)
+     reader = ExtractiveReader(model="deepset/roberta-base-squad2")
+     reader.warm_up()
+     # Create the pipeline
+     pipe = Pipeline()
+     pipe.add_component(instance=retriever, name="retriever")
+     pipe.add_component(instance=reader, name="reader")
+     pipe.connect("retriever.documents", "reader.documents")
+     return pipe
+
+
+ def search(pipeline, question: str):
+     """
+     Search for the answer to a question in the documents.
+     Args:
+     - pipeline: instance of the pipeline.
+     - question: string with the question.
+     Returns:
+     - answer: dictionary with the answer.
+     """
+     # Get the answers
+     top_k = 3
+     answer = pipeline.run(
+         data={
+             "retriever": {"query": question, "top_k": 10},
+             "reader": {"query": question, "top_k": top_k},
+         }
+     )
+     max_k = min(top_k, len(answer["reader"]["answers"]))
+     return answer["reader"]["answers"][0:max_k]
+
+
+ # Streamlit interface
+ _, centering_column, _ = st.columns(3)
+ with centering_column:
+     st.image("assets/qasports-logo.png", use_column_width=True)
+
+ # Loading status
+ with st.status(
+     "Downloading dataset...", expanded=st.session_state.get("expanded", True)
+ ) as status:
+     documents = load_documents()
+     status.update(label="Indexing documents...")
+     doc_store = get_document_store(documents)
+     status.update(label="Creating pipeline...")
+     pipe = get_question_pipeline(doc_store)
+     status.update(
+         label="Download and indexing complete!", state="complete", expanded=False
+     )
+     st.session_state["expanded"] = False
+
+ st.subheader("🔎 Basketball", divider="rainbow")
+ st.caption(
+     """This website presents a collection of documents from the dataset named "QASports", the first large sports question answering dataset for open questions. QASports contains real data of players, teams and matches from the sports soccer, basketball and American football. It counts over 1.5 million questions and answers about 54k preprocessed, cleaned and organized documents from Wikipedia-like sources."""
+ )
+
+ if user_query := st.text_input(
+     label="Ask a question about Basketball! 🏀",
+     placeholder="How many field goals did Kobe Bryant score?",
+ ):
+     # Get the answers
+     with st.spinner("Waiting"):
+         try:
+             answer = search(pipe, user_query)
+             for idx, ans in enumerate(answer):
+                 st.info(
+                     f"""
+                     Answer {idx+1}: "{ans.data}" | Score: {ans.score:0.4f}
+                     Document: "{ans.document.meta["title"]}"
+                     URL: {ans.document.meta["url"]}
+                     """
+                 )
+                 with st.expander("See details", expanded=False):
+                     st.write(ans)
+                 st.divider()
+         except Exception as e:
+             st.error("We do not have an answer for your question")
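
The retriever–reader wiring committed above can also be exercised outside Streamlit. Below is a minimal sketch using the same Haystack 2.x components on two toy documents; the document texts, metadata values, and the sample question are invented for illustration only and are not part of this commit.

```python
from haystack import Document, Pipeline
from haystack.components.readers import ExtractiveReader
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack.document_stores.in_memory import InMemoryDocumentStore

# Index two toy documents (stand-ins for the QASports contexts)
store = InMemoryDocumentStore()
store.write_documents([
    Document(content="Kobe Bryant played 20 seasons for the Los Angeles Lakers.",
             meta={"title": "Kobe Bryant", "url": "https://example.org/kobe"}),
    Document(content="The Boston Celtics have won 17 NBA championships.",
             meta={"title": "Boston Celtics", "url": "https://example.org/celtics"}),
])

# Same retriever/reader wiring as in get_question_pipeline()
retriever = InMemoryBM25Retriever(document_store=store)
reader = ExtractiveReader(model="deepset/roberta-base-squad2")
reader.warm_up()

pipe = Pipeline()
pipe.add_component(instance=retriever, name="retriever")
pipe.add_component(instance=reader, name="reader")
pipe.connect("retriever.documents", "reader.documents")

# Query the pipeline the same way search() does and print the extracted answers
question = "How many seasons did Kobe Bryant play?"
result = pipe.run(data={
    "retriever": {"query": question, "top_k": 10},
    "reader": {"query": question, "top_k": 3},
})
for ans in result["reader"]["answers"]:
    doc_title = ans.document.meta["title"] if ans.document else "no supporting document"
    print(f'{ans.data} (score {ans.score:0.4f}) from "{doc_title}"')
```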
assets/logo.png ADDED
assets/qasports-logo.png ADDED
requirements.txt CHANGED
@@ -1,3 +1,11 @@
- torch
- transformers
- farm-haystack==1.6.0
+ # main
+ # streamlit  # commented out to avoid installation issues
+
+ # HuggingFace
+ datasets==2.18.0
+ # Question Answering
+ haystack-ai==2.0.1
+ accelerate==0.29.2
+ sentence-transformers==2.7.0
+ # Extra
+ mmh3==4.1.0
utils.py ADDED
@@ -0,0 +1,28 @@
+ """This module contains utility functions for the project"""
+
+ import mmh3
+ from haystack import Document
+
+
+ def get_unique_docs(dataset, unique_docs: set):
+     """Get unique documents from dataset
+     Args:
+         dataset: list of dictionaries
+     Returns:
+         docs: list of haystack.Document
+     """
+     docs = list()
+     for doc in dataset:
+         if doc["context"] is not None and doc["context_id"] not in unique_docs:
+             unique_docs.add(doc["context_id"])
+             document = Document(
+                 content=doc["context"],
+                 meta={
+                     "title": doc["context_title"],
+                     "context_id": doc["context_id"],
+                     "url": doc["url"],
+                     "source": "QASports",
+                 },
+             )
+             docs.append(document)
+     return docs
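
As a usage note, this is how get_unique_docs is driven from load_documents in app.py; the sketch below repeats that pattern as a standalone script. The split names and the dataset fields (context, context_id, context_title, url) come from the code in this commit, and the final print is only illustrative.

```python
from datasets import load_dataset

from utils import get_unique_docs

# Deduplicate contexts across the three QASports splits, as app.py does
dataset = load_dataset("PedroCJardim/QASports", "basketball")
seen_ids = set()
documents = []
for split in ("validation", "train", "test"):
    documents += get_unique_docs(dataset[split], seen_ids)

print(f"{len(documents)} unique documents ready to be written to the document store")
```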