Enhance audio processing and search functionality
- Add NLTK downloads for text processing
- Update text2speech import and pipeline configuration
- Improve audio file handling and path management
- Refactor search result processing and audio playback
- Update caching decorators and utility functions
- Modify requirements to include necessary dependencies
Signed-off-by: Unai Garay <[email protected]>
- app.py +6 -1
- core/pipelines.py +17 -10
- core/search_index.py +13 -5
- interface/components.py +27 -2
- interface/utils.py +12 -10
- requirements.txt +5 -4
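
Read together, the diffs below thread audio through the whole demo: `core/pipelines.py` synthesizes WAV files under `data/audio/`, `core/search_index.py` surfaces each file's path as `content_audio` on a match, and `interface/components.py` plays it back. A minimal sketch of the resulting call flow, using only the signatures visible in this diff (the query text is illustrative):

```python
# Sketch of the flow this commit wires up; the query text is made up.
from core.pipelines import keyword_search
from core.search_index import search

# keyword_search() returns (search_pipeline, index_pipeline) per the diff;
# audio_output=True attaches the DocumentToSpeech node to the retriever.
search_pipeline, index_pipeline = keyword_search(audio_output=True)

# search(queries, pipeline) formats the raw matches; audio hits carry a
# "content_audio" path under data/audio/ that the UI streams via st.audio.
results = search(["what is neural search?"], search_pipeline)
```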
app.py
CHANGED

```diff
@@ -8,11 +8,16 @@ st.set_page_config(
     menu_items={"About": "https://github.com/ugm2/neural-search-demo"},
 )
 
+import nltk
 from streamlit_option_menu import option_menu
-from interface.config import pages, session_state_variables
+
 from interface.components import component_select_pipeline
+from interface.config import pages, session_state_variables
 from interface.utils import load_audio_model
 
+nltk.download("punkt_tab")
+nltk.download("averaged_perceptron_tagger_eng")
+
 # Initialization of session state
 for key, value in session_state_variables.items():
     if key not in st.session_state:
```
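
The two `nltk.download` calls run at import time on every restart of the Space. An optional guard (not part of this commit) skips the network round-trip once the corpora are cached:

```python
# Optional hardening, not in the commit: download NLTK data only if missing.
import nltk

for resource, path in [
    ("punkt_tab", "tokenizers/punkt_tab"),
    ("averaged_perceptron_tagger_eng", "taggers/averaged_perceptron_tagger_eng"),
]:
    try:
        nltk.data.find(path)  # raises LookupError when the resource is absent
    except LookupError:
        nltk.download(resource)
```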
core/pipelines.py
CHANGED

```diff
@@ -2,17 +2,22 @@
 Haystack Pipelines
 """
 
+import os
 from pathlib import Path
+
 from haystack import Pipeline
 from haystack.document_stores import InMemoryDocumentStore
-from haystack.nodes.retriever import DensePassageRetriever, TfidfRetriever
 from haystack.nodes.preprocessor import PreProcessor
 from haystack.nodes.ranker import SentenceTransformersRanker
-from haystack.nodes.audio import DocumentToSpeech
-import os
+from haystack.nodes.retriever import DensePassageRetriever, TfidfRetriever
+from text2speech import DocumentToSpeech
 
 data_path = "data/"
+audio_path = os.path.join(data_path, "audio")
 os.makedirs(data_path, exist_ok=True)
+os.makedirs(audio_path, exist_ok=True)
+# Ensure proper permissions
+os.chmod(audio_path, 0o777)
 
 index = "documents"
 
@@ -59,7 +64,7 @@ def keyword_search(
     if audio_output:
         doc2speech = DocumentToSpeech(
             model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
-            generated_audio_dir=Path(data_path),
+            generated_audio_dir=Path(audio_path),
         )
         search_pipeline.add_node(
             doc2speech, name="DocumentToSpeech", inputs=["TfidfRetriever"]
@@ -114,12 +119,12 @@ def dense_passage_retrieval(
     )
 
     if audio_output:
-        doc2speech = DocumentToSpeech(
+        document_to_speech = DocumentToSpeech(
             model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
-            generated_audio_dir=Path(data_path),
+            generated_audio_dir=Path(audio_path),
         )
         search_pipeline.add_node(
-            doc2speech, name="DocumentToSpeech", inputs=["DPRRetriever"]
+            document_to_speech, name="DocumentToSpeech", inputs=["DPRRetriever"]
         )
 
     return search_pipeline, index_pipeline
@@ -155,10 +160,12 @@ def dense_passage_retrieval_ranker(
     search_pipeline.add_node(ranker, name="Ranker", inputs=["DPRRetriever"])
 
     if audio_output:
-        doc2speech = DocumentToSpeech(
+        document_to_speech = DocumentToSpeech(
             model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
-            generated_audio_dir=Path(data_path),
+            generated_audio_dir=Path(audio_path),
+        )
+        search_pipeline.add_node(
+            document_to_speech, name="DocumentToSpeech", inputs=["Ranker"]
         )
-        search_pipeline.add_node(doc2speech, name="DocumentToSpeech", inputs=["Ranker"])
 
     return search_pipeline, index_pipeline
```
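
All three pipeline builders now repeat the same `DocumentToSpeech` block, differing only in the upstream node name. A possible follow-up refactor (hypothetical, not in this commit) collapses the repetition:

```python
# Hypothetical helper, not in the commit: the three builders attach the TTS
# node identically except for the input node name.
from pathlib import Path

from text2speech import DocumentToSpeech


def add_audio_output(search_pipeline, input_node: str, audio_dir: str):
    """Attach a TTS node that voices documents emitted by `input_node`."""
    document_to_speech = DocumentToSpeech(
        model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
        generated_audio_dir=Path(audio_dir),
    )
    search_pipeline.add_node(
        document_to_speech, name="DocumentToSpeech", inputs=[input_node]
    )
    return search_pipeline
```

Each builder would then call, e.g., `add_audio_output(search_pipeline, "Ranker", audio_path)`.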
core/search_index.py
CHANGED

```diff
@@ -1,7 +1,8 @@
-from haystack.schema import Document
-from haystack.document_stores import BaseDocumentStore
 import uuid
 
+from haystack.document_stores import BaseDocumentStore
+from haystack.schema import Document
+
 
 def format_docs(documents):
     """Given a list of documents, format the documents and return the documents and doc ids."""
@@ -37,16 +38,23 @@ def search(queries, pipeline):
         for res in matches:
             if not score_is_empty:
                 score_is_empty = True if res.score is None else False
+
+            # Get the original text from content or meta
+            original_text = res.content
+            if hasattr(res, "meta") and "content_text" in res.meta:
+                original_text = res.meta["content_text"]
+
             match = {
-                "text": res.content,
+                "text": original_text,
                 "id": res.meta["id"],
                 "fragment_id": res.id,
                 "meta": res.meta,
             }
             if not score_is_empty:
                 match.update({"score": res.score})
-            if hasattr(res, "content_audio"):
-                match.update({"content_audio": res.content_audio})
+            if res.content_type == "audio":
+                # Add audio path from the content field
+                match.update({"content_audio": res.content})
             query_results.append(match)
         if not score_is_empty:
             query_results = sorted(
```
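
For reference, one formatted match from the loop above now looks like this (all values are illustrative; `score` appears only when the retriever scores results, `content_audio` only when `res.content_type` is `"audio"`):

```python
# Illustrative shape of a single formatted match; all values are made up.
match = {
    "text": "Neural search finds documents by meaning.",  # original text
    "id": "9c0a7d2e",                                     # res.meta["id"]
    "fragment_id": "5f3b1c88",                            # res.id
    "meta": {"id": "9c0a7d2e", "_split_id": 0},
    "score": 0.873,                                       # optional
    "content_audio": "data/audio/5f3b1c88.wav",           # optional, audio hits
}
```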
interface/components.py
CHANGED

```diff
@@ -25,16 +25,35 @@ def component_select_pipeline(container):
         index_pipe = pipeline_names.index(selected_pipeline)
         st.write("---")
         st.header("Pipeline Parameters")
+
+        # Process audio_output first to ensure top_k is set correctly
+        audio_output_value = False
         for parameter, value in pipeline_func_parameters[index_pipe].items():
-            if isinstance(value, str):
+            if parameter == "audio_output":
+                audio_output_value = st.checkbox(parameter, value)
+                pipeline_func_parameters[index_pipe][
+                    "audio_output"
+                ] = audio_output_value
+                if audio_output_value:
+                    pipeline_func_parameters[index_pipe]["top_k"] = 3
+                break
+
+        # Then process all other parameters
+        for parameter, value in pipeline_func_parameters[index_pipe].items():
+            if parameter == "audio_output":
+                continue
+            elif isinstance(value, str):
                 value = st.text_input(parameter, value)
             elif isinstance(value, bool):
                 value = st.checkbox(parameter, value)
             elif isinstance(value, int):
+                if parameter == "top_k" and audio_output_value:
+                    value = 3
                 value = int(st.number_input(parameter, value=value))
             elif isinstance(value, float):
                 value = float(st.number_input(parameter, value=value))
             pipeline_func_parameters[index_pipe][parameter] = value
+
         if (
             st.session_state["pipeline"] is None
             or st.session_state["pipeline"]["name"] != selected_pipeline
@@ -93,12 +112,18 @@ def component_show_search_result(container, results):
         st.markdown(f"### Match {idx+1}")
         st.markdown(f"**Text**: {document['text']}")
         st.markdown(f"**Document**: {document['id']}")
+        st.json(document)
         if "_split_id" in document["meta"]:
             st.markdown(f"**Document Chunk**: {document['meta']['_split_id']}")
         if "score" in document:
             st.markdown(f"**Score**: {document['score']:.3f}")
         if "content_audio" in document:
-            st.audio(document["content_audio"])
+            try:
+                with open(document["content_audio"], "rb") as audio_file:
+                    audio_bytes = audio_file.read()
+                st.audio(audio_bytes, format="audio/wav")
+            except Exception as e:
+                st.error(f"Error loading audio: {str(e)}")
         st.markdown("---")
 
 
```
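
Two details worth noting above: `st.json(document)` dumps the raw match next to the formatted fields, and playback reads the WAV into memory instead of handing `st.audio` a path, which avoids depending on Streamlit's static file serving for files under `data/audio/`. The broad `except Exception` could also be narrowed; a sketch (optional, not in the commit):

```python
# Optional refactor sketch, not in the commit: isolate playback and catch
# only file errors rather than every exception.
import streamlit as st


def play_audio(path: str) -> None:
    try:
        with open(path, "rb") as audio_file:
            st.audio(audio_file.read(), format="audio/wav")
    except OSError as e:
        st.error(f"Error loading audio: {e}")
```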
interface/utils.py
CHANGED

```diff
@@ -1,16 +1,18 @@
-from io import StringIO
 import os
 import shutil
-import core.pipelines as pipelines_functions
-from core.pipelines import data_path
-from core.audio import audio_to_text, load_model
 from inspect import getmembers, isfunction, signature
-from newspaper import Article
-from PyPDF2 import PdfFileReader
-import streamlit as st
+from io import StringIO
+
 import pandas as pd
 import pytesseract
+import streamlit as st
+from newspaper import Article
 from PIL import Image
+from PyPDF2 import PdfFileReader
+
+import core.pipelines as pipelines_functions
+from core.audio import audio_to_text, load_model
+from core.pipelines import data_path
 
 
 def get_pipelines():
@@ -35,7 +37,7 @@ def reset_vars_data():
    os.makedirs(data_path, exist_ok=True)
 
 
-@st.cache
+@st.cache_data
 def extract_text_from_url(url: str):
     article = Article(url)
     article.download()
@@ -44,7 +46,7 @@ def extract_text_from_url(url: str):
     return article.text
 
 
-@st.cache
+@st.cache_data
 def extract_text_from_file(file):
     # read text file
     if file.type == "text/plain":
@@ -110,6 +112,6 @@ def extract_text_from_file(file):
     return None
 
 
-@st.cache
+@st.cache_resource
 def load_audio_model():
     return load_model()
```
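
The decorator split follows Streamlit's replacement of the deprecated `st.cache`: `st.cache_data` for functions returning serializable data (the extracted text), `st.cache_resource` for functions returning shared, unpicklable objects (the Whisper model behind `load_audio_model`). A standalone illustration with hypothetical functions:

```python
# Standalone illustration of the two caching modes; functions are hypothetical.
import streamlit as st


@st.cache_data
def fetch_text(url: str) -> str:
    # Return value is serialized and copied per call site: right for plain data.
    return f"text extracted from {url}"


@st.cache_resource
def get_model():
    # Returned object is created once and shared across reruns and sessions:
    # right for models, clients, and other unpicklable resources.
    return object()  # stand-in for a loaded model
```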
requirements.txt
CHANGED

```diff
@@ -1,14 +1,15 @@
 streamlit==1.40.1
-
-farm-haystack==1.26.4
+farm-haystack[inference]==1.26.4
 black==24.8.0
 plotly==5.24.1
 newspaper3k==0.2.8
 PyPDF2==3.0.1
 pytesseract==0.3.13
 soundfile==0.13.1
-espnet==
+espnet==202304
 pydub==0.25.1
 espnet_model_zoo==0.1.7
 openai-whisper==20240930
-
+farm-haystack-text2speech==1.1.1
+altair==5.4.1
+lxml_html_clean==0.4.1
```