ugaray96 committed
Commit 5692cb3 · unverified · 1 Parent(s): f99d6db

Enhance audio processing and search functionality


- Add NLTK downloads for text processing
- Update text2speech import and pipeline configuration
- Improve audio file handling and path management
- Refactor search result processing and audio playback
- Update caching decorators and utility functions
- Modify requirements to include necessary dependencies

Signed-off-by: Unai Garay <[email protected]>

app.py CHANGED
@@ -8,11 +8,16 @@ st.set_page_config(
     menu_items={"About": "https://github.com/ugm2/neural-search-demo"},
 )
 
+import nltk
 from streamlit_option_menu import option_menu
-from interface.config import session_state_variables, pages
+
 from interface.components import component_select_pipeline
+from interface.config import pages, session_state_variables
 from interface.utils import load_audio_model
 
+nltk.download("punkt_tab")
+nltk.download("averaged_perceptron_tagger_eng")
+
 # Initialization of session state
 for key, value in session_state_variables.items():
     if key not in st.session_state:
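The NLTK resources are fetched unconditionally at import time, so every app start re-runs the download check. A possible refinement, not part of this commit, is to probe `nltk.data` first and only download what is missing (the `ensure_nltk_data` helper below is hypothetical):

```python
import nltk


def ensure_nltk_data(packages=("punkt_tab", "averaged_perceptron_tagger_eng")):
    """Download NLTK packages only if they are not already on disk."""
    for package in packages:
        # punkt_tab is stored under tokenizers/, the tagger under taggers/
        prefix = "tokenizers" if package.startswith("punkt") else "taggers"
        try:
            nltk.data.find(f"{prefix}/{package}")
        except LookupError:
            nltk.download(package)


ensure_nltk_data()
```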
core/pipelines.py CHANGED
@@ -2,17 +2,22 @@
 Haystack Pipelines
 """
 
+import os
 from pathlib import Path
+
 from haystack import Pipeline
 from haystack.document_stores import InMemoryDocumentStore
-from haystack.nodes.retriever import DensePassageRetriever, TfidfRetriever
 from haystack.nodes.preprocessor import PreProcessor
 from haystack.nodes.ranker import SentenceTransformersRanker
-from haystack.nodes.audio.document_to_speech import DocumentToSpeech
-import os
+from haystack.nodes.retriever import DensePassageRetriever, TfidfRetriever
+from text2speech import DocumentToSpeech
 
 data_path = "data/"
+audio_path = os.path.join(data_path, "audio")
 os.makedirs(data_path, exist_ok=True)
+os.makedirs(audio_path, exist_ok=True)
+# Ensure proper permissions
+os.chmod(audio_path, 0o777)
 
 index = "documents"
 
@@ -59,7 +64,7 @@ def keyword_search(
     if audio_output:
         doc2speech = DocumentToSpeech(
             model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
-            generated_audio_dir=Path(data_path + "audio"),
+            generated_audio_dir=Path(audio_path),
         )
         search_pipeline.add_node(
            doc2speech, name="DocumentToSpeech", inputs=["TfidfRetriever"]
@@ -114,12 +119,12 @@ def dense_passage_retrieval(
     )
 
     if audio_output:
-        doc2speech = DocumentToSpeech(
+        document_to_speech = DocumentToSpeech(
             model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
-            generated_audio_dir=Path(data_path + "audio"),
+            generated_audio_dir=Path(audio_path),
         )
         search_pipeline.add_node(
-            doc2speech, name="DocumentToSpeech", inputs=["DPRRetriever"]
+            document_to_speech, name="DocumentToSpeech", inputs=["DPRRetriever"]
         )
 
     return search_pipeline, index_pipeline
@@ -155,10 +160,12 @@ def dense_passage_retrieval_ranker(
     search_pipeline.add_node(ranker, name="Ranker", inputs=["DPRRetriever"])
 
     if audio_output:
-        doc2speech = DocumentToSpeech(
+        document_to_speech = DocumentToSpeech(
             model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
-            generated_audio_dir=Path(data_path + "audio"),
+            generated_audio_dir=Path(audio_path),
+        )
+        search_pipeline.add_node(
+            document_to_speech, name="DocumentToSpeech", inputs=["Ranker"]
         )
-        search_pipeline.add_node(doc2speech, name="DocumentToSpeech", inputs=["Ranker"])
 
     return search_pipeline, index_pipeline
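`DocumentToSpeech` now comes from the standalone `text2speech` package (`farm-haystack-text2speech` in requirements.txt) instead of `haystack.nodes.audio`, and every pipeline variant writes into the pre-created `data/audio` directory. Below is a hedged sketch of what the node produces, inferred from how `core/search_index.py` consumes the results after this commit; the exact run signature and output fields of the `text2speech` package are assumptions here:

```python
from pathlib import Path

from haystack.schema import Document
from text2speech import DocumentToSpeech

doc2speech = DocumentToSpeech(
    model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
    generated_audio_dir=Path("data/audio"),
)

# Run the node directly on a text document (normally the retriever feeds it)
output, _ = doc2speech.run(documents=[Document(content="Haystack pipelines can speak.")])
for doc in output["documents"]:
    print(doc.content_type)              # expected: "audio"
    print(doc.content)                   # expected: path to the generated .wav file
    print(doc.meta.get("content_text"))  # expected: the original text, kept in meta
```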
core/search_index.py CHANGED
@@ -1,7 +1,8 @@
-from haystack.schema import Document
-from haystack.document_stores import BaseDocumentStore
 import uuid
 
+from haystack.document_stores import BaseDocumentStore
+from haystack.schema import Document
+
 
 def format_docs(documents):
     """Given a list of documents, format the documents and return the documents and doc ids."""
@@ -37,16 +38,23 @@ def search(queries, pipeline):
         for res in matches:
             if not score_is_empty:
                 score_is_empty = True if res.score is None else False
+
+            # Get the original text from content or meta
+            original_text = res.content
+            if hasattr(res, "meta") and "content_text" in res.meta:
+                original_text = res.meta["content_text"]
+
             match = {
-                "text": res.content,
+                "text": original_text,
                 "id": res.meta["id"],
                 "fragment_id": res.id,
                 "meta": res.meta,
             }
             if not score_is_empty:
                 match.update({"score": res.score})
-            if hasattr(res, "content_audio"):
-                match.update({"content_audio": res.content_audio})
+            if res.content_type == "audio":
+                # Add audio path from the content field
+                match.update({"content_audio": res.content})
             query_results.append(match)
         if not score_is_empty:
             query_results = sorted(
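With this change, `search()` recovers the original text from `meta["content_text"]` when a document has been converted to speech, and exposes the generated file through a separate `content_audio` key. For illustration only (values invented, keys as in the diff), a match for an audio result looks roughly like:

```python
match = {
    "text": "Haystack pipelines can speak.",  # original text, restored from meta["content_text"]
    "id": "doc-1",                            # user-facing document id from meta
    "fragment_id": "9f3c2a",                  # Haystack's internal document id
    "meta": {"id": "doc-1", "content_text": "Haystack pipelines can speak."},
    "score": 0.87,                            # only present when the retriever returns scores
    "content_audio": "data/audio/9f3c2a.wav", # only present when content_type == "audio"
}
```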
interface/components.py CHANGED
@@ -25,16 +25,35 @@ def component_select_pipeline(container):
         index_pipe = pipeline_names.index(selected_pipeline)
         st.write("---")
         st.header("Pipeline Parameters")
+
+        # Process audio_output first to ensure top_k is set correctly
+        audio_output_value = False
         for parameter, value in pipeline_func_parameters[index_pipe].items():
-            if isinstance(value, str):
+            if parameter == "audio_output":
+                audio_output_value = st.checkbox(parameter, value)
+                pipeline_func_parameters[index_pipe][
+                    "audio_output"
+                ] = audio_output_value
+                if audio_output_value:
+                    pipeline_func_parameters[index_pipe]["top_k"] = 3
+                break
+
+        # Then process all other parameters
+        for parameter, value in pipeline_func_parameters[index_pipe].items():
+            if parameter == "audio_output":
+                continue
+            elif isinstance(value, str):
                 value = st.text_input(parameter, value)
             elif isinstance(value, bool):
                 value = st.checkbox(parameter, value)
             elif isinstance(value, int):
+                if parameter == "top_k" and audio_output_value:
+                    value = 3
                 value = int(st.number_input(parameter, value=value))
             elif isinstance(value, float):
                 value = float(st.number_input(parameter, value=value))
             pipeline_func_parameters[index_pipe][parameter] = value
+
         if (
             st.session_state["pipeline"] is None
             or st.session_state["pipeline"]["name"] != selected_pipeline
@@ -93,12 +112,18 @@ def component_show_search_result(container, results):
            st.markdown(f"### Match {idx+1}")
            st.markdown(f"**Text**: {document['text']}")
            st.markdown(f"**Document**: {document['id']}")
+           st.json(document)
            if "_split_id" in document["meta"]:
                st.markdown(f"**Document Chunk**: {document['meta']['_split_id']}")
            if "score" in document:
                st.markdown(f"**Score**: {document['score']:.3f}")
            if "content_audio" in document:
-               st.audio(str(document["content_audio"]))
+               try:
+                   with open(document["content_audio"], "rb") as audio_file:
+                       audio_bytes = audio_file.read()
+                   st.audio(audio_bytes, format="audio/wav")
+               except Exception as e:
+                   st.error(f"Error loading audio: {str(e)}")
            st.markdown("---")
 
 
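Playback now reads the generated file into memory and hands raw bytes to `st.audio` rather than passing a filesystem path, so a missing or unreadable file surfaces as an in-app error instead of a crash. The same pattern as a standalone helper (hypothetical, not part of the commit):

```python
import streamlit as st


def render_audio(path: str) -> None:
    """Read a generated .wav file and play it inline, reporting errors in the UI."""
    try:
        with open(path, "rb") as audio_file:
            st.audio(audio_file.read(), format="audio/wav")
    except Exception as exc:
        st.error(f"Error loading audio: {exc}")
```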
interface/utils.py CHANGED
@@ -1,16 +1,18 @@
-from io import StringIO
 import os
 import shutil
-import core.pipelines as pipelines_functions
-from core.pipelines import data_path
-from core.audio import audio_to_text, load_model
 from inspect import getmembers, isfunction, signature
-from newspaper import Article
-from PyPDF2 import PdfFileReader
-import streamlit as st
+from io import StringIO
+
 import pandas as pd
 import pytesseract
+import streamlit as st
+from newspaper import Article
 from PIL import Image
+from PyPDF2 import PdfFileReader
+
+import core.pipelines as pipelines_functions
+from core.audio import audio_to_text, load_model
+from core.pipelines import data_path
 
 
 def get_pipelines():
@@ -35,7 +37,7 @@ def reset_vars_data():
     os.makedirs(data_path, exist_ok=True)
 
 
-@st.experimental_memo
+@st.cache_data
 def extract_text_from_url(url: str):
     article = Article(url)
     article.download()
@@ -44,7 +46,7 @@ def extract_text_from_url(url: str):
     return article.text
 
 
-@st.experimental_memo
+@st.cache_data
 def extract_text_from_file(file):
     # read text file
     if file.type == "text/plain":
@@ -110,6 +112,6 @@ def extract_text_from_file(file):
     return None
 
 
-@st.experimental_singleton
+@st.cache_resource
 def load_audio_model():
     return load_model()
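The caching decorators move to the stable APIs introduced in Streamlit 1.18: `st.cache_data` replaces `st.experimental_memo` for serializable return values, and `st.cache_resource` replaces `st.experimental_singleton` for shared objects such as models. A minimal sketch of the distinction (the function bodies are illustrative, not copies of `interface/utils.py`):

```python
import streamlit as st
from newspaper import Article


@st.cache_data  # caches the returned value; suited to plain data such as extracted text
def fetch_article_text(url: str) -> str:
    article = Article(url)
    article.download()
    article.parse()
    return article.text


@st.cache_resource  # caches the object itself; suited to shared resources such as models
def get_whisper_model():
    import whisper  # openai-whisper, already pinned in requirements.txt

    return whisper.load_model("base")
```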
requirements.txt CHANGED
@@ -1,14 +1,15 @@
 streamlit==1.40.1
-streamlit_option_menu==0.4.0
-farm-haystack==1.26.4
+farm-haystack[inference]==1.26.4
 black==24.8.0
 plotly==5.24.1
 newspaper3k==0.2.8
 PyPDF2==3.0.1
 pytesseract==0.3.13
 soundfile==0.13.1
-espnet==202412
+espnet==202304
 pydub==0.25.1
 espnet_model_zoo==0.1.7
 openai-whisper==20240930
-altair==5.4.1
+farm-haystack-text2speech==1.1.1
+altair==5.4.1
+lxml_html_clean==0.4.1