Enhance audio processing and search functionality
- Add NLTK downloads for text processing
- Update text2speech import and pipeline configuration
- Improve audio file handling and path management
- Refactor search result processing and audio playback
- Update caching decorators and utility functions
- Modify requirements to include necessary dependencies
Signed-off-by: Unai Garay <[email protected]>
- app.py +6 -1
- core/pipelines.py +17 -10
- core/search_index.py +13 -5
- interface/components.py +27 -2
- interface/utils.py +12 -10
- requirements.txt +5 -4
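
Read together, the diffs below thread audio through the whole demo: `core/pipelines.py` synthesizes WAV files under `data/audio/`, `core/search_index.py` surfaces each file's path as `content_audio` on a match, and `interface/components.py` plays it back. A minimal sketch of the resulting call flow, using only the signatures visible in this diff (the query text is illustrative):

```python
# Sketch of the flow this commit wires up; the query text is made up.
from core.pipelines import keyword_search
from core.search_index import search

# keyword_search() returns (search_pipeline, index_pipeline) per the diff;
# audio_output=True attaches the DocumentToSpeech node to the retriever.
search_pipeline, index_pipeline = keyword_search(audio_output=True)

# search(queries, pipeline) formats the raw matches; audio hits carry a
# "content_audio" path under data/audio/ that the UI streams via st.audio.
results = search(["what is neural search?"], search_pipeline)
```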
app.py
CHANGED

```diff
@@ -8,11 +8,16 @@ st.set_page_config(
     menu_items={"About": "https://github.com/ugm2/neural-search-demo"},
 )
 
+import nltk
 from streamlit_option_menu import option_menu
-from interface.config import pages, session_state_variables
+
 from interface.components import component_select_pipeline
+from interface.config import pages, session_state_variables
 from interface.utils import load_audio_model
 
+nltk.download("punkt_tab")
+nltk.download("averaged_perceptron_tagger_eng")
+
 # Initialization of session state
 for key, value in session_state_variables.items():
     if key not in st.session_state:
```
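
The two `nltk.download` calls run at import time on every restart of the Space. An optional guard (not part of this commit) skips the network round-trip once the corpora are cached:

```python
# Optional hardening, not in the commit: download NLTK data only if missing.
import nltk

for resource, path in [
    ("punkt_tab", "tokenizers/punkt_tab"),
    ("averaged_perceptron_tagger_eng", "taggers/averaged_perceptron_tagger_eng"),
]:
    try:
        nltk.data.find(path)  # raises LookupError when the resource is absent
    except LookupError:
        nltk.download(resource)
```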
core/pipelines.py
CHANGED

```diff
@@ -2,17 +2,22 @@
 Haystack Pipelines
 """
 
+import os
 from pathlib import Path
+
 from haystack import Pipeline
 from haystack.document_stores import InMemoryDocumentStore
-from haystack.nodes.retriever import DensePassageRetriever, TfidfRetriever
 from haystack.nodes.preprocessor import PreProcessor
 from haystack.nodes.ranker import SentenceTransformersRanker
-from haystack.nodes.audio import DocumentToSpeech
-import os
+from haystack.nodes.retriever import DensePassageRetriever, TfidfRetriever
+from text2speech import DocumentToSpeech
 
 data_path = "data/"
+audio_path = os.path.join(data_path, "audio")
 os.makedirs(data_path, exist_ok=True)
+os.makedirs(audio_path, exist_ok=True)
+# Ensure proper permissions
+os.chmod(audio_path, 0o777)
 
 index = "documents"
 
@@ -59,7 +64,7 @@ def keyword_search(
     if audio_output:
         doc2speech = DocumentToSpeech(
             model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
-            generated_audio_dir=Path(data_path),
+            generated_audio_dir=Path(audio_path),
         )
         search_pipeline.add_node(
             doc2speech, name="DocumentToSpeech", inputs=["TfidfRetriever"]
@@ -114,12 +119,12 @@ def dense_passage_retrieval(
     )
 
     if audio_output:
-        doc2speech = DocumentToSpeech(
+        document_to_speech = DocumentToSpeech(
             model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
-            generated_audio_dir=Path(data_path),
+            generated_audio_dir=Path(audio_path),
         )
         search_pipeline.add_node(
-            doc2speech, name="DocumentToSpeech", inputs=["DPRRetriever"]
+            document_to_speech, name="DocumentToSpeech", inputs=["DPRRetriever"]
         )
 
     return search_pipeline, index_pipeline
@@ -155,10 +160,12 @@ def dense_passage_retrieval_ranker(
     search_pipeline.add_node(ranker, name="Ranker", inputs=["DPRRetriever"])
 
     if audio_output:
-        doc2speech = DocumentToSpeech(
+        document_to_speech = DocumentToSpeech(
             model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
-            generated_audio_dir=Path(data_path),
+            generated_audio_dir=Path(audio_path),
+        )
+        search_pipeline.add_node(
+            document_to_speech, name="DocumentToSpeech", inputs=["Ranker"]
         )
-        search_pipeline.add_node(doc2speech, name="DocumentToSpeech", inputs=["Ranker"])
 
     return search_pipeline, index_pipeline
```
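
All three pipeline builders now repeat the same `DocumentToSpeech` block, differing only in the upstream node name. A possible follow-up refactor (hypothetical, not in this commit) collapses the repetition:

```python
# Hypothetical helper, not in the commit: the three builders attach the TTS
# node identically except for the input node name.
from pathlib import Path

from text2speech import DocumentToSpeech


def add_audio_output(search_pipeline, input_node: str, audio_dir: str):
    """Attach a TTS node that voices documents emitted by `input_node`."""
    document_to_speech = DocumentToSpeech(
        model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
        generated_audio_dir=Path(audio_dir),
    )
    search_pipeline.add_node(
        document_to_speech, name="DocumentToSpeech", inputs=[input_node]
    )
    return search_pipeline
```

Each builder would then call, e.g., `add_audio_output(search_pipeline, "Ranker", audio_path)`.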
core/search_index.py
CHANGED

```diff
@@ -1,7 +1,8 @@
-from haystack.schema import Document
-from haystack.document_stores import BaseDocumentStore
 import uuid
 
+from haystack.document_stores import BaseDocumentStore
+from haystack.schema import Document
+
 
 def format_docs(documents):
     """Given a list of documents, format the documents and return the documents and doc ids."""
@@ -37,16 +38,23 @@ def search(queries, pipeline):
         for res in matches:
             if not score_is_empty:
                 score_is_empty = True if res.score is None else False
+
+            # Get the original text from content or meta
+            original_text = res.content
+            if hasattr(res, "meta") and "content_text" in res.meta:
+                original_text = res.meta["content_text"]
+
             match = {
-                "text": res.content,
+                "text": original_text,
                 "id": res.meta["id"],
                 "fragment_id": res.id,
                 "meta": res.meta,
             }
             if not score_is_empty:
                 match.update({"score": res.score})
-            if hasattr(res, "content_audio"):
-                match.update({"content_audio": res.content_audio})
+            if res.content_type == "audio":
+                # Add audio path from the content field
+                match.update({"content_audio": res.content})
             query_results.append(match)
         if not score_is_empty:
             query_results = sorted(
```
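
For reference, one formatted match from the loop above now looks like this (all values are illustrative; `score` appears only when the retriever scores results, `content_audio` only when `res.content_type` is `"audio"`):

```python
# Illustrative shape of a single formatted match; all values are made up.
match = {
    "text": "Neural search finds documents by meaning.",  # original text
    "id": "9c0a7d2e",                                     # res.meta["id"]
    "fragment_id": "5f3b1c88",                            # res.id
    "meta": {"id": "9c0a7d2e", "_split_id": 0},
    "score": 0.873,                                       # optional
    "content_audio": "data/audio/5f3b1c88.wav",           # optional, audio hits
}
```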
interface/components.py
CHANGED

```diff
@@ -25,16 +25,35 @@ def component_select_pipeline(container):
         index_pipe = pipeline_names.index(selected_pipeline)
         st.write("---")
         st.header("Pipeline Parameters")
+
+        # Process audio_output first to ensure top_k is set correctly
+        audio_output_value = False
         for parameter, value in pipeline_func_parameters[index_pipe].items():
-            if isinstance(value, str):
+            if parameter == "audio_output":
+                audio_output_value = st.checkbox(parameter, value)
+                pipeline_func_parameters[index_pipe][
+                    "audio_output"
+                ] = audio_output_value
+                if audio_output_value:
+                    pipeline_func_parameters[index_pipe]["top_k"] = 3
+                break
+
+        # Then process all other parameters
+        for parameter, value in pipeline_func_parameters[index_pipe].items():
+            if parameter == "audio_output":
+                continue
+            elif isinstance(value, str):
                 value = st.text_input(parameter, value)
             elif isinstance(value, bool):
                 value = st.checkbox(parameter, value)
             elif isinstance(value, int):
+                if parameter == "top_k" and audio_output_value:
+                    value = 3
                 value = int(st.number_input(parameter, value=value))
             elif isinstance(value, float):
                 value = float(st.number_input(parameter, value=value))
             pipeline_func_parameters[index_pipe][parameter] = value
+
         if (
             st.session_state["pipeline"] is None
             or st.session_state["pipeline"]["name"] != selected_pipeline
@@ -93,12 +112,18 @@ def component_show_search_result(container, results):
         st.markdown(f"### Match {idx+1}")
         st.markdown(f"**Text**: {document['text']}")
         st.markdown(f"**Document**: {document['id']}")
+        st.json(document)
         if "_split_id" in document["meta"]:
             st.markdown(f"**Document Chunk**: {document['meta']['_split_id']}")
         if "score" in document:
             st.markdown(f"**Score**: {document['score']:.3f}")
         if "content_audio" in document:
-            st.audio(document["content_audio"])
+            try:
+                with open(document["content_audio"], "rb") as audio_file:
+                    audio_bytes = audio_file.read()
+                st.audio(audio_bytes, format="audio/wav")
+            except Exception as e:
+                st.error(f"Error loading audio: {str(e)}")
         st.markdown("---")
 
 
```
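
Two details worth noting above: `st.json(document)` dumps the raw match next to the formatted fields, and playback reads the WAV into memory instead of handing `st.audio` a path, which avoids depending on Streamlit's static file serving for files under `data/audio/`. The broad `except Exception` could also be narrowed; a sketch (optional, not in the commit):

```python
# Optional refactor sketch, not in the commit: isolate playback and catch
# only file errors rather than every exception.
import streamlit as st


def play_audio(path: str) -> None:
    try:
        with open(path, "rb") as audio_file:
            st.audio(audio_file.read(), format="audio/wav")
    except OSError as e:
        st.error(f"Error loading audio: {e}")
```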
interface/utils.py
CHANGED

```diff
@@ -1,16 +1,18 @@
-from io import StringIO
 import os
 import shutil
-import core.pipelines as pipelines_functions
-from core.pipelines import data_path
-from core.audio import audio_to_text, load_model
 from inspect import getmembers, isfunction, signature
-from newspaper import Article
-from PyPDF2 import PdfFileReader
-import streamlit as st
+from io import StringIO
+
 import pandas as pd
 import pytesseract
+import streamlit as st
+from newspaper import Article
 from PIL import Image
+from PyPDF2 import PdfFileReader
+
+import core.pipelines as pipelines_functions
+from core.audio import audio_to_text, load_model
+from core.pipelines import data_path
 
 
 def get_pipelines():
@@ -35,7 +37,7 @@ def reset_vars_data():
    os.makedirs(data_path, exist_ok=True)
 
 
-@st.cache
+@st.cache_data
 def extract_text_from_url(url: str):
     article = Article(url)
     article.download()
@@ -44,7 +46,7 @@ def extract_text_from_url(url: str):
     return article.text
 
 
-@st.cache
+@st.cache_data
 def extract_text_from_file(file):
     # read text file
     if file.type == "text/plain":
@@ -110,6 +112,6 @@ def extract_text_from_file(file):
     return None
 
 
-@st.cache
+@st.cache_resource
 def load_audio_model():
     return load_model()
```
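
The decorator split follows Streamlit's replacement of the deprecated `st.cache`: `st.cache_data` for functions returning serializable data (the extracted text), `st.cache_resource` for functions returning shared, unpicklable objects (the Whisper model behind `load_audio_model`). A standalone illustration with hypothetical functions:

```python
# Standalone illustration of the two caching modes; functions are hypothetical.
import streamlit as st


@st.cache_data
def fetch_text(url: str) -> str:
    # Return value is serialized and copied per call site: right for plain data.
    return f"text extracted from {url}"


@st.cache_resource
def get_model():
    # Returned object is created once and shared across reruns and sessions:
    # right for models, clients, and other unpicklable resources.
    return object()  # stand-in for a loaded model
```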
requirements.txt
CHANGED

```diff
@@ -1,14 +1,15 @@
 streamlit==1.40.1
-
-farm-haystack==1.26.4
+farm-haystack[inference]==1.26.4
 black==24.8.0
 plotly==5.24.1
 newspaper3k==0.2.8
 PyPDF2==3.0.1
 pytesseract==0.3.13
 soundfile==0.13.1
-espnet==
+espnet==202304
 pydub==0.25.1
 espnet_model_zoo==0.1.7
 openai-whisper==20240930
-
+farm-haystack-text2speech==1.1.1
+altair==5.4.1
+lxml_html_clean==0.4.1
```