stefanoviel committed
Commit 0fd8f7a · Parent: 1a67af9

caching again

Files changed (1):
  1. src/streamlit_app.py +36 -26
src/streamlit_app.py CHANGED
@@ -20,7 +20,6 @@ CSV_FILE = 'papers_with_abstracts_parallel.csv'
 
 
 # --- Caching Functions ---
-# --- Caching Functions (Unchanged but crucial) ---
 @st.cache_resource
 def load_embedding_model():
     """Loads the Sentence Transformer model and caches it."""
@@ -35,56 +34,57 @@ def load_spell_checker():
 def create_and_save_embeddings(model, data_df):
     """
     Generates and saves document embeddings and the dataframe.
-    This function is called only once if the files don't exist in the persistent directory.
+    This function is called only once if the files don't exist.
     """
     st.info("First time setup: Generating and saving embeddings. This may take a moment...")
-    data_df['text_to_embed'] = data_df['title'].fillna('') + ". " + data_df['abstract'].fillna('')
+    # Combine title and abstract for richer embeddings
+    data_df['text_to_embed'] = data_df['title'] + ". " + data_df['abstract'].fillna('')
 
-    corpus_embeddings = model.encode(
-        data_df['text_to_embed'].tolist(),
-        convert_to_tensor=True,
-        show_progress_bar=True
-    )
+    # Generate embeddings
+    corpus_embeddings = model.encode(data_df['text_to_embed'].tolist(), convert_to_tensor=True, show_progress_bar=True)
 
+    # Save embeddings and dataframe to /tmp directory
     try:
         torch.save(corpus_embeddings, EMBEDDINGS_FILE)
         data_df.to_pickle(DATA_FILE)
-        st.success("Embeddings and data saved successfully for future sessions!")
+        st.success("Embeddings and data saved successfully!")
     except Exception as e:
-        st.warning(f"Could not save embeddings to persistent storage: {e}. Will regenerate on next session.")
+        st.warning(f"Could not save embeddings to disk: {e}. Will regenerate on each session.")
 
     return corpus_embeddings, data_df
 
 @st.cache_data
 def load_data_and_embeddings():
     """
-    Loads data and embeddings. It first tries to load from the persistent directory.
-    If files don't exist, it creates them. The results are cached for the current session.
+    Loads the saved embeddings and dataframe from disk.
+    If files don't exist, it calls the creation function.
     """
     model = load_embedding_model()
 
-    if DATA_FILE.exists() and EMBEDDINGS_FILE.exists():
+    # Check if files exist and are readable
+    if os.path.exists(EMBEDDINGS_FILE) and os.path.exists(DATA_FILE):
         try:
-            data_df = pd.read_pickle(DATA_FILE)
             corpus_embeddings = torch.load(EMBEDDINGS_FILE)
+            data_df = pd.read_pickle(DATA_FILE)
             return model, corpus_embeddings, data_df
         except Exception as e:
-            st.warning(f"Could not load saved files: {e}. Regenerating...")
+            st.warning(f"Could not load saved embeddings: {e}. Regenerating...")
 
-    # Fallback to creating embeddings if they don't exist
+    st.info("embeding model path exists: " + str(Path(EMBEDDING_MODEL).exists()))
+
+    # Load the raw data from CSV
     try:
         data_df = pd.read_csv(CSV_FILE)
         corpus_embeddings, data_df = create_and_save_embeddings(model, data_df)
     except FileNotFoundError:
-        st.error(f"The required data file '{CSV_FILE}' was not found. Please make sure it's in your repository root.")
+        st.error(f"CSV file '{CSV_FILE}' not found. Please ensure it's in your repository.")
         st.stop()
     except Exception as e:
-        st.error(f"An unexpected error occurred while loading data: {e}")
+        st.error(f"Error loading data: {e}")
         st.stop()
 
     return model, corpus_embeddings, data_df
 
-# ... (The rest of your functions `correct_query_spelling` and `semantic_search` remain the same) ...
 def correct_query_spelling(query, spell_checker):
     """
     Corrects potential spelling mistakes in the user's query.
@@ -153,13 +153,12 @@ The search is performed by comparing the semantic meaning of your query with the
 Spelling mistakes in your query will be automatically corrected.
 """)
 
-# --- App Logic ---
+# Load all necessary data
 try:
-    # Load all necessary data using the corrected function
     model, corpus_embeddings, data_df = load_data_and_embeddings()
     spell_checker = load_spell_checker()
 
-    # --- User Inputs ---
+    # --- User Inputs: Search Bar and Slider ---
     col1, col2 = st.columns([4, 1])
     with col1:
         search_query = st.text_input(
@@ -170,26 +169,37 @@ try:
         top_k_results = st.number_input(
             "Number of results",
             min_value=1,
-            max_value=100,
+            max_value=100, # Set a reasonable max
             value=10,
             help="Select the number of top results to display."
         )
 
     if search_query:
+        # --- Perform Typo Correction ---
         corrected_query = correct_query_spelling(search_query, spell_checker)
 
+        # If a correction was made, notify the user
         if corrected_query.lower() != search_query.lower():
             st.info(f"Did you mean: **{corrected_query}**? \n\n*Showing results for the corrected query.*")
 
-        search_results = semantic_search(corrected_query, model, corpus_embeddings, data_df, top_k=top_k_results)
+        final_query = corrected_query
+
+        # --- Perform Search ---
+        search_results = semantic_search(final_query, model, corpus_embeddings, data_df, top_k=top_k_results)
 
-        st.subheader(f"Found {len(search_results)} results for '{corrected_query}'")
+        st.subheader(f"Found {len(search_results)} results for '{final_query}'")
 
+        # --- Display Results ---
         if search_results:
             for result in search_results:
                 with st.container(border=True):
+                    # Title as a clickable link
                     st.markdown(f"### [{result['title']}]({result['link']})")
+
+                    # Authors
                     st.caption(f"**Authors:** {result['authors']}")
+
+                    # Expander for the abstract
                     if pd.notna(result['abstract']):
                         with st.expander("View Abstract"):
                             st.write(result['abstract'])
@@ -197,5 +207,5 @@ try:
             st.warning("No results found. Try a different query.")
 
 except Exception as e:
-    st.error(f"An error occurred during app execution: {e}")
+    st.error(f"An error occurred: {e}")
     st.info("Please ensure all required libraries are installed and the CSV file is present in your repository.")
 
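For context on what the diff relies on, here is a minimal sketch of the two-level caching pattern: `@st.cache_resource` keeps a single shared copy of the heavy embedding model, while `@st.cache_data` caches the serializable embeddings and dataframe, and a file-existence check persists them across restarts. The model id and the /tmp paths below are illustrative assumptions; only the CSV name comes from the diff.

```python
# Sketch of the caching pattern, assuming streamlit, sentence-transformers,
# torch, and pandas are installed. Paths and the model id are placeholders.
import os

import pandas as pd
import streamlit as st
import torch
from sentence_transformers import SentenceTransformer

EMBEDDINGS_FILE = "/tmp/corpus_embeddings.pt"  # hypothetical path
DATA_FILE = "/tmp/papers.pkl"                  # hypothetical path
CSV_FILE = "papers_with_abstracts_parallel.csv"

@st.cache_resource  # one shared, in-memory object per server process
def load_embedding_model():
    return SentenceTransformer("all-MiniLM-L6-v2")  # assumed model id

@st.cache_data  # return value is serialized and reused across reruns
def load_data_and_embeddings():
    model = load_embedding_model()
    if os.path.exists(EMBEDDINGS_FILE) and os.path.exists(DATA_FILE):
        return torch.load(EMBEDDINGS_FILE), pd.read_pickle(DATA_FILE)
    df = pd.read_csv(CSV_FILE)
    texts = (df["title"].fillna("") + ". " + df["abstract"].fillna("")).tolist()
    emb = model.encode(texts, convert_to_tensor=True, show_progress_bar=True)
    torch.save(emb, EMBEDDINGS_FILE)
    df.to_pickle(DATA_FILE)
    return emb, df
```

Unlike the committed code, this sketch keeps the model out of the `@st.cache_data` return value: `st.cache_data` serializes whatever it returns, whereas `st.cache_resource` hands back the same object every time.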
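The commit does not touch `semantic_search` or `correct_query_spelling`, and the diff never shows their bodies. As a rough, hypothetical sketch (not the repository's code), a `semantic_search` with the signature used above could be built on `sentence_transformers.util.semantic_search`:

```python
# Hypothetical sketch of the search helper the app calls; the result keys
# mirror the columns the display code expects (title, link, authors, abstract).
from sentence_transformers import util

def semantic_search(query, model, corpus_embeddings, data_df, top_k=10):
    # Embed the query and rank the corpus by cosine similarity
    query_embedding = model.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)[0]
    results = []
    for hit in hits:
        row = data_df.iloc[hit["corpus_id"]]
        results.append({
            "title": row["title"],
            "link": row.get("link", ""),
            "authors": row.get("authors", ""),
            "abstract": row.get("abstract", ""),
            "score": float(hit["score"]),
        })
    return results
```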