ccm committed (verified)
Commit 6dbf6d3 · 1 Parent(s): d227ba0

Update main.py

Files changed (1)
  1. main.py +45 -17
main.py CHANGED

@@ -17,18 +17,23 @@ GREETING = (
     "https://en.wikipedia.org/wiki/Retrieval-augmented_generation) pipeline to answer questions about research by the "
     "Design Research Collective. And the best part is that I always cite my sources! What can I tell you about today?"
 )
+EXAMPLE_QUERIES = [
+    "Tell me about new research at the intersection of additive manufacturing and machine learning",
+    "What is a physics-informed neural network and what can it be used for?",
+    "What can agent-based models do about climate change?",
+]
 EMBEDDING_MODEL_NAME = "allenai-specter"
 LLM_MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
+# LLM_MODEL_NAME = "Qwen/Qwen2-0.5B-Instruct"
 
 # Load the dataset and convert to pandas
-full_data = datasets.load_dataset("ccm/publications")["train"].to_pandas()
+data = datasets.load_dataset("ccm/publications")["train"].to_pandas()
 
 # Filter out any publications without an abstract
 abstract_is_null = [
-    '"abstract": null' in json.dumps(bibdict)
-    for bibdict in full_data["bib_dict"].values
+    '"abstract": null' in json.dumps(bibdict) for bibdict in data["bib_dict"].values
 ]
-data = full_data[~pandas.Series(abstract_is_null)]
+data = data[~pandas.Series(abstract_is_null)]
 data.reset_index(inplace=True)
 
 # Create a FAISS index for fast similarity search
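The rewritten filter leans on json.dumps serializing a missing abstract as null. A minimal sketch with two made-up records (hypothetical data, not drawn from ccm/publications) shows what the membership test catches:

import json

import pandas

# Two hypothetical bibliography records: one with an abstract, one without
records = pandas.Series(
    [
        {"title": "A paper", "abstract": "We study..."},
        {"title": "No abstract", "abstract": None},
    ]
)

# json.dumps(None) serializes to null, so only the second record matches
abstract_is_null = ['"abstract": null' in json.dumps(b) for b in records.values]
print(abstract_is_null)  # [False, True]

The test also depends on json.dumps emitting a single space after the colon, which is its default separator.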
@@ -44,8 +49,15 @@ index.add(vectors)
 model = sentence_transformers.SentenceTransformer(EMBEDDING_MODEL_NAME)
 
 
-# Define the search function
-def search(query: str, k: int) -> tuple[str]:
+def search(query: str, k: int) -> tuple[str, str]:
+    """
+    Searches the dataset for the top k most relevant papers to the query
+    Args:
+        query (str): The user's query
+        k (int): The number of results to return
+    Returns:
+        tuple[str, str]: A tuple containing the search results and references
+    """
     query = numpy.expand_dims(model.encode(query), axis=0)
     faiss.normalize_L2(query)
     D, I = index.search(query, k)
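The hunk context line index.add(vectors) points at index-construction code that this commit does not touch. For orientation, here is a plausible reconstruction of that step; it assumes the abstracts are embedded with the same SPECTER model and indexed with an inner-product IndexFlatIP over L2-normalized vectors, matching the faiss.normalize_L2(query) call inside search. Neither assumption is confirmed by this diff.

import faiss
import numpy
import sentence_transformers

model = sentence_transformers.SentenceTransformer("allenai-specter")

# Hypothetical stand-ins for the publication abstracts in the dataset
abstracts = [
    "A machine learning approach to additive manufacturing.",
    "Agent-based modeling of climate adaptation.",
]

# Encode, L2-normalize, and index: inner product on unit vectors is cosine similarity
vectors = numpy.asarray(model.encode(abstracts), dtype="float32")
faiss.normalize_L2(vectors)
index = faiss.IndexFlatIP(vectors.shape[1])
index.add(vectors)

D, I = index.search(vectors[:1], 1)  # the first abstract's nearest neighbor is itself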
@@ -100,20 +112,40 @@ chatmodel = transformers.AutoModelForCausalLM.from_pretrained(
 )
 
 
-def preprocess(message: str) -> tuple[str]:
-    """Applies a preprocessing step to the user's message before the LLM receives it"""
+def preprocess(message: str) -> tuple[str, str]:
+    """
+    Applies a preprocessing step to the user's message before the LLM receives it
+    Args:
+        message (str): The user's message
+    Returns:
+        tuple[str, str]: A tuple containing the preprocessed message and a bypass variable
+    """
     block_search_results, formatted_search_results = search(message, 5)
     return block_search_results + message, formatted_search_results
 
 
 def postprocess(response: str, bypass_from_preprocessing: str) -> str:
-    """Applies a postprocessing step to the LLM's response before the user receives it"""
+    """
+    Applies a postprocessing step to the LLM's response before the user receives it
+    Args:
+        response (str): The LLM's response
+        bypass_from_preprocessing (str): The bypass variable from the preprocessing step
+    Returns:
+        str: The postprocessed response
+    """
     return response + bypass_from_preprocessing
 
 
 @spaces.GPU
-def predict(message: str, history: list[str]) -> str:
-    """This function is responsible for crafting a response"""
+def reply(message: str, history: list[str]) -> str:
+    """
+    This function is responsible for crafting a response
+    Args:
+        message (str): The user's message
+        history (list[str]): The conversation history
+    Returns:
+        str: The AI's response
+    """
 
     # Apply preprocessing
     message, bypass = preprocess(message)
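Read together, preprocess prepends the retrieved abstracts to the user's message, while postprocess re-attaches the formatted references after generation, so the citation text never passes through the model. A schematic of that round trip, with a hypothetical generate stub standing in for the Qwen call inside reply:

def generate(prompt: str) -> str:
    # Hypothetical stand-in for the transformers generation call in reply()
    return "Recent work applies agent-based models to climate policy."

message = "What can agent-based models do about climate change?"
augmented, bypass = preprocess(message)  # retrieved context + original message
response = generate(augmented)           # the LLM sees the context, not the references
print(postprocess(response, bypass))     # references appended back for the user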
@@ -150,12 +182,8 @@ def predict(message: str, history: list[str]) -> str:
 
 # Create and run the gradio interface
 gradio.ChatInterface(
-    predict,
-    examples=[
-        "Tell me about new research at the intersection of additive manufacturing and machine learning",
-        "What is a physics-informed neural network and what can it be used for?",
-        "What can agent-based models do about climate change?",
-    ],
+    reply,
+    examples=EXAMPLE_QUERIES,
     chatbot=gradio.Chatbot(
         show_label=False, show_copy_button=True, value=[["", GREETING]]
     ),
 
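For reference, gradio.ChatInterface calls its first argument with the (message, history) signature that the renamed reply provides, and examples populates the clickable starter prompts. A self-contained sketch of that contract (the echo body is hypothetical):

import gradio

def reply(message: str, history: list[str]) -> str:
    # Hypothetical echo body; the real reply() runs the full RAG pipeline
    return "You said: " + message

gradio.ChatInterface(reply, examples=["Hello there"]).launch()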