Spaces:

mtyrrell
/

cpv_poc

Sleeping

App Files Files Community

mtyrrell committed on Sep 29, 2024

Commit

cd9150d

1 Parent(s): ab45f35

Pinecone serverless migration with LangChain

Browse files

Files changed (3) hide show

.DS_Store +0 -0
app.py +118 -110
env +5 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

app.py CHANGED Viewed

@@ -3,29 +3,29 @@ import streamlit as st
 import os
 import pkg_resources
-# Using this wacky hack to get around the massively ridiculous managed env loading order
-def is_installed(package_name, version):
-    try:
-        pkg = pkg_resources.get_distribution(package_name)
-        return pkg.version == version
-    except pkg_resources.DistributionNotFound:
-        return False
-@st.cache_resource
-def install_packages():
-    install_commands = []
-    if not is_installed("spaces", "0.12.0"):
-        install_commands.append("pip install spaces==0.12.0")
-    if not is_installed("pydantic", "1.8.2"):
-        install_commands.append("pip install pydantic==1.8.2")
-    if install_commands:
-        os.system(" && ".join(install_commands))
-# install packages if necessary
-install_packages()
 import re
@@ -33,58 +33,96 @@ import json
 from dotenv import load_dotenv
 import numpy as np
 import pandas as pd
-from haystack.schema import Document
-from haystack.document_stores import PineconeDocumentStore
-from haystack.nodes import EmbeddingRetriever
-import openai
-# for local st testing, may need to run source ~/.zshrc to point to env vars
-# Get openai API key
-openai.api_key = os.environ["OPENAI_API_KEY"]
-# Get openai API key
-pinecone_key = os.environ["PINECONE_API_KEY"]
-#___________________________________________________________________________________________________________
-# @st.cache_resource
-# def get_document_store():
-#     doc_file_name="cpv_full_southern_africa"
-#     document_store = PineconeDocumentStore(api_key=pinecone_key,
-#                                        environment="asia-southeast1-gcp-free",
-#                                        index=doc_file_name)
-#     return document_store
-# # Get (or initialize and get) the document store
-# document_store = get_document_store()
-@st.cache_resource
-def get_retriever():
-    doc_file_name="cpv_full_southern_africa"
-    document_store = PineconeDocumentStore(api_key=pinecone_key,
-                                       environment="asia-southeast1-gcp-free",
-                                       index=doc_file_name)
-    retriever = EmbeddingRetriever(
-        document_store=document_store,
-        embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
-        model_format="sentence_transformers",
-        progress_bar=False,
     )
-    return retriever
-retriever = get_retriever()
-# # Instantiate retriever
-# retriever = EmbeddingRetriever(
-#   document_store=document_store,
-#   embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
-#   model_format="sentence_transformers",
-#   progress_bar=False,
-# )
 prompt_template="Answer the given question using the following documents. \
 Formulate your answer in the style of an academic report. \
@@ -112,40 +150,6 @@ examples = [
 ]
-def get_docs(input_query, country = [], vulnerability_cat = []):
-  if not country:
-    country = "All Countries"
-  if not vulnerability_cat:
-    if country == "All Countries":
-      filters = None
-    else:
-      filters = {'country': {'$in': country}}
-  else:
-    if country == "All Countries":
-          filters = {'vulnerability_cat': {'$in': vulnerability_cat}}
-    else:
-      filters = {'country': {'$in': country},'vulnerability_cat': {'$in': vulnerability_cat}}
-  docs = retriever.retrieve(query=input_query, filters = filters, top_k = 10)
-  # Break out the key fields and convert to pandas for filtering
-  docs = [{**x.meta,"score":x.score,"content":x.content} for x in docs]
-  df_docs = pd.DataFrame(docs)
-  # Get ourselves an index setup from which to base the source reference number from (in the prompt and matching afterwards)
-  df_docs = df_docs.reset_index()
-  df_docs['ref_id'] = df_docs.index + 1 # start the index at 1
-  # Convert back to Document format
-  ls_dict = []
-  # Iterate over df and add relevant fields to the dict object
-  for index, row in df_docs.iterrows():
-      # Create a Document object for each row
-      doc = Document(
-          row['content'],
-          meta={'country': row['country'],'document': row['document'], 'page': row['page'], 'file_name': row['file_name'], 'ref_id': row['ref_id'], 'vulnerability_cat': row['vulnerability_cat'], 'score': row['score']}
-      )
-      # Append the Document object to the documents list
-      ls_dict.append(doc)
-  return ls_dict
 def get_refs(docs, res):
   '''
   Parse response for engineered reference ids (refer to prompt template)
@@ -159,40 +163,44 @@ def get_refs(docs, res):
   # extract
   result_str = ""  # Initialize an empty string to store the result
   for i in range(len(docs)):
-      doc = docs[i].to_dict()
-      ref_id = doc['meta']['ref_id']
       if ref_id in ref_ids:
-        if doc['meta']['document'] == "Supplementary":
-            result_str += "**Ref. " + str(ref_id) + " [" + doc['meta']['country'] + " " + doc['meta']['document'] + ':' + doc['meta']['file_name'] + ' p' + str(doc['meta']['page']) + '; vulnerabilities: ' + doc['meta']['vulnerability_cat'] + "]:** " + "*'" + doc['content'] + "'*<br> <br>" # Add <br> for a line break
         else:
-            result_str += "**Ref. " + str(ref_id) + " [" + doc['meta']['country'] + " " + doc['meta']['document'] + ' p' + str(doc['meta']['page']) + '; vulnerabilities: ' + doc['meta']['vulnerability_cat'] + "]:** " + "*'" + doc['content'] + "'*<br> <br>" # Add <br> for a line break
   return result_str
 # define a special function for putting the prompt together (as we can't use haystack)
 def get_prompt(docs, input_query):
   base_prompt=prompt_template
-  # Add the meta data for references
-  context = ' - '.join(['&&& [ref. '+str(d.meta['ref_id'])+'] '+d.meta['document']+' &&&: '+d.content for d in docs])
   prompt = base_prompt+"; Context: "+context+"; Question: "+input_query+"; Answer:"
   return(prompt)
-def run_query(input_text, country, model_sel):
     # first call the retriever function using selected filters
-    docs = get_docs(input_text, country=country,vulnerability_cat=vulnerabilities_cat)
     # model selector (not currently being used)
     if model_sel == "chatGPT":
         # instantiate ChatCompletion as a generator object (stream is set to True)
-        response = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=[{"role": "user", "content": get_prompt(docs, input_text)}], stream=True)
         # iterate through the streamed output
         report = []
-        for chunk in response:
-            # extract the object containing the text (totally different structure when streaming)
-            chunk_message = chunk['choices'][0]['delta']
-            # test to make sure there is text in the object (some don't have)
-            if 'content' in chunk_message:
-                report.append(chunk_message.content) # extract the message
-                # add the latest text and merge it with all previous
                 result = "".join(report).strip()
                 res_box.success(result) # output to response text box

 import os
 import pkg_resources
+# # Using this wacky hack to get around the massively ridiculous managed env loading order
+# def is_installed(package_name, version):
+#     try:
+#         pkg = pkg_resources.get_distribution(package_name)
+#         return pkg.version == version
+#     except pkg_resources.DistributionNotFound:
+#         return False
+# @st.cache_resource
+# def install_packages():
+#     install_commands = []
+#     if not is_installed("spaces", "0.12.0"):
+#         install_commands.append("pip install spaces==0.12.0")
+#     if not is_installed("pydantic", "1.8.2"):
+#         install_commands.append("pip install pydantic==1.8.2")
+#     if install_commands:
+#         os.system(" && ".join(install_commands))
+# # install packages if necessary
+# # install_packages()
 import re
 from dotenv import load_dotenv
 import numpy as np
 import pandas as pd
+import getpass
+import os
+from dotenv import load_dotenv, find_dotenv
+from pinecone import Pinecone, ServerlessSpec
+from langchain_pinecone import PineconeVectorStore
+from langchain_huggingface import HuggingFaceEmbeddings
+# from langchain_core.output_parsers import StrOutputParser
+# from langchain_core.runnables import RunnablePassthrough
+# from langchain_openai import ChatOpenAI
+from langchain.docstore.document import Document
+from openai import OpenAI
+# SECURITY: credentials must never be committed to the repository. The OpenAI
+# key that used to be hard-coded here has been published and must be revoked.
+# All credentials are now read from the process environment instead.
+client = OpenAI(
+  # Optional settings: None is passed through when the variable is unset.
+  organization=os.environ.get("OPENAI_ORG_ID"),
+  project=os.environ.get("OPENAI_PROJECT_ID"),
+  # Required: raise KeyError at start-up rather than fail on the first request.
+  api_key=os.environ["OPENAI_API_KEY"],
+)
+# Pinecone key for the vector index; None when the variable is unset.
+pinecone_api_key = os.environ.get("PINECONE_API_KEY")
+@st.cache_resource
+def initialize_embeddings(model_name: str = "all-mpnet-base-v2"):
+    """Create the HuggingFace embedding model used to embed queries.
+
+    Args:
+        model_name: Name of the model handed to ``HuggingFaceEmbeddings``.
+
+    Returns:
+        The ``HuggingFaceEmbeddings`` instance built for ``model_name``.
+    """
+    return HuggingFaceEmbeddings(model_name=model_name)
+@st.cache_resource
+def initialize_vector_store(pinecone_api_key: str, index_name: str):
+    """Open a Pinecone index and wrap it in a LangChain vector store.
+
+    Args:
+        pinecone_api_key: API key passed to the ``Pinecone`` client.
+        index_name: Name of the index to open.
+
+    Returns:
+        A ``(vector_store, embeddings)`` tuple: the ``PineconeVectorStore``
+        and the embedding model it was built with.
+    """
+    # Connect to Pinecone and open the requested index
+    pinecone_index = Pinecone(api_key=pinecone_api_key).Index(index_name)
+    # Default embedding model from initialize_embeddings
+    embedding_model = initialize_embeddings()
+    # The passage text is stored under the 'content' metadata key
+    store = PineconeVectorStore(
+        index=pinecone_index,
+        embedding=embedding_model,
+        text_key='content',
+    )
+    return store, embedding_model
+# Build the vector store once at import time and unpack the returned tuple.
+# Both names are module-level globals that get_docs reads when running a query.
+vector_store, embeddings = initialize_vector_store(pinecone_api_key, index_name="cpv-full-southern-africa-test")
+def get_docs(query, country=None, vulnerability_cat=None, top_k=20):
+    """Retrieve the passages most similar to ``query`` from the vector store.
+
+    Args:
+        query: Free-text question; embedded with the module-level ``embeddings``.
+        country: Country names to restrict the search to. ``None``, an empty
+            list or the string "All Countries" disables the country filter.
+        vulnerability_cat: Vulnerability categories to restrict the search to.
+            ``None`` or an empty list disables this filter.
+        top_k: Maximum number of passages to return (default 20, the value
+            that was previously hard-coded).
+
+    Returns:
+        A list of ``Document`` objects in the order returned by the store.
+        Each carries ``country``, ``document``, ``page``, ``file_name``,
+        ``vulnerability_cat``, ``score`` and a 1-based ``ref_id`` in its
+        metadata. A field missing from the stored metadata is set to ``None``.
+    """
+    # Build the metadata filter from whichever selections are active
+    filters = {}
+    if country and country != "All Countries":
+        filters['country'] = {'$in': country}
+    if vulnerability_cat:
+        filters['vulnerability_cat'] = {'$in': vulnerability_cat}
+    # Each hit is a (Document, score) pair
+    hits = vector_store.similarity_search_by_vector_with_score(
+        embeddings.embed_query(query),
+        k=top_k,
+        filter=filters or None,  # no active selection means no filter at all
+    )
+    # Number the hits from 1; ref_id is the source reference number used in
+    # the prompt and matched against the model's answer afterwards
+    ls_dict = []
+    for ref_id, (hit, score) in enumerate(hits, start=1):
+        meta = hit.metadata
+        ls_dict.append(Document(
+            page_content=hit.page_content,
+            metadata={
+                'country': meta.get('country'),
+                'document': meta.get('document'),
+                'page': meta.get('page'),
+                'file_name': meta.get('file_name'),
+                'ref_id': ref_id,
+                'vulnerability_cat': meta.get('vulnerability_cat'),
+                'score': score,
+            },
+        ))
+    return ls_dict
 prompt_template="Answer the given question using the following documents. \
 Formulate your answer in the style of an academic report. \
 ]
 def get_refs(docs, res):
   '''
   Parse response for engineered reference ids (refer to prompt template)
   # extract
   result_str = ""  # Initialize an empty string to store the result
   for i in range(len(docs)):
+      ref_id = docs[i].metadata['ref_id']
       if ref_id in ref_ids:
+        if docs[i].metadata['document'] == "Supplementary":
+            result_str += "**Ref. " + str(ref_id) + " [" + docs[i].metadata['country'] + " " + docs[i].metadata['document'] + ':' + docs[i].metadata['file_name'] + ' p' + str(docs[i].metadata['page']) + '; vulnerabilities: ' + docs[i].metadata['vulnerability_cat'] + "]:** " + "*'" + docs[i].page_content + "'*<br> <br>" # Add <br> for a line break
         else:
+            result_str += "**Ref. " + str(ref_id) + " [" + docs[i].metadata['country'] + " " + docs[i].metadata['document'] + ' p' + str(docs[i].metadata['page']) + '; vulnerabilities: ' + docs[i].metadata['vulnerability_cat'] + "]:** " + "*'" + docs[i].page_content + "'*<br> <br>" # Add <br> for a line break
   return result_str
 # define a special function for putting the prompt together (as we can't use haystack)
 def get_prompt(docs, input_query):
+  """Assemble the full LLM prompt from the retrieved passages and the question.
+
+  Each passage in ``docs`` is prefixed with its reference id and source
+  document name, the passages are joined into one context string, and the
+  result is appended to the module-level ``prompt_template`` together with
+  ``input_query``.
+  """
+  # Tag every passage with its reference id and source document
+  tagged_passages = []
+  for passage in docs:
+    header = '&&& [ref. ' + str(passage.metadata['ref_id']) + '] ' + passage.metadata['document'] + ' &&&: '
+    tagged_passages.append(header + passage.page_content)
+  context = ' - '.join(tagged_passages)
+  return prompt_template + "; Context: " + context + "; Question: " + input_query + "; Answer:"
+def run_query(query, country, model_sel):
     # first call the retriever function using selected filters
+    docs = get_docs(query, country=country,vulnerability_cat=vulnerabilities_cat)
     # model selector (not currently being used)
     if model_sel == "chatGPT":
         # instantiate ChatCompletion as a generator object (stream is set to True)
+        # response = openai.ChatCompletion.create(model="gpt-4o-mini-2024-07-18", messages=[{"role": "user", "content": get_prompt(docs, query)}], stream=True)
+        stream = client.chat.completions.create(
+            model="gpt-4o-mini-2024-07-18",
+            messages=[{"role": "user", "content": get_prompt(docs, query)}],
+            stream=True,
+        )
         # iterate through the streamed output
         report = []
+        for chunk in stream:
+            if chunk.choices[0].delta.content is not None:
+                # print(chunk.choices[0].delta.content, end="")
+                report.append(chunk.choices[0].delta.content)
                 result = "".join(report).strip()
                 res_box.success(result) # output to response text box

env ADDED Viewed

	@@ -0,0 +1,5 @@

+OPENAI_API_KEY="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"
+HF_API_KEY="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"
+PINECONE_API_KEY="c3f5717c-f43a-46d0-893e-02b44dbcf13b"
+USER1_HASH="$2b$12$hZbOi6zKmQQWvvpcllds9uAB3ili66N0aQyPzuDctl7IkNhl226oG"
+USER2_HASH="$2b$12$kWnArbA.2QTkpMv2yvE2J.7UJw0Fgc/3FH1k5JRqhjg.cvytriGt2"