TillLangbein committed
Commit 1c3ef38 · 1 Parent(s): cfa680c

Reworked citation system

Files changed (2)
  1. app.py +105 -52
  2. prompts.py +23 -8
app.py CHANGED
@@ -1,9 +1,11 @@
 import getpass
 import os
 import random
+import re
 
 from langchain_openai import ChatOpenAI
 from langchain_core.globals import set_llm_cache
+from langchain_core.documents import Document
 from langchain_community.cache import SQLiteCache
 from langchain_community.vectorstores import FAISS
 from langchain_openai import OpenAIEmbeddings
@@ -65,6 +67,14 @@ class GradeAnswer(BaseModel):
         description="Answer addresses the question, 'yes' or 'no'"
     )
 
+class AnswerWithCitations(BaseModel):
+    answer: str = Field(
+        description="Comprehensive answer to the user's question with citations.",
+    )
+    citations: List[str] = Field(
+        description="List of the first 20 characters of sources cited in the answer."
+    )
+
 class GraphState(TypedDict):
     """
     Represents the state of our graph.
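Note on the new model: in the `__main__` block further down, the answer chain is rebuilt with `tool_llm.with_structured_output(AnswerWithCitations)`, so invoking it returns this Pydantic object rather than a plain string. A minimal sketch of the shape (the question and values are invented for illustration):

    chain = ANSWER_PROMPT | tool_llm.with_structured_output(AnswerWithCitations)
    result = chain.invoke({"context": docs, "question": "What does DORA require?"})
    result.answer      # str containing <sup>[n]</sup> citation markers
    result.citations   # e.g. ["Article\xa08Identificat"], ~20-char source prefixes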
@@ -82,6 +92,7 @@ class GraphState(TypedDict):
     dora_docs: List[str]
     dora_rts_docs: List[str]
     dora_news_docs: List[str]
+    citations: List[str]
 
 def _set_env(var: str):
     if os.environ.get(var):
@@ -92,13 +103,13 @@ def load_vectorstores(paths: list):
     # The dora vectorstore
    embd = OpenAIEmbeddings()
    model = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-base")
-    compressor = CrossEncoderReranker(model=model, top_n=7)
+    compressor = CrossEncoderReranker(model=model, top_n=4)
 
     vectorstores = [FAISS.load_local(path, embd, allow_dangerous_deserialization=True) for path in paths]
     base_retrievers = [vectorstore.as_retriever(search_type="mmr", search_kwargs={
-        "k": 10,
-        "fetch_k": 20,
-        "score_threshold": 0.7,
+        "k": 7,
+        "fetch_k": 10,
+        "score_threshold": 0.8,
     }) for vectorstore in vectorstores]
 
     retrievers = [ContextualCompressionRetriever(
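The retrieval budget is tightened here: each query now over-fetches 10 candidates, MMR keeps 7 diverse ones, and the cross-encoder reranker passes only the best 4 on to generation, which keeps the citation matching below tractable. (Whether `score_threshold` has any effect under `search_type="mmr"` depends on the vector store; it is primarily honored by the `similarity_score_threshold` search type.) A sketch of one assembled retriever, with an invented query:

    retriever = ContextualCompressionRetriever(
        base_compressor=compressor,        # CrossEncoderReranker, top_n=4
        base_retriever=base_retrievers[0],
    )
    docs = retriever.invoke("ICT incident reporting deadlines")  # at most 4 Documents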
@@ -106,7 +117,48 @@ def load_vectorstores(paths: list):
     ) for retriever in base_retrievers]
 
     return retrievers
-
+
+def starts_with_ignoring_blanks(full_text, prefix):
+    # Normalize all types of blanks to regular spaces
+    normalized_full_text = re.sub(r'\s+', ' ', full_text.strip())
+    normalized_prefix = re.sub(r'\s+', ' ', prefix.strip())
+
+    # Check if the normalized full text starts with the normalized prefix
+    return normalized_full_text.startswith(normalized_prefix)
+
+def match_citations_to_documents(citations: List[str], documents: List[Document]):
+    """
+    Matches citations to documents by searching for the source and section in the documents.
+
+    Args:
+        citations (List[str]): List of citations to match
+        documents (List[Document]): List of documents to search in
+
+    Returns:
+        dict: Dictionary of matched documents, keyed by citation number, with the formatted document as value
+    """
+    matched_documents = {}
+
+    for num, citation in enumerate(citations, 1):
+        # Try to find a document whose text begins with this citation
+        print(f"Checking citation {num}: {citation}")
+        for doc in documents:
+            print(f"Does this: '{doc.page_content[:30]}' start with this: '{citation}'?")
+            print(f"{doc.page_content[:40] = }")
+            print(f"{citation = }")
+            print(f"{doc.page_content[:40].startswith(citation) = }")
+            if starts_with_ignoring_blanks(doc.page_content[:40], citation):  # Strangely, the 25 characters of the citation often become 35
+                print("yes")
+                if doc.metadata.get("section", None):
+                    matched_documents[f"<sup>{num}</sup>"] = f"***{doc.metadata['source']} section {doc.metadata['section']}***: {doc.page_content}"
+                else:
+                    matched_documents[f"<sup>{num}</sup>"] = f"***{doc.metadata['source']}***: {doc.page_content}"
+                break
+            else:
+                print("no")
+
+    return matched_documents
+
 # Put all chains in functions
 def dora_rewrite(state):
     """
@@ -168,8 +220,14 @@ def generate(state):
     documents = state["documents"]
 
     # RAG generation
-    generation = answer_chain.invoke({"context": documents, "question": question})
-    return {"generation": generation}
+    answer = answer_chain.invoke({"context": documents, "question": question})
+
+    generation = answer.answer
+    print(f"{answer.citations = }")
+    citations = match_citations_to_documents(answer.citations, documents)
+    print(f"{len(citations)} found, is that correct?")
+
+    return {"generation": generation, "citations": citations}
 
 def transform_query(state):
     """
@@ -271,10 +329,7 @@ def grade_generation_v_documents_and_question(state):
             print("---DECISION: GENERATION DOES NOT ADDRESS QUESTION---")
             return "not useful"
     else:
-        for document in documents:
-            print(document.page_content)
         print("---DECISION: THOSE DOCUMENTS ARE NOT GROUNDING THIS GENERATION---")
-        print(f"{generation = }")
         return "not supported"
 
 # Then compile the graph
@@ -308,7 +363,7 @@ def compile_graph():
         "generate",
         grade_generation_v_documents_and_question,
         {
-            "not supported": "generate",
+            "not supported": "transform_query",
             "useful": END,
             "not useful": "transform_query",
         },
@@ -323,19 +378,20 @@ def generate_response(question: str, dora: bool, rts: bool, news: bool):
     state = app.invoke({"question": question, "selected_sources": selected_sources})
     return (
         state["generation"],
-        ('\n\n'.join([f"***{doc.metadata['source']} section {doc.metadata['section']}***: {doc.page_content}" for doc in state["dora_docs"]])) if "dora_docs" in state and state["dora_docs"] else 'No documents available.',
-        ('\n\n'.join([f"***{doc.metadata['source']}, section {doc.metadata['section']}***: {doc.page_content}" for doc in state["dora_rts_docs"]])) if "dora_rts_docs" in state and state["dora_rts_docs"] else 'No documents available.',
-        ('\n\n'.join([f"***{doc.metadata['source']}***: {doc.page_content}" for doc in state["dora_news_docs"]])) if "dora_news_docs" in state and state["dora_news_docs"] else 'No documents available.',
+        ('\n\n'.join([f"{num} - {doc}" for num, doc in state["citations"].items()])) if "citations" in state and state["citations"] else 'No citations available.',
+        # ('\n\n'.join([f"***{doc.metadata['source']} section {doc.metadata['section']}***: {doc.page_content}" for doc in state["dora_docs"]])) if "dora_docs" in state and state["dora_docs"] else 'No documents available.',
+        # ('\n\n'.join([f"***{doc.metadata['source']}, section {doc.metadata['section']}***: {doc.page_content}" for doc in state["dora_rts_docs"]])) if "dora_rts_docs" in state and state["dora_rts_docs"] else 'No documents available.',
+        # ('\n\n'.join([f"***{doc.metadata['source']}***: {doc.page_content}" for doc in state["dora_news_docs"]])) if "dora_news_docs" in state and state["dora_news_docs"] else 'No documents available.',
     )
 
 def show_loading(prompt: str):
-    return [prompt, "loading", "loading", "loading", "loading"]
+    return [prompt, "loading", "loading"]
 
 def on_click():
     return "I would love to hear your opinion: \[email protected]"
 
 def clear_results():
-    return "", "", "", "", ""
+    return "", "", ""
 
 def random_prompt():
     return random.choice([
@@ -360,8 +416,31 @@ def load_css():
     with open('./style.css', 'r') as file:
         return file.read()
 
-def run_gradio():
-    with gr.Blocks(title='Artificial Compliance', css=load_css(), fill_width=True, fill_height=True,) as gradio_ui:
+if __name__ == "__main__":
+    _set_env("OPENAI_API_KEY")
+    set_llm_cache(SQLiteCache(database_path=".cache.db"))
+
+    dora_retriever, dora_rts_retriever, dora_news_retriever = load_vectorstores(
+        ["./dora_vectorstore_data_faiss.vst",
+         "./rts_eur_lex_vectorstore_faiss.vst",
+         "./bafin_news_vectorstore_faiss.vst",]
+    )
+
+    fast_llm = ChatOpenAI(model="gpt-3.5-turbo")
+    tool_llm = ChatOpenAI(model="gpt-4o")
+    rewrite_llm = ChatOpenAI(model="gpt-4o", temperature=1, cache=False)
+
+    dora_question_rewriter = IMPROVE_PROMPT | tool_llm | StrOutputParser()
+    answer_chain = ANSWER_PROMPT | tool_llm.with_structured_output(
+        AnswerWithCitations, include_raw=False
+    ).with_config(run_name="GenerateAnswer")
+    hallucination_grader = HALLUCINATION_PROMPT | fast_llm.with_structured_output(GradeHallucinations)
+    answer_grader = RESOLVER_PROMPT | fast_llm.with_structured_output(GradeAnswer)
+    question_rewriter = REWRITER_PROMPT | rewrite_llm | StrOutputParser()
+
+    app = compile_graph()
+
+    with gr.Blocks(title='Artificial Compliance', css=load_css(), fill_width=True, fill_height=True,) as demo:
         # theme=gr.themes.Monochrome(),
         # Adding a sliding navbar
         with gr.Column(scale=1, elem_id='navbar'):
@@ -401,11 +480,11 @@ def run_gradio():
             llm_generation = gr.Markdown(label="LLM Generation", elem_id="llm_generation")
 
             gr.Markdown("----------------------------------------------------------------------------")
 
             with gr.Row(elem_id='text_block'):
-                dora_documents = gr.Markdown(label="DORA Documents")
-                dora_rts_documents = gr.Markdown(label="DORA RTS Documents")
-                dora_news_documents = gr.Markdown(label="Bafin supporting Documents")
+                citations = gr.Markdown(label="citations", elem_id="llm_generation")
+
+            gr.Markdown("----------------------------------------------------------------------------")
 
         # Adding a footer with impressum and contact
         with gr.Row(elem_classes="footer"):
@@ -415,10 +494,10 @@ def run_gradio():
         gr.on(
             triggers=[question_prompt.submit, submit_button.click],
             inputs=[question_prompt],
-            outputs=[display_prompt, llm_generation, dora_documents, dora_rts_documents, dora_news_documents],
+            outputs=[display_prompt, llm_generation, citations],
             fn=show_loading
         ).then(
-            outputs=[llm_generation, dora_documents, dora_rts_documents, dora_news_documents],
+            outputs=[llm_generation, citations],
             inputs=[question_prompt, dora_chatbot_button, document_workbench_button, newsfeed_button],
             fn=generate_response
         )
@@ -431,35 +510,9 @@ def run_gradio():
         )
 
         # Clearing out all results when the appropriate button is clicked
-        clear_results_button.click(fn=clear_results, outputs=[display_prompt, llm_generation, dora_documents, dora_rts_documents, dora_news_documents])
-
-        gradio_ui.launch()
-
-
-if __name__ == "__main__":
-    _set_env("OPENAI_API_KEY")
-    set_llm_cache(SQLiteCache(database_path=".cache.db"))
-
-    dora_retriever, dora_rts_retriever, dora_news_retriever = load_vectorstores(
-        ["./dora_vectorstore_data_faiss.vst",
-         "./rts_eur_lex_vectorstore_faiss.vst",
-         "./bafin_news_vectorstore_faiss.vst",]
-    )
-
-    fast_llm = ChatOpenAI(model="gpt-3.5-turbo")
-    tool_llm = ChatOpenAI(model="gpt-4o")
-    rewrite_llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=1, cache=False)
-
-    dora_question_rewriter = IMPROVE_PROMPT | tool_llm | StrOutputParser()
-    answer_chain = ANSWER_PROMPT | tool_llm | StrOutputParser()
-    hallucination_grader = HALLUCINATION_PROMPT | fast_llm.with_structured_output(GradeHallucinations)
-    answer_grader = RESOLVER_PROMPT | fast_llm.with_structured_output(GradeAnswer)
-    question_rewriter = REWRITER_PROMPT | rewrite_llm | StrOutputParser()
-
-    app = compile_graph()
-
-    # And finally, run the app
-    run_gradio()
+        clear_results_button.click(fn=clear_results, outputs=[display_prompt, llm_generation, citations])
 
+    demo.launch()
 
prompts.py CHANGED
@@ -20,14 +20,29 @@ ANSWER_PROMPT = ChatPromptTemplate.from_messages(
     [
         (
             "system",
-            "You are a highly experienced IT auditor, specializing in information security and regulatory compliance. "
-            "Your task is to assist a colleague who has approached you with a question. "
-            "You have access to relevant context, provided here: {context}. "
-            "Make your response as informative as possible and make sure every sentence is supported by the provided context."
-            "Each information must be backed up by a citation from at least one of the information sources in the context, formatted as a footnote, reproducing the source after your response."
-            "Your answer should be structured and suitable for regulatory documentation or audit reporting. "
-            "If you do not have a citation from the provided source material in the message, explicitly state: 'No citations found.' Never generate a citation if no source material is provided."
-            "Ensure all relevant details from the context are included in your response."
+            """You are an experienced IT auditor specializing in information security and regulatory compliance.
+            Your task is to assist a colleague who has a question. You have access to the following context: {context}.
+            Ensure your response is comprehensive and includes as much information from the context as possible.
+            Strive to include citations from as many different documents as relevant.
+            Make your response as informative as possible and make sure every sentence is supported by the provided information.
+            Each claim in the response must be backed up by a citation from at least one of the information sources.
+            Each citation should be the first 20 characters of the source content used.
+            If you do not have a citation from the provided source material in the message, explicitly state: 'No citations found.' Never generate a citation if no source material is provided.
+
+            Example Answer:
+            Deploying a Security Information and Event Management (SIEM) system with Extended Detection and Response (XDR) is ok <sup>[1]</sup>. But it is not ok to deploy a SIEM system with Extended Incident Management (XIM) <sup>[2]</sup>.
+
+            Example Footnotes:
+            [^1]: "Article\xa08Identification1."
+            [^2]: "Article\xa029Preliminary ass"
+
+            Example Answer 2:
+            The Digital Operational Resilience Act (DORA) outlines several key requirements and obligations for ICT risk management within financial entities <sup>[1]</sup>. One of the primary obligations is the implementation of ICT security policies <sup>[2]</sup>.
+
+            Example Footnotes 2:
+            [^1]: "the implementation of the"
+            [^2]: "(EU) 2022/2554;(i)the cla"
+            """
         ),
         ("user", "{question}"),
     ]
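Read together with AnswerWithCitations in app.py, a response produced under this prompt might parse into something like the following (contents invented for illustration):

    AnswerWithCitations(
        answer="Financial entities must implement an ICT risk management framework <sup>[1]</sup>.",
        citations=["Article\xa08Identificat"],  # ~20-character prefix of the cited chunk
    )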