adrienbrdne committed
Commit 70d06c8 · verified · 1 Parent(s): f8ac349

Upload 5 files

ki_gen/data_processor.py ADDED
@@ -0,0 +1,185 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ from langchain_openai import ChatOpenAI
5
+ from langchain_core.output_parsers import StrOutputParser
6
+ from langchain_core.prompts import ChatPromptTemplate
7
+ from langchain_groq import ChatGroq
8
+ from langgraph.graph import StateGraph
9
+ from llmlingua import PromptCompressor
10
+
11
+ from ki_gen.utils import ConfigSchema, DocProcessorState, get_model, format_doc
12
+ from langgraph.checkpoint.sqlite import SqliteSaver
13
+
14
+
15
+
16
+
17
+ # compressed_prompt = llm_lingua.compress_prompt(prompt, instruction="", question="", target_token=200)
18
+
19
+ ## Or use a quantized model, like TheBloke/Llama-2-7b-Chat-GPTQ, which only needs <8GB of GPU memory.
20
+ ## Before that, you need to pip install optimum auto-gptq
21
+ # llm_lingua = PromptCompressor("TheBloke/Llama-2-7b-Chat-GPTQ", model_config={"revision": "main"})
22
+
23
+
24
+
25
+ # Requires ~2GB of RAM
26
+ def get_llm_lingua(compress_method:str = "llm_lingua2"):
27
+
28
+ # Requires ~2GB memory
29
+ if compress_method == "llm_lingua2":
30
+ llm_lingua2 = PromptCompressor(
31
+ model_name="microsoft/llmlingua-2-xlm-roberta-large-meetingbank",
32
+ use_llmlingua2=True,
33
+ device_map="cpu"
34
+ )
35
+ return llm_lingua2
36
+
37
+ # Requires ~8GB memory
38
+ elif compress_method == "llm_lingua":
39
+ llm_lingua = PromptCompressor(
40
+ model_name="microsoft/phi-2",
41
+ device_map="cpu"
42
+ )
43
+ return llm_lingua
44
+ raise ValueError("Incorrect compression method, should be 'llm_lingua' or 'llm_lingua2'")
45
+
46
+
47
+
48
+ def compress(state: DocProcessorState, config: ConfigSchema):
49
+ """
50
+ This node compresses the last processing result for each doc using llm_lingua
51
+ """
52
+ doc_process_histories = state["docs_in_processing"]
53
+ llm_lingua = get_llm_lingua(config["configurable"].get("compression_method") or "llm_lingua2")
54
+ for doc_process_history in doc_process_histories:
55
+ doc_process_history.append(llm_lingua.compress_prompt(
56
+ context=str(doc_process_history[-1]),
57
+ rate=config["configurable"].get("compress_rate") or 0.33,
58
+ force_tokens=config["configurable"].get("force_tokens") or ['\n', '?', '.', '!', ',']
59
+ )["compressed_prompt"]
60
+ )
61
+
62
+ return {"docs_in_processing": doc_process_histories, "current_process_step" : state["current_process_step"] + 1}
63
+
64
+ def summarize_docs(state: DocProcessorState, config: ConfigSchema):
65
+ """
66
+ This node summarizes the last processing result of each doc in state["docs_in_processing"]
67
+ """
68
+
69
+ prompt = """You are a 3GPP standardization expert.
70
+ Summarize the provided document in simple technical English for other experts in the field.
71
+
72
+ Document:
73
+ {document}"""
74
+ sysmsg = ChatPromptTemplate.from_messages([
75
+ ("system", prompt)
76
+ ])
77
+ model = config["configurable"].get("summarize_model") or "deepseek-r1-distill-llama-70b"
78
+ doc_process_histories = state["docs_in_processing"]
79
+ if model == "gpt-4o":
80
+ llm_summarize = ChatOpenAI(model='gpt-4o', base_url="https://llm.synapse.thalescloud.io/")
81
+ else:
82
+ llm_summarize = ChatGroq(model=model)
83
+ summarize_chain = sysmsg | llm_summarize | StrOutputParser()
84
+
85
+ for doc_process_history in doc_process_histories:
86
+ doc_process_history.append(summarize_chain.invoke({"document" : str(doc_process_history[-1])}))
87
+
88
+ return {"docs_in_processing": doc_process_histories, "current_process_step": state["current_process_step"] + 1}
89
+
90
+ def custom_process(state: DocProcessorState):
91
+ """
92
+ Custom processing step, params are stored in a dict in state["process_steps"][state["current_process_step"]]
93
+ processing_model : the LLM which will perform the processing
94
+ context : the previous processing results to send as context to the LLM
95
+ prompt : the prompt/task which will be appended to the context before sending to the LLM
96
+ """
97
+
98
+ processing_params = state["process_steps"][state["current_process_step"]]
99
+ model = processing_params.get("processing_model") or "deepseek-r1-distill-llama-70b"
100
+ user_prompt = processing_params["prompt"]
101
+ context = processing_params.get("context") or [0]
102
+ doc_process_histories = state["docs_in_processing"]
103
+ if not isinstance(context, list):
104
+ context = [context]
105
+
106
+ processing_chain = get_model(model=model) | StrOutputParser()
107
+
108
+ for doc_process_history in doc_process_histories:
109
+ context_str = ""
110
+ for i, context_element in enumerate(context):
111
+ context_str += f"### TECHNICAL INFORMATION {i+1} \n {doc_process_history[context_element]}\n\n"
112
+ doc_process_history.append(processing_chain.invoke(context_str + user_prompt))
113
+
114
+ return {"docs_in_processing" : doc_process_histories, "current_process_step" : state["current_process_step"] + 1}
115
+
116
+ def final(state: DocProcessorState):
117
+ """
118
+ A node to store the final results of processing in the 'valid_docs' field
119
+ """
120
+ return {"valid_docs" : [doc_process_history[-1] for doc_process_history in state["docs_in_processing"]]}
121
+
122
+ # TODO : remove this node and use conditional entry point instead
123
+ def get_process_steps(state: DocProcessorState, config: ConfigSchema):
124
+ """
125
+ Dummy node
126
+ """
127
+ # if not process_steps:
128
+ # process_steps = eval(input("Enter processing steps: "))
129
+ return {"current_process_step": 0, "docs_in_processing" : [[format_doc(doc)] for doc in state["valid_docs"]]}
130
+
131
+
132
+ def next_processor_step(state: DocProcessorState):
133
+ """
134
+ Conditional edge function to go to next processing step
135
+ """
136
+ process_steps = state["process_steps"]
137
+ if state["current_process_step"] < len(process_steps):
138
+ step = process_steps[state["current_process_step"]]
139
+ if isinstance(step, dict):
140
+ step = "custom"
141
+ else:
142
+ step = "final"
143
+
144
+ return step
145
+
146
+
147
+ def build_data_processor_graph(memory):
148
+ """
149
+ Builds the data processor graph
150
+ """
151
+ #with SqliteSaver.from_conn_string(":memory:") as memory :
152
+
153
+ graph_builder_doc_processor = StateGraph(DocProcessorState)
154
+
155
+ graph_builder_doc_processor.add_node("get_process_steps", get_process_steps)
156
+ graph_builder_doc_processor.add_node("summarize", summarize_docs)
157
+ graph_builder_doc_processor.add_node("compress", compress)
158
+ graph_builder_doc_processor.add_node("custom", custom_process)
159
+ graph_builder_doc_processor.add_node("final", final)
160
+
161
+ graph_builder_doc_processor.add_edge("__start__", "get_process_steps")
162
+ graph_builder_doc_processor.add_conditional_edges(
163
+ "get_process_steps",
164
+ next_processor_step,
165
+ {"compress" : "compress", "final": "final", "summarize": "summarize", "custom" : "custom"}
166
+ )
167
+ graph_builder_doc_processor.add_conditional_edges(
168
+ "summarize",
169
+ next_processor_step,
170
+ {"compress" : "compress", "final": "final", "custom" : "custom"}
171
+ )
172
+ graph_builder_doc_processor.add_conditional_edges(
173
+ "compress",
174
+ next_processor_step,
175
+ {"summarize" : "summarize", "final": "final", "custom" : "custom"}
176
+ )
177
+ graph_builder_doc_processor.add_conditional_edges(
178
+ "custom",
179
+ next_processor_step,
180
+ {"summarize" : "summarize", "final": "final", "compress" : "compress", "custom" : "custom"}
181
+ )
182
+ graph_builder_doc_processor.add_edge("final", "__end__")
183
+
184
+ graph_doc_processor = graph_builder_doc_processor.compile(checkpointer=memory)
185
+ return graph_doc_processor
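
A minimal usage sketch for this processor subgraph, assuming the Groq/OpenAI keys are already set in the environment; the document content, thread_id and step list below are illustrative placeholders, not values from the commit:

from langgraph.checkpoint.memory import MemorySaver
from ki_gen.data_processor import build_data_processor_graph

memory = MemorySaver()
processor = build_data_processor_graph(memory)

# Each doc in 'valid_docs' is summarized and then compressed, as listed in 'process_steps'.
final_state = processor.invoke(
    {
        "valid_docs": [{"title": "TS 22.261", "description": "Service requirements for the 5G system"}],
        "process_steps": ["summarize", "compress"],
    },
    config={"configurable": {"thread_id": "demo", "compression_method": "llm_lingua2"}},
)
print(final_state["valid_docs"])
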
ki_gen/data_retriever.py ADDED
@@ -0,0 +1,406 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ import re
5
+ import time
6
+ from random import shuffle, sample
7
+ from langgraph.checkpoint.sqlite import SqliteSaver
8
+
9
+ from langchain_groq import ChatGroq
10
+ from langchain_openai import ChatOpenAI
11
+ from langchain_core.messages import HumanMessage
12
+ from langchain_community.graphs import Neo4jGraph
13
+ from langchain_community.chains.graph_qa.cypher_utils import CypherQueryCorrector, Schema
14
+ from langchain_core.output_parsers import StrOutputParser
15
+ from langchain_core.prompts import ChatPromptTemplate
16
+ from pydantic import BaseModel, Field
17
+
18
+
19
+
20
+ from langgraph.graph import StateGraph
21
+
22
+ from llmlingua import PromptCompressor
23
+
24
+ from ki_gen.prompts import (
25
+ CYPHER_GENERATION_PROMPT,
26
+ CONCEPT_SELECTION_PROMPT,
27
+ BINARY_GRADER_PROMPT,
28
+ SCORE_GRADER_PROMPT,
29
+ RELEVANT_CONCEPTS_PROMPT,
30
+ )
31
+ from ki_gen.utils import ConfigSchema, DocRetrieverState, get_model, format_doc
32
+
33
+
34
+
35
+
36
+ def extract_cypher(text: str) -> list[str]:
37
+ """Extract Cypher code from a text.
38
+
39
+ Args:
40
+ text: Text to extract Cypher code from.
41
+
42
+ Returns:
43
+ A list of candidate Cypher snippets: the first ```cypher``` block, the first generic ``` block, and the raw text as a fallback.
44
+ """
45
+ # The pattern to find Cypher code enclosed in triple backticks
46
+ pattern_1 = r"```cypher\n(.*?)```"
47
+ pattern_2 = r"```\n(.*?)```"
48
+
49
+ # Find all matches in the input text
50
+ matches_1 = re.findall(pattern_1, text, re.DOTALL)
51
+ matches_2 = re.findall(pattern_2, text, re.DOTALL)
52
+ return [
53
+ matches_1[0] if matches_1 else text,
54
+ matches_2[0] if matches_2 else text,
55
+ text
56
+ ]
57
+
58
+ def get_cypher_gen_chain(model: str = "deepseek-r1-distill-llama-70b"):
59
+ """
60
+ Returns cypher gen chain using specified model for generation
61
+ This is used when the 'auto' cypher generation method has been configured
62
+ """
63
+
64
+ if model=="openai":
65
+ llm_cypher_gen = ChatOpenAI(model='gpt-4o', base_url="https://llm.synapse.thalescloud.io/")
66
+ else:
67
+ llm_cypher_gen = ChatGroq(model=model)
68
+ cypher_gen_chain = CYPHER_GENERATION_PROMPT | llm_cypher_gen | StrOutputParser() | extract_cypher
69
+ return cypher_gen_chain
70
+
71
+ def get_concept_selection_chain(model: str = "deepseek-r1-distill-llama-70b"):
72
+ """
73
+ Returns a chain to select the most relevant topic using specified model for generation.
74
+ This is used when the 'guided' cypher generation method has been configured
75
+ """
76
+
77
+ if model == "openai":
78
+ llm_topic_selection = ChatOpenAI(model='gpt-4o', base_url="https://llm.synapse.thalescloud.io/")
79
+ else:
80
+ llm_topic_selection = ChatGroq(model=model)
81
+ print(f"FOUND LLM TOPIC SELECTION FOR THE CONCEPT SELECTION PROMPT : {llm_topic_selection}")
82
+ topic_selection_chain = CONCEPT_SELECTION_PROMPT | llm_topic_selection | StrOutputParser()
83
+ return topic_selection_chain
84
+
85
+ def get_concepts(graph: Neo4jGraph):
86
+ concept_cypher = "MATCH (c:Concept) return c"
87
+ if isinstance(graph, Neo4jGraph):
88
+ concepts = graph.query(concept_cypher)
89
+ else:
90
+ user_input = input("Topics : ")
91
+ concepts = eval(user_input)
92
+
93
+ concepts_name = [concept['c']['name'] for concept in concepts]
94
+ return concepts_name
95
+
96
+ def get_related_concepts(graph: Neo4jGraph, question: str):
97
+ concepts = get_concepts(graph)
98
+ llm = get_model()
99
+ print(f"this is the llm variable : {llm}")
100
+ def parse_answer(llm_answer : str):
101
+ try:
102
+ print(f"This the llm_answer : {llm_answer}")
103
+ return re.split(r"\n(?:\d)+\.\s", llm_answer.split("Concepts:")[1])[1:]
104
+ except Exception:
105
+ return []
106
+ related_concepts_chain = RELEVANT_CONCEPTS_PROMPT | llm | StrOutputParser() | parse_answer
107
+
108
+ print(f"This is the question of the user : {question}")
109
+ print(f"This is the concepts of the user : {concepts}")
110
+
111
+
112
+ #groq.APIStatusError: Error code: 413 - {'error': {'message': 'Request too large for model `deepseek-r1-distill-llama-70b` in organization `org_01j6xywkndffv96m3wgh81jm49` on tokens per minute
113
+ # (TPM): Limit 5000, Requested 17099, please reduce your message size and try again. Visit https://console.groq.com/docs/rate-limits for more information.',
114
+ # 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
115
+
116
+ try:
117
+ related_concepts_raw = related_concepts_chain.invoke({"user_query" : question, "concepts" : '\n'.join(concepts)})
118
+ print(f"related_concepts_raw : {related_concepts_raw}")
119
+ except Exception as e:
120
+ if getattr(e, "status_code", None) == 413:
121
+ msg = e.body["error"]["message"]
122
+ print(f"question is : {question}")
123
+ print(type(question))
124
+ error_question = ["user_query", question]
125
+ related_concepts_raw = error_concept_groq(msg,concepts,related_concepts_chain,error_question)
126
+ pass
127
+
128
+ # We clean up the list we received from the LLM in case there were some hallucinations
129
+ related_concepts_cleaned = []
130
+ for related_concept in related_concepts_raw:
131
+ # If the concept returned from the LLM is in the list we keep it
132
+ if related_concept in concepts:
133
+ related_concepts_cleaned.append(related_concept)
134
+ else:
135
+ # The LLM sometimes only forgets a few words from the concept name
136
+ # We check if the generated concept is a substring of an existing one and if it is the case add it to the list
137
+ for concept in concepts:
138
+ if related_concept in concept:
139
+ related_concepts_cleaned.append(concept)
140
+ break
141
+
142
+ # TODO : Add concepts found via similarity search
143
+ return related_concepts_cleaned
144
+
145
+ def build_concept_string(graph: Neo4jGraph, concept_list: list[str]):
146
+ concept_string = ""
147
+ for concept in concept_list:
148
+ concept_description_query = f"""
149
+ MATCH (c:Concept {{name: "{concept}" }}) RETURN c.description
150
+ """
151
+ concept_description = graph.query(concept_description_query)[0]['c.description']
152
+ concept_string += f"name: {concept}\ndescription: {concept_description}\n\n"
153
+ return concept_string
154
+
155
+ def get_global_concepts(graph: Neo4jGraph):
156
+ concept_cypher = "MATCH (gc:GlobalConcept) return gc"
157
+ if isinstance(graph, Neo4jGraph):
158
+ concepts = graph.query(concept_cypher)
159
+ else:
160
+ user_input = input("Topics : ")
161
+ concepts = eval(user_input)
162
+
163
+ concepts_name = [concept['gc']['name'] for concept in concepts]
164
+ return concepts_name
165
+
166
+ def generate_cypher(state: DocRetrieverState, config: ConfigSchema):
167
+ """
168
+ The node where the cypher is generated
169
+ """
170
+ graph = config["configurable"].get("graph")
171
+ question = state['query']
172
+ related_concepts = get_related_concepts(graph, question)
173
+ cyphers = []
174
+
175
+ if config["configurable"].get("cypher_gen_method") == 'auto':
176
+ cypher_gen_chain = get_cypher_gen_chain()
177
+ cyphers = cypher_gen_chain.invoke({
178
+ "schema": graph.schema,
179
+ "question": question,
180
+ "concepts": related_concepts
181
+ })
182
+
183
+ try :
184
+
185
+ if config["configurable"].get("cypher_gen_method") == 'guided':
186
+ concept_selection_chain = get_concept_selection_chain()
187
+ print(f"Concept selection chain is : {concept_selection_chain}")
188
+ selected_topic = concept_selection_chain.invoke({"question" : question, "concepts": get_concepts(graph)})
189
+ print(f"Selected topic are : {selected_topic}")
190
+
191
+ except Exception as e:
192
+ error_question = ["question", question]
193
+ selected_topic = error_concept_groq(e.body["error"]["message"],get_concepts(graph),concept_selection_chain,error_question)
194
+ pass
195
+
196
+ if config["configurable"].get("cypher_gen_method") == 'guided':
197
+ cyphers = [generate_cypher_from_topic(selected_topic, state['current_plan_step'])]
198
+ print(f"Cyphers are : {cyphers}")
199
+
200
+ if config["configurable"].get("validate_cypher"):
201
+ corrector_schema = [Schema(el["start"], el["type"], el["end"]) for el in graph.structured_schema.get("relationships")]
202
+ cypher_corrector = CypherQueryCorrector(corrector_schema)
203
+ cyphers = [cypher_corrector(cypher) for cypher in cyphers]
204
+
205
+ return {"cyphers" : cyphers}
206
+
207
+ def generate_cypher_from_topic(selected_concept: str, plan_step: int):
208
+ """
209
+ Helper function used when the 'guided' cypher generation method has been configured
210
+ """
211
+
212
+ print(f"L.176 PLAN STEP : {plan_step}")
213
+ cypher_el = "(n) return n.title, n.description"
214
+ match plan_step:
215
+ case 0:
216
+ cypher_el = "(ts:TechnicalSpecification) RETURN ts.title, ts.scope, ts.description"
217
+ case 1:
218
+ cypher_el = "(rp:ResearchPaper) RETURN rp.title, rp.abstract"
219
+ case 2:
220
+ cypher_el = "(ki:KeyIssue) RETURN ki.description"
221
+ return f"MATCH (c:Concept {{name:'{selected_concept}'}})-[:RELATED_TO]-{cypher_el}"
222
+
223
+ def get_docs(state:DocRetrieverState, config:ConfigSchema):
224
+ """
225
+ This node retrieves docs from the graph using the generated cypher
226
+ """
227
+ graph = config["configurable"].get("graph")
228
+ output = []
229
+ if graph is not None:
230
+ for cypher in state["cyphers"]:
231
+ try:
232
+ output = graph.query(cypher)
233
+ break
234
+ except Exception as e:
235
+ print(f"Failed to retrieve docs : {e}")
236
+
237
+ # Clean up the docs we received as there may be duplicates depending on the cypher query
238
+ all_docs = []
239
+ for doc in output:
240
+ unwinded_doc = {}
241
+ for key in doc:
242
+ if isinstance(doc[key], dict):
243
+ all_docs.append(doc[key])
244
+ else:
245
+ unwinded_doc.update({key: doc[key]})
246
+ if unwinded_doc:
247
+ all_docs.append(unwinded_doc)
248
+
249
+
250
+ filtered_docs = []
251
+ for doc in all_docs:
252
+ if doc not in filtered_docs:
253
+ filtered_docs.append(doc)
254
+
255
+ return {"docs": filtered_docs}
256
+
257
+
258
+
259
+
260
+
261
+ # Data model
262
+ class GradeDocumentsBinary(BaseModel):
263
+ """Binary score for relevance check on retrieved documents."""
264
+
265
+ binary_score: str = Field(
266
+ description="Documents are relevant to the question, 'yes' or 'no'"
267
+ )
268
+
269
+ # LLM with function call
270
+ # llm_grader_binary = ChatGroq(model="deepseek-r1-distill-llama-70b", temperature=0)
271
+
272
+ def get_binary_grader(model="deepseek-r1-distill-llama-70b"):
273
+ """
274
+ Returns a binary grader to evaluate relevance of documents using specified model for generation
275
+ This is used when the 'binary' evaluation method has been configured
276
+ """
277
+
278
+
279
+ if model == "gpt-4o":
280
+ llm_grader_binary = ChatOpenAI(model='gpt-4o', base_url="https://llm.synapse.thalescloud.io/", temperature=0)
281
+ else:
282
+ llm_grader_binary = ChatGroq(model=model, temperature=0)
283
+ structured_llm_grader_binary = llm_grader_binary.with_structured_output(GradeDocumentsBinary)
284
+ retrieval_grader_binary = BINARY_GRADER_PROMPT | structured_llm_grader_binary
285
+ return retrieval_grader_binary
286
+
287
+
288
+ class GradeDocumentsScore(BaseModel):
289
+ """Score for relevance check on retrieved documents."""
290
+
291
+ score: float = Field(
292
+ description="Documents are relevant to the question, score between 0 (completely irrelevant) and 1 (perfectly relevant)"
293
+ )
294
+
295
+ def get_score_grader(model="deepseek-r1-distill-llama-70b"):
296
+ """
297
+ Returns a score grader to evaluate relevance of documents using specified model for generation
298
+ This is used when the 'score' evaluation method has been configured
299
+ """
300
+ if model == "gpt-4o":
301
+ llm_grader_score = ChatOpenAI(model='gpt-4o', base_url="https://llm.synapse.thalescloud.io/", temperature=0)
302
+ else:
303
+ llm_grader_score = ChatGroq(model=model, temperature=0)
304
+ structured_llm_grader_score = llm_grader_score.with_structured_output(GradeDocumentsScore)
305
+ retrieval_grader_score = SCORE_GRADER_PROMPT | structured_llm_grader_score
306
+ return retrieval_grader_score
307
+
308
+
309
+ def eval_doc(doc, query, method="binary", threshold=0.7, eval_model="deepseek-r1-distill-llama-70b"):
310
+ '''
311
+ doc : the document to evaluate
312
+ query : the query to which the doc should be relevant
313
+ method : "binary" or "score"
314
+ threshold : for "score" method, score above which a doc is considered relevant
315
+ '''
316
+ if method == "binary":
317
+ retrieval_grader_binary = get_binary_grader(model=eval_model)
318
+ return 1 if (retrieval_grader_binary.invoke({"question": query, "document":doc}).binary_score == 'yes') else 0
319
+ elif method == "score":
320
+ retrieval_grader_score = get_score_grader(model=eval_model)
321
+ score = retrieval_grader_score.invoke({"query": query, "document":doc}).score or None
322
+ if score is not None:
323
+ return score if score >= threshold else 0
324
+ else:
325
+ # Couldn't parse score, marking document as relevant by default
326
+ return 1
327
+ else:
328
+ raise ValueError("Invalid method")
329
+
330
+ def eval_docs(state: DocRetrieverState, config: ConfigSchema):
331
+ """
332
+ This node evaluates the retrieved docs and keeps only the relevant ones in 'valid_docs'
333
+ """
334
+
335
+ eval_method = config["configurable"].get("eval_method") or "binary"
336
+ MAX_DOCS = config["configurable"].get("max_docs") or 15
337
+ valid_doc_scores = []
338
+
339
+ for doc in sample(state["docs"], min(25, len(state["docs"]))):
340
+ score = eval_doc(
341
+ doc=format_doc(doc),
342
+ query=state["query"],
343
+ method=eval_method,
344
+ threshold=config["configurable"].get("eval_threshold") or 0.7,
345
+ eval_model = config["configurable"].get("eval_model") or "deepseek-r1-distill-llama-70b"
346
+ )
347
+ if score:
348
+ valid_doc_scores.append((doc, score))
349
+
350
+ if eval_method == 'score':
351
+ # Get at most MAX_DOCS items with the highest score if score method was used
352
+ valid_docs = sorted(valid_doc_scores, key=lambda x: x[1], reverse=True)
353
+ valid_docs = [valid_doc[0] for valid_doc in valid_docs[:MAX_DOCS]]
354
+ else:
355
+ # Get at most MAX_DOCS items at random if binary method was used
356
+ shuffle(valid_doc_scores)
357
+ valid_docs = [valid_doc[0] for valid_doc in valid_doc_scores[:MAX_DOCS]]
358
+
359
+ return {"valid_docs": valid_docs + (state["valid_docs"] or [])}
360
+
361
+
362
+
363
+ def build_data_retriever_graph(memory):
364
+ """
365
+ Builds the data_retriever graph
366
+ """
367
+ #with SqliteSaver.from_conn_string(":memory:") as memory :
368
+
369
+ graph_builder_doc_retriever = StateGraph(DocRetrieverState)
370
+
371
+ graph_builder_doc_retriever.add_node("generate_cypher", generate_cypher)
372
+ graph_builder_doc_retriever.add_node("get_docs", get_docs)
373
+ graph_builder_doc_retriever.add_node("eval_docs", eval_docs)
374
+
375
+
376
+ graph_builder_doc_retriever.add_edge("__start__", "generate_cypher")
377
+ graph_builder_doc_retriever.add_edge("generate_cypher", "get_docs")
378
+ graph_builder_doc_retriever.add_edge("get_docs", "eval_docs")
379
+ graph_builder_doc_retriever.add_edge("eval_docs", "__end__")
380
+
381
+ graph_doc_retriever = graph_builder_doc_retriever.compile(checkpointer=memory)
382
+
383
+ return graph_doc_retriever
384
+
385
+ def error_concept_groq(msg,concepts,groq,question):
386
+ try:
387
+ start = msg.find("Requested") + len("Requested ")
388
+ end = msg.find(",", start)
389
+ rate_limit = int(msg[start:end])
390
+ related_concepts = []
391
+ i = 0
392
+ start = 0
393
+ end = len(concepts) // (rate_limit // 5000 + (1 if rate_limit%4500 != 0 else 0))
394
+ while (i < rate_limit // 5000):
395
+ smaller_concepts = concepts[start:end]
396
+ start = end
397
+ end = end + len(concepts) // (rate_limit//5000 + (1 if rate_limit%4500 != 0 else 0))
398
+ res = groq.invoke({question[0] : question[1], "concepts" : '\n'.join(smaller_concepts)})
399
+ for r in res:
400
+ related_concepts.append(r)
401
+ i+=1
402
+ return related_concepts
403
+ except Exception as e:
404
+ if getattr(e, "status_code", None) == 429:
405
+ time.sleep(65)
406
+ return error_concept_groq(msg, concepts, groq, question)
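
A minimal sketch of running the retriever subgraph on its own, assuming a reachable Neo4j instance; the credentials, thread_id and query are placeholders, and 'current_plan_step' is set explicitly because the 'guided' mode reads it:

from langgraph.checkpoint.memory import MemorySaver
from langchain_community.graphs import Neo4jGraph
from ki_gen.data_retriever import build_data_retriever_graph

graph_db = Neo4jGraph(url="bolt://localhost:7687", username="neo4j", password="password")
retriever = build_data_retriever_graph(MemorySaver())

result = retriever.invoke(
    {
        "query": "Architecture enablers for energy-efficient network slicing",
        "valid_docs": [],
        "current_plan_step": 0,  # 'guided' mode uses this to pick which node labels to query
    },
    config={"configurable": {
        "thread_id": "demo",
        "graph": graph_db,
        "cypher_gen_method": "guided",  # or "auto" to let the LLM write the Cypher
        "eval_method": "binary",
        "max_docs": 10,
    }},
)
print(result["valid_docs"])
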
ki_gen/planner.py ADDED
@@ -0,0 +1,282 @@
1
+ import os
2
+ import re
3
+
4
+ from typing import Annotated
5
+ from typing_extensions import TypedDict
6
+
7
+ from langchain_groq import ChatGroq
8
+ from langchain_openai import ChatOpenAI
9
+ from langchain_core.messages import SystemMessage, HumanMessage
10
+ from langchain_community.graphs import Neo4jGraph
11
+
12
+ from langgraph.graph import StateGraph
13
+ from langgraph.graph import add_messages
14
+
15
+ from ki_gen.prompts import PLAN_GEN_PROMPT, PLAN_MODIFICATION_PROMPT
16
+ from ki_gen.data_retriever import build_data_retriever_graph
17
+ from ki_gen.data_processor import build_data_processor_graph
18
+ from ki_gen.utils import ConfigSchema, State, HumanValidationState, DocProcessorState, DocRetrieverState
19
+ from langgraph.checkpoint.sqlite import SqliteSaver
20
+
21
+
22
+
23
+ ##########################################################################
24
+ ###### NODES DEFINITION ######
25
+ ##########################################################################
26
+
27
+ def validate_node(state: State):
28
+ """
29
+ This node inserts the plan validation prompt.
30
+ """
31
+ prompt = """System : You only need to focus on Key Issues, no need to focus on solutions or stakeholders yet and your plan should be concise.
32
+ If needed, give me an updated plan to follow this instruction. If your plan already follows the instruction just say "My plan is correct"."""
33
+ output = HumanMessage(content=prompt)
34
+ return {"messages" : [output]}
35
+
36
+
37
+ def error_chatbot_groq(error, model_name, query): # Pass model_name instead of llm_groq object
38
+ # Switch API key logic...
39
+ if os.environ["GROQ_API_KEY"] == os.getenv("groq_api_key"):
40
+ os.environ["GROQ_API_KEY"] = os.getenv("groq_api_key2")
41
+ elif os.environ["GROQ_API_KEY"] == os.getenv("groq_api_key2"):
42
+ os.environ["GROQ_API_KEY"] = os.getenv("groq_api_key3")
43
+ else:
44
+ os.environ["GROQ_API_KEY"] = os.getenv("groq_api_key")
45
+
46
+ # Re-initialize the model *after* switching the key
47
+ try:
48
+ # Use the model_name passed in
49
+ llm_groq_retry = ChatGroq(model=model_name)
50
+ # Pass the original query messages
51
+ return {"messages" : [llm_groq_retry.invoke(query)]}
52
+ except Exception as retry_error:
53
+ # Handle potential error during retry
54
+ print(f"Error during retry: {retry_error}")
55
+ # Decide what to return or raise here
56
+ return {"messages": [SystemMessage(content=f"Failed to process after retry: {retry_error}")]}
57
+
58
+
59
+ # Wrappers to call LLMs on the state messages field
60
+ def chatbot_llama(state: State):
61
+ try:
62
+ llm_llama = ChatGroq(model="llama3-70b-8192")
63
+ return {"messages" : [llm_llama.invoke(state["messages"])]}
64
+ except Exception as error:
65
+ return error_chatbot_groq(error, "llama3-70b-8192", state["messages"])
66
+ def chatbot_mixtral(state: State):
67
+ print(state)
68
+ llm_mixtral = ChatGroq(model="deepseek-r1-distill-llama-70b")
69
+ print(llm_mixtral)
70
+ return {"messages" : [llm_mixtral.invoke(state["messages"])]}
71
+ # except Exception as error:
72
+ # error_chatbot_groq(error,llm_mixtral,state["messages"])
73
+ def chatbot_openai(state: State):
74
+ llm_openai = ChatOpenAI(model='gpt-4o', base_url="https://llm.synapse.thalescloud.io/")
75
+ return {"messages" : [llm_openai.invoke(state["messages"])]}
76
+
77
+ chatbots = {"gpt-4o" : chatbot_openai,
78
+ "deepseek-r1-distill-llama-70b" : chatbot_mixtral,
79
+ "llama3-70b-8192" : chatbot_llama
80
+ }
81
+
82
+
83
+ def parse_plan(state: State):
84
+ """
85
+ This node parses the generated plan and writes in the 'store_plan' field of the state
86
+ """
87
+ plan = state["messages"][-3].content
88
+ store_plan = re.split(r"\d\.", plan.split("Plan:\n")[1])[1:]
89
+ try:
90
+ store_plan[len(store_plan) - 1] = store_plan[len(store_plan) - 1].split("<END_OF_PLAN>")[0]
91
+ except Exception as e:
92
+ print(f"Error while removing <END_OF_PLAN> : {e}")
93
+
94
+ return {"store_plan" : store_plan}
95
+
96
+ def detail_step(state: State, config: ConfigSchema):
97
+ """
98
+ This node updates the value of the 'current_plan_step' field and defines the query to be used for the data_retriever.
99
+ """
100
+ print("test")
101
+ print(state)
102
+
103
+ if 'current_plan_step' in state.keys():
104
+ print("all good chief")
105
+ else:
106
+ state["current_plan_step"] = None
107
+
108
+ current_plan_step = state["current_plan_step"] + 1 if state["current_plan_step"] is not None else 0 # We just began a new step so we will increase current_plan_step at the end
109
+ if config["configurable"].get("use_detailed_query"):
110
+ prompt = HumanMessage(f"""Specify what additional information you need to proceed with the next step of your plan :
111
+ Step {current_plan_step + 1} : {state['store_plan'][current_plan_step]}""")
112
+ query = get_detailed_query(context = state["messages"] + [prompt], model=config["configurable"].get("main_llm"))
113
+ return {"messages" : [prompt, query], "current_plan_step": current_plan_step, 'query' : query}
114
+
115
+ return {"current_plan_step": current_plan_step, 'query' : state["store_plan"][current_plan_step], "valid_docs" : []}
116
+
117
+ def get_detailed_query(context : list, model : str = "deepseek-r1-distill-llama-70b"):
118
+ """
119
+ Simple helper function for the detail_step node
120
+ """
121
+ if model == 'gpt-4o':
122
+ llm = ChatOpenAI(model=model, base_url="https://llm.synapse.thalescloud.io/")
123
+ else:
124
+ llm = ChatGroq(model=model)
125
+ return llm.invoke(context)
126
+
127
+ def concatenate_data(state: State):
128
+ """
129
+ This node concatenates all the data that was processed by the data_processor and inserts it in the state's messages
130
+ """
131
+ prompt = f"""#########TECHNICAL INFORMATION ############
132
+ {str(state["valid_docs"])}
133
+
134
+ ########END OF TECHNICAL INFORMATION#######
135
+
136
+ Using the information provided above, proceed with step {state['current_plan_step'] + 1} of your plan :
137
+ {state['store_plan'][state['current_plan_step']]}
138
+ """
139
+
140
+ return {"messages": [HumanMessage(content=prompt)]}
141
+
142
+
143
+ def human_validation(state: HumanValidationState) -> HumanValidationState:
144
+ """
145
+ Dummy node to interrupt before
146
+ """
147
+ return {'process_steps' : []}
148
+
149
+ def generate_ki(state: State):
150
+ """
151
+ This node inserts the prompt to begin Key Issues generation
152
+ """
153
+ print(f"THIS IS THE STATE FOR CURRENT PLAN STEP IN GENERATE_KI : {state}")
154
+
155
+ prompt = f"""Using the information provided above, proceed with step 4 of your plan to provide the user with NEW and INNOVATIVE Key Issues :
156
+ {state['store_plan'][state['current_plan_step'] + 1]}"""
157
+
158
+ return {"messages" : [HumanMessage(content=prompt)]}
159
+
160
+ def detail_ki(state: State):
161
+ """
162
+ This node inserts the last prompt to detail the generated Key Issues
163
+ """
164
+ prompt = f"""Using the information provided above, proceed with step 5 of your plan to provide the user with NEW and INNOVATIVE Key Issues :
165
+ {state['store_plan'][state['current_plan_step'] + 2]}"""
166
+
167
+ return {"messages" : [HumanMessage(content=prompt)]}
168
+
169
+ ##########################################################################
170
+ ###### CONDITIONAL EDGE FUNCTIONS ######
171
+ ##########################################################################
172
+
173
+ def validate_plan(state: State):
174
+ """
175
+ Whether to regenerate the plan or to parse it
176
+ """
177
+ if "messages" in state and "My plan is correct" in state["messages"][-1].content:
178
+ return "parse"
179
+ return "validate"
180
+
181
+ def next_plan_step(state: State, config: ConfigSchema):
182
+ """
183
+ Proceed to next plan step (either generate KI or retrieve more data)
184
+ """
185
+ if (state["current_plan_step"] == 2) and (config["configurable"].get('plan_method') == "modification"):
186
+ return "generate_key_issues"
187
+ if state["current_plan_step"] == len(state["store_plan"]) - 1:
188
+ return "generate_key_issues"
189
+ else:
190
+ return "detail_step"
191
+
192
+ def detail_or_data_retriever(state: State, config: ConfigSchema):
193
+ """
194
+ Detail the query to use for data retrieval or not
195
+ """
196
+ if config["configurable"].get("use_detailed_query"):
197
+ return "chatbot_detail"
198
+ else:
199
+ return "data_retriever"
200
+
201
+ def retrieve_or_process(state: State):
202
+ """
203
+ Process the retrieved docs or keep retrieving
204
+ """
205
+ if state['human_validated']:
206
+ return "process"
207
+ return "retrieve"
208
+ # while True:
209
+ # user_input = input(f"{len(state['valid_docs'])} were retreived. Do you want more documents (y/[n]) : ")
210
+ # if user_input.lower() == "y":
211
+ # return "retrieve"
212
+ # if not user_input or user_input.lower() == "n":
213
+ # return "process"
214
+ # print("Please answer with 'y' or 'n'.\n")
215
+
216
+
217
+ def build_planner_graph(memory, config):
218
+ """
219
+ Builds the planner graph
220
+ """
221
+ graph_builder = StateGraph(State)
222
+
223
+ graph_doc_retriever = build_data_retriever_graph(memory)
224
+ graph_doc_processor = build_data_processor_graph(memory)
225
+ graph_builder.add_node("chatbot_planner", chatbots[config["main_llm"]])
226
+ graph_builder.add_node("validate", validate_node)
227
+ graph_builder.add_node("chatbot_detail", chatbot_llama)
228
+ graph_builder.add_node("parse", parse_plan)
229
+ graph_builder.add_node("detail_step", detail_step)
230
+ graph_builder.add_node("data_retriever", graph_doc_retriever, input=DocRetrieverState)
231
+ graph_builder.add_node("human_validation", human_validation)
232
+ graph_builder.add_node("data_processor", graph_doc_processor, input=DocProcessorState)
233
+ graph_builder.add_node("concatenate_data", concatenate_data)
234
+ graph_builder.add_node("chatbot_exec_step", chatbots[config["main_llm"]])
235
+ graph_builder.add_node("generate_ki", generate_ki)
236
+ graph_builder.add_node("chatbot_ki", chatbots[config["main_llm"]])
237
+ graph_builder.add_node("detail_ki", detail_ki)
238
+ graph_builder.add_node("chatbot_final", chatbots[config["main_llm"]])
239
+
240
+ graph_builder.add_edge("validate", "chatbot_planner")
241
+ graph_builder.add_edge("parse", "detail_step")
242
+
243
+
244
+ # graph_builder.add_edge("detail_step", "chatbot2")
245
+ graph_builder.add_edge("chatbot_detail", "data_retriever")
246
+ graph_builder.add_edge("data_retriever", "human_validation")
247
+
248
+
249
+ graph_builder.add_edge("data_processor", "concatenate_data")
250
+ graph_builder.add_edge("concatenate_data", "chatbot_exec_step")
251
+ graph_builder.add_edge("generate_ki", "chatbot_ki")
252
+ graph_builder.add_edge("chatbot_ki", "detail_ki")
253
+ graph_builder.add_edge("detail_ki", "chatbot_final")
254
+ graph_builder.add_edge("chatbot_final", "__end__")
255
+
256
+ graph_builder.add_conditional_edges(
257
+ "detail_step",
258
+ detail_or_data_retriever,
259
+ {"chatbot_detail": "chatbot_detail", "data_retriever": "data_retriever"}
260
+ )
261
+ graph_builder.add_conditional_edges(
262
+ "human_validation",
263
+ retrieve_or_process,
264
+ {"retrieve" : "data_retriever", "process" : "data_processor"}
265
+ )
266
+ graph_builder.add_conditional_edges(
267
+ "chatbot_planner",
268
+ validate_plan,
269
+ {"parse" : "parse", "validate": "validate"}
270
+ )
271
+ graph_builder.add_conditional_edges(
272
+ "chatbot_exec_step",
273
+ next_plan_step,
274
+ {"generate_key_issues" : "generate_ki", "detail_step": "detail_step"}
275
+ )
276
+
277
+ graph_builder.set_entry_point("chatbot_planner")
278
+ graph = graph_builder.compile(
279
+ checkpointer=memory,
280
+ interrupt_after=["parse", "chatbot_exec_step", "chatbot_final", "data_retriever"],
281
+ )
282
+ return graph
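
A minimal end-to-end sketch for the planner graph; the thread_id, requirement text and configuration values are illustrative, and it assumes init_app() can find the API keys in the environment:

from langgraph.checkpoint.memory import MemorySaver
from ki_gen.planner import build_planner_graph
from ki_gen.prompts import get_initial_prompt
from ki_gen.utils import init_app

init_app()
config = {"configurable": {
    "thread_id": "demo",
    "main_llm": "deepseek-r1-distill-llama-70b",
    "plan_method": "modification",
    "use_detailed_query": False,
}}

# build_planner_graph expects the flat dict that holds "main_llm".
graph = build_planner_graph(MemorySaver(), config["configurable"])

# Stream events until the first interrupt (after "parse"), starting from the initial system + user messages.
for event in graph.stream(get_initial_prompt(config, "Enable energy-efficient massive IoT deployments"), config):
    print(event)
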
ki_gen/prompts.py ADDED
@@ -0,0 +1,155 @@
1
+ from langchain_core.prompts.prompt import PromptTemplate
2
+ from langchain_core.prompts import ChatPromptTemplate
3
+ from langchain_core.messages import SystemMessage, HumanMessage
4
+ from ki_gen.utils import ConfigSchema
5
+
6
+ CYPHER_GENERATION_TEMPLATE = """Task:Generate Cypher statement to query a graph database.
7
+ Instructions:
8
+ Use only the provided relationship types and properties in the schema.
9
+ Do not use any other relationship types or properties that are not provided.
10
+ Schema:
11
+ {schema}
12
+
13
+
14
+ Concepts:
15
+ {concepts}
16
+
17
+
18
+ Concept names can ONLY be selected from the above list
19
+
20
+ Note: Do not include any explanations or apologies in your responses.
21
+ Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.
22
+ Do not include any text except the generated Cypher statement.
23
+
24
+ The question is:
25
+ {question}"""
26
+ CYPHER_GENERATION_PROMPT = PromptTemplate(
27
+ input_variables=["schema", "question", "concepts"], template=CYPHER_GENERATION_TEMPLATE
28
+ )
29
+
30
+ CYPHER_QA_TEMPLATE = """You are an assistant that helps to form nice and human understandable answers.
31
+ The information part contains the provided information that you must use to construct an answer.
32
+ The provided information is authoritative, you must never doubt it or try to use your internal knowledge to correct it.
33
+ Make the answer sound as a response to the question. Do not mention that you based the result on the given information.
34
+ Here is an example:
35
+
36
+ Question: Which managers own Neo4j stocks?
37
+ Context:[manager:CTL LLC, manager:JANE STREET GROUP LLC]
38
+ Helpful Answer: CTL LLC, JANE STREET GROUP LLC owns Neo4j stocks.
39
+
40
+ Follow this example when generating answers.
41
+ If the provided information is empty, say that you don't know the answer.
42
+ Information:
43
+ {context}
44
+
45
+ Question: {question}
46
+ Helpful Answer:"""
47
+ CYPHER_QA_PROMPT = PromptTemplate(
48
+ input_variables=["context", "question"], template=CYPHER_QA_TEMPLATE
49
+ )
50
+
51
+ PLAN_GEN_PROMPT = """System : You are a standardization expert working for 3GPP. You are given a specific technical requirement regarding the deployment of 5G services. Your goal is to specify NEW and INNOVATIVE Key Issues that could occur while trying to fulfill this requirement
52
+
53
+ System : Let's first understand the problem and devise a plan to solve the problem.
54
+ Output the plan starting with the header 'Plan:' and then followed by a numbered list of steps.
55
+ Make the plan the minimum number of steps required to accurately provide the user with NEW and INNOVATIVE Key Issues related to the technical requirement.
56
+ At the end of your plan, say '<END_OF_PLAN>'"""
57
+
58
+ PLAN_MODIFICATION_PROMPT = """You are a standardization expert working for 3GPP. You are given a specific technical requirement regarding the deployment of 5G services. Your goal is to specify NEW and INNOVATIVE Key Issues that could occur while trying to fulfill this requirement.
59
+ To achieve this goal we are going to follow this generic plan :
60
+
61
+ ###PLAN TEMPLATE###
62
+
63
+ Plan:
64
+
65
+ 1. **Understanding the Problem**: Gather information from existing specifications and standards to thoroughly understand the technical requirement. This should help you understand the key aspects of the problem.
66
+ 2. **Gather information about latest innovations** : Gather information about the latest innovations related to the problem by looking at the most relevant research papers and list the sources.
67
+ 3. **Identifying NEW and INNOVATIVE Key Issues**: Based on the understanding of the problem, identify new and innovative key issues that could occur while trying to fulfill this requirement. Describe them in simple technical English. These key issues should be relevant, significant, and not yet addressed by existing solutions.
68
+ 4. **Develop Detailed Descriptions for Each Key Issue**: For each identified key issue, provide a detailed description in simple technical English, including the specific challenges and areas requiring further study.
69
+ <END_OF_PLAN>
70
+
71
+ ###END OF PLAN TEMPLATE###
72
+
73
+ Let's devise a plan to solve the problem by adapting the PLAN TEMPLATE.
74
+ Output the plan starting with the header 'Plan:' and then followed by a numbered list of steps.
75
+ Make the plan the minimum number of steps required to accurately provide the user with NEW and INNOVATIVE Key Issues related to the technical requirement.
76
+ At the end of your plan, say '<END_OF_PLAN>' """
77
+
78
+ PLAN_GEN_PROMPT = PLAN_MODIFICATION_PROMPT
79
+
80
+ CONCEPT_SELECTION_TEMPLATE = """Task: Select the most relevant topic to the user question
81
+ Instructions:
82
+ Select the most relevant Concept to the user's question.
83
+ Concepts can ONLY be selected from the list below.
84
+
85
+ Concepts:
86
+ {concepts}
87
+
88
+ Note: Do not include any explanations or apologies in your responses.
89
+ Do not include any text except the selected concept.
90
+
91
+ The question is:
92
+ {question}"""
93
+ CONCEPT_SELECTION_PROMPT = PromptTemplate(
94
+ input_variables=["concepts", "question"], template=CONCEPT_SELECTION_TEMPLATE
95
+ )
96
+
97
+ RELEVANT_CONCEPTS_TEMPLATE = """
98
+ ## CONCEPTS ##
99
+ {concepts}
100
+ ## END OF CONCEPTS ##
101
+
102
+ Select the 20 most relevant concepts to the user query.
103
+ Output your answer as a numbered list preceded with the header 'Concepts:'.
104
+
105
+ User query :
106
+ {user_query}
107
+ """
108
+ RELEVANT_CONCEPTS_PROMPT = ChatPromptTemplate.from_messages([
109
+ ("human", RELEVANT_CONCEPTS_TEMPLATE)
110
+ ])
111
+
112
+ SUMMARIZER_TEMPLATE = """You are a 3GPP standardization expert.
113
+ Summarize the provided document in simple technical English for other experts in the field.
114
+
115
+ Document:
116
+ {document}"""
117
+ SUMMARIZER_PROMPT = ChatPromptTemplate.from_messages([
118
+ ("system", SUMMARIZER_TEMPLATE)
119
+ ])
120
+
121
+
122
+ BINARY_GRADER_TEMPLATE = """You are a grader assessing relevance of a retrieved document to a user question. \n
123
+ It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
124
+ If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n
125
+ Give a binary score 'yes' or 'no' to indicate whether the document is relevant to the question."""
126
+ BINARY_GRADER_PROMPT = ChatPromptTemplate.from_messages(
127
+ [
128
+ ("system", BINARY_GRADER_TEMPLATE),
129
+ ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
130
+ ]
131
+ )
132
+
133
+
134
+ SCORE_GRADER_TEMPLATE = """Grasp and understand both the query and the document before score generation.
135
+ Then, based on your understanding and analysis quantify the relevance between the document and the query.
136
+ Give the rationale before answering.
137
+ Output your answer as a score ranging between 0 (irrelevant document) and 1 (completely relevant document)."""
138
+
139
+ SCORE_GRADER_PROMPT = ChatPromptTemplate.from_messages(
140
+ [
141
+ ("system", SCORE_GRADER_TEMPLATE),
142
+ ("human", "Passage: \n\n {document} \n\n User query: {query}")
143
+ ]
144
+ )
145
+
146
+ def get_initial_prompt(config: ConfigSchema, user_query : str):
147
+ if config["configurable"].get("plan_method") == "generation":
148
+ prompt = PLAN_GEN_PROMPT
149
+ elif config["configurable"].get("plan_method") == "modification":
150
+ prompt = PLAN_MODIFICATION_PROMPT
151
+ else:
152
+ raise ValueError("Incorrect plan_method, should be 'generation' or 'modification'")
153
+
154
+ user_input = user_query or input("User :")
155
+ return {"messages" : [SystemMessage(content=prompt), HumanMessage(content=user_input)]}
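
A small sketch of how one of these templates is filled in; the schema string, concept list and question below are made-up values:

from ki_gen.prompts import CYPHER_GENERATION_PROMPT

text = CYPHER_GENERATION_PROMPT.format(
    schema="(:Concept)-[:RELATED_TO]-(:ResearchPaper)",
    concepts="Network slicing\nEdge computing",
    question="Which research papers relate to network slicing?",
)
print(text)  # the fully rendered prompt that is piped into the Cypher-generation LLM
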
ki_gen/utils.py ADDED
@@ -0,0 +1,152 @@
1
+ import os
2
+ import getpass
3
+ import html
4
+
5
+
6
+ from typing import Annotated, Union
7
+ from typing_extensions import TypedDict
8
+
9
+ from langchain_community.graphs import Neo4jGraph
10
+ from langchain_groq import ChatGroq
11
+ from langchain_openai import ChatOpenAI
12
+
13
+ from langgraph.checkpoint.sqlite import SqliteSaver
14
+ from langgraph.checkpoint.memory import MemorySaver
15
+ from langgraph.checkpoint import base
16
+ from langgraph.graph import add_messages
17
+
18
+ memory = MemorySaver()
19
+
20
+ def format_df(df):
21
+ """
22
+ Used to display the generated plan in a nice format
23
+ Returns html code in a string
24
+ """
25
+ def format_cell(cell):
26
+ if isinstance(cell, str):
27
+ # Encode special characters, but preserve line breaks
28
+ return html.escape(cell).replace('\n', '<br>')
29
+ return cell
30
+ # Convert the DataFrame to HTML with custom CSS
31
+ formatted_df = df.map(format_cell)
32
+ html_table = formatted_df.to_html(escape=False, index=False)
33
+
34
+ # Add custom CSS to allow multiple lines and scrolling in cells
35
+ css = """
36
+ <style>
37
+ table {
38
+ border-collapse: collapse;
39
+ width: 100%;
40
+ }
41
+ th, td {
42
+ border: 1px solid black;
43
+ padding: 8px;
44
+ text-align: left;
45
+ vertical-align: top;
46
+ white-space: pre-wrap;
47
+ max-width: 300px;
48
+ max-height: 100px;
49
+ overflow-y: auto;
50
+ }
51
+ th {
52
+ background-color: #f2f2f2;
53
+ }
54
+ </style>
55
+ """
56
+
57
+ return css + html_table
58
+
59
+ def format_doc(doc: dict) -> str :
60
+ formatted_string = ""
61
+ for key in doc:
62
+ formatted_string += f"**{key}**: {doc[key]}\n"
63
+ return formatted_string
64
+
65
+
66
+
67
+ def _set_env(var: str, value: str = None):
68
+ if not os.environ.get(var):
69
+ if value:
70
+ os.environ[var] = value
71
+ else:
72
+ os.environ[var] = getpass.getpass(f"{var}: ")
73
+
74
+
75
+ def init_app(openai_key : str = None, groq_key : str = None, langsmith_key : str = None):
76
+ """
77
+ Initialize the app with user API keys (falling back to environment variables) and enable LangSmith tracing
78
+ """
79
+ _set_env("GROQ_API_KEY", value=groq_key or os.getenv("groq_api_key"))
80
+ _set_env("LANGSMITH_API_KEY", value=langsmith_key or os.getenv("langsmith_api_key"))
81
+ _set_env("OPENAI_API_KEY", value=openai_key or os.getenv("openai_api_key"))
82
+ os.environ["LANGCHAIN_TRACING_V2"] = "true"
83
+ os.environ["LANGCHAIN_PROJECT"] = "3GPP Test"
84
+
85
+
86
+ def clear_memory(memory, thread_id: str = "") -> None:
87
+ """
88
+ Clears checkpointer state for a given thread_id, broken for now
89
+ TODO : fix this
90
+ """
91
+ memory = MemorySaver()
92
+
93
+ #checkpoint = base.empty_checkpoint()
94
+ #memory.put(config={"configurable": {"thread_id": thread_id}}, checkpoint=checkpoint, metadata={})
95
+
96
+ def get_model(model : str = "deepseek-r1-distill-llama-70b"):
97
+ """
98
+ Wrapper to return the correct llm object depending on the 'model' param
99
+ """
100
+ if model == "gpt-4o":
101
+ llm = ChatOpenAI(model=model, base_url="https://llm.synapse.thalescloud.io/")
102
+ else:
103
+ llm = ChatGroq(model=model)
104
+ return llm
105
+
106
+
107
+ class ConfigSchema(TypedDict):
108
+ graph: Neo4jGraph
109
+ plan_method: str
110
+ use_detailed_query: bool
111
+
112
+ class State(TypedDict):
113
+ messages : Annotated[list, add_messages]
114
+ store_plan : list[str]
115
+ current_plan_step : int
116
+ valid_docs : list[str]
117
+
118
+ class DocRetrieverState(TypedDict):
119
+ messages: Annotated[list, add_messages]
120
+ query: str
121
+ docs: list[dict]
122
+ cyphers: list[str]
123
+ current_plan_step : int
124
+ valid_docs: list[Union[str, dict]]
125
+
126
+ class HumanValidationState(TypedDict):
127
+ human_validated : bool
128
+ process_steps : list[str]
129
+
130
+ def update_doc_history(left : list | None, right : list | None) -> list:
131
+ """
132
+ Reducer for the 'docs_in_processing' field.
133
+ Doesn't work currently because of bad handling of duplicates
134
+ TODO : make this work (reference : https://langchain-ai.github.io/langgraph/how-tos/subgraph/#custom-reducer-functions-to-manage-state)
135
+ """
136
+ if not left:
137
+ # This shouldn't happen
138
+ left = [[]]
139
+ if not right:
140
+ right = []
141
+
142
+ for i in range(len(right)):
143
+ left[i].append(right[i])
144
+ return left
145
+
146
+
147
+ class DocProcessorState(TypedDict):
148
+ valid_docs : list[Union[str, dict]]
149
+ docs_in_processing : list
150
+ process_steps : list[Union[str,dict]]
151
+ current_process_step : int
152
+