heymenn committed on
Commit 6aaddef · verified · 1 Parent(s): 228c70f

Upload 15 files

app.py ADDED
@@ -0,0 +1,555 @@
1
+ import gradio as gr
2
+ from langchain_community.graphs import Neo4jGraph
3
+ import pandas as pd
4
+ import json
5
+
6
+ from ki_gen.planner import build_planner_graph
7
+ from ki_gen.utils import clear_memory, init_app, format_df, memory
8
+ from ki_gen.prompts import get_initial_prompt
9
+
10
+ MAX_PROCESSING_STEPS = 10
11
+
12
+ print(f"WHEREVER YOU ARE THIS IS THE MEMORY INSTANCE !!!! : {type(memory)} !!!!")
13
+
14
+ def start_inference(data):
15
+ """
16
+ Starts plan generation using the user query as input; the generated plan is displayed afterwards
17
+ """
18
+ config = data[config_state]
19
+ init_app(
20
+ openai_key=data[openai_api_key],
21
+ groq_key=data[groq_api_key],
22
+ langsmith_key=data[langsmith_api_key]
23
+ )
24
+
25
+ clear_memory(memory, config["configurable"].get("thread_id"))
26
+
27
+ graph = build_planner_graph(memory, config["configurable"])
28
+ with open("images/graph_png.png", "wb") as f:
29
+ f.write(graph.get_graph(xray=1).draw_mermaid_png())
30
+
31
+ print("here !")
32
+ for event in graph.stream(get_initial_prompt(config, data[user_query]), config, stream_mode="values"):
33
+ if "messages" in event:
34
+ event["messages"][-1].pretty_print()
35
+
36
+ state = graph.get_state(config)
37
+ steps = [i for i in range(1,len(state.values['store_plan'])+1)]
38
+ df = pd.DataFrame({'Plan steps': steps, 'Description': state.values['store_plan']})
39
+ return [df, graph]
40
+
41
+ def update_display(df):
42
+ """
43
+ Displays the df after it has been generated
44
+ """
45
+ formatted_html = format_df(df)
46
+ return {
47
+ plan_display : gr.update(visible=True, value = formatted_html),
48
+ select_step_to_modify : gr.update(visible=True, value=0),
49
+ enter_new_step : gr.update(visible=True),
50
+ submit_new_step : gr.update(visible=True),
51
+ continue_inference_btn : gr.update(visible=True)
52
+ }
53
+
54
+ def format_docs(docs: list[dict]):
55
+ formatted_results = ""
56
+ for i, doc in enumerate(docs):
57
+ formatted_results += f"\n### Document {i}\n"
58
+ for key in doc:
59
+ formatted_results += f"**{key}**: {doc[key]}\n"
60
+ return formatted_results
61
+
62
+ def continue_inference(data):
63
+ """
64
+ Proceeds to next plan step
65
+ """
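+ # Streams the graph until the current plan step finishes or the graph pauses at the
+ # "human_validation" node; in the latter case the doc-validation buttons are shown instead.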
66
+ graph = data[graph_state]
67
+ config = data[config_state]
68
+
69
+ for event in graph.stream(None, config, stream_mode="values"):
70
+ if "messages" in event:
71
+ event["messages"][-1].pretty_print()
72
+
73
+ snapshot = graph.get_state(config)
74
+ print(f"DEBUG INFO : next : {snapshot.next}")
75
+ print(f"DEBUG INFO ++ L.75: {snapshot}")
76
+
77
+ if snapshot.next and snapshot.next[0] == "human_validation":
78
+ return {
79
+ continue_inference_btn : gr.update(visible=False),
80
+ graph_state : graph,
81
+ retrieve_more_docs_btn : gr.update(visible=True),
82
+ continue_to_processing_btn : gr.update(visible=True),
83
+ human_validation_title : gr.update(visible=True, value=f"**{len(snapshot.values['valid_docs'])} documents retrieved.** Retrieve more or continue ?"),
84
+ retrieved_docs_state : snapshot.values['valid_docs']
85
+ }
86
+
87
+ return {
88
+ plan_result : snapshot.values["messages"][-1].content,
89
+ graph_state : graph,
90
+ continue_inference_btn : gr.update(visible=False)
91
+ }
92
+
93
+ def continue_to_processing():
94
+ """
95
+ Continue to doc processing configuration
96
+ """
97
+ return {
98
+ retrieve_more_docs_btn : gr.update(visible=False),
99
+ continue_to_processing_btn : gr.update(visible=False),
100
+ human_validation_title : gr.update(visible=False),
101
+ process_data_btn : gr.update(visible=True),
102
+ process_steps_nb : gr.update(visible=True),
103
+ process_steps_title : gr.update(visible=True)
104
+ }
105
+
106
+ def retrieve_more_docs(data):
107
+ """
108
+ Restart doc retrieval
109
+ For now we simply regenerate the Cypher query; the result may differ because temperature != 0
110
+ """
111
+ graph = data[graph_state]
112
+ config = data[config_state]
113
+ graph.update_state(config, {'human_validated' : False}, as_node="human_validation")
114
+
115
+ for event in graph.stream(None, config, stream_mode="values"):
116
+ if "messages" in event:
117
+ event["messages"][-1].pretty_print()
118
+
119
+ snapshot = graph.get_state(config)
120
+ print(f"DEBUG INFO : next : {snapshot.next}")
121
+ print(f"DEBUG INFO ++ L.121: {snapshot}")
122
+
123
+ return {
124
+ graph_state : graph,
125
+ human_validation_title : gr.update(visible=True, value=f"**{len(snapshot.values['valid_docs'])} documents retrieved.** Retrieve more or continue ?"),
126
+ retrieved_docs_display : format_docs(snapshot.values['valid_docs'])
127
+ }
128
+
129
+ def execute_processing(*args):
130
+ """
131
+ Execute doc processing
132
+ Args are passed as a list and not a dict for syntax convenience
133
+ """
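+ # Expected layout (matching the process_data_btn.click wiring below):
+ #   args = dropdowns + textboxes + usable_elements + processing_models + [process_steps_nb, graph_state, config_state]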
134
+ graph = args[-2]
135
+ config = args[-1]
136
+ nb_process_steps = args[-3]
137
+
138
+ process_steps = []
139
+ for i in range (nb_process_steps):
140
+ if args[i] == "custom":
141
+ process_steps.append({"prompt" : args[nb_process_steps + i], "context" : args[2*nb_process_steps + i], "processing_model" : args[3*nb_process_steps + i]})
142
+ else:
143
+ process_steps.append(args[i])
144
+
145
+ graph.update_state(config, {'human_validated' : True, 'process_steps' : process_steps}, as_node="human_validation")
146
+
147
+ for event in graph.stream(None, config, stream_mode="values"):
148
+ if "messages" in event:
149
+ event["messages"][-1].pretty_print()
150
+
151
+ snapshot = graph.get_state(config)
152
+ print(f"DEBUG INFO : next : {snapshot.next}")
153
+ print(f"DEBUG INFO ++ L.153: {snapshot}")
154
+
155
+ return {
156
+ plan_result : snapshot.values["messages"][-1].content,
157
+ processed_docs_state : snapshot.values["valid_docs"],
158
+ graph_state : graph,
159
+ continue_inference_btn : gr.update(visible=True),
160
+ process_steps_nb : gr.update(value=0, visible=False),
161
+ process_steps_title : gr.update(visible=False),
162
+ process_data_btn : gr.update(visible=False),
163
+ }
164
+
165
+
166
+
167
+ def update_config_display():
168
+ """
169
+ Called after loading the config.json file
170
+ TODO : allow the user to specify a path to the config file
171
+ """
172
+ with open("config.json", "r") as config_file:
173
+ config = json.load(config_file)
174
+
175
+ return {
176
+ main_llm : config["main_llm"],
177
+ plan_method : config["plan_method"],
178
+ use_detailed_query : config["use_detailed_query"],
179
+ cypher_gen_method : config["cypher_gen_method"],
180
+ validate_cypher : config["validate_cypher"],
181
+ summarization_model : config["summarize_model"],
182
+ eval_method : config["eval_method"],
183
+ eval_threshold : config["eval_threshold"],
184
+ max_docs : config["max_docs"],
185
+ compression_method : config["compression_method"],
186
+ compress_rate : config["compress_rate"],
187
+ force_tokens : config["force_tokens"],
188
+ eval_model : config["eval_model"],
189
+ srv_addr : config["graph"]["address"],
190
+ srv_usr : config["graph"]["username"],
191
+ srv_pwd : config["graph"]["password"],
192
+ openai_api_key : config["openai_api_key"],
193
+ groq_api_key : config["groq_api_key"],
194
+ langsmith_api_key : config["langsmith_api_key"]
195
+ }
196
+
197
+
198
+ def build_config(data):
199
+ """
200
+ Build the config variable using the values inputted by the user
201
+ """
202
+ config = {}
203
+ config["main_llm"] = data[main_llm]
204
+ config["plan_method"] = data[plan_method]
205
+ config["use_detailed_query"] = data[use_detailed_query]
206
+ config["cypher_gen_method"] = data[cypher_gen_method]
207
+ config["validate_cypher"] = data[validate_cypher]
208
+ config["summarize_model"] = data[summarization_model]
209
+ config["eval_method"] = data[eval_method]
210
+ config["eval_threshold"] = data[eval_threshold]
211
+ config["max_docs"] = data[max_docs]
212
+ config["compression_method"] = data[compression_method]
213
+ config["compress_rate"] = data[compress_rate]
214
+ config["force_tokens"] = data[force_tokens]
215
+ config["eval_model"] = data[eval_model]
216
+ config["thread_id"] = "3"
217
+ try:
218
+ neograph = Neo4jGraph(url=data[srv_addr], username=data[srv_usr], password=data[srv_pwd])
219
+ config["graph"] = neograph
220
+ except Exception as e:
221
+ raise gr.Error(f"Error when configuring the neograph server : {e}", duration=5)
222
+ gr.Info("Successfully updated configuration!", duration=5)
223
+ return {"configurable" : config}
224
+
225
+ with gr.Blocks() as demo:
226
+ with gr.Tab("Config"):
227
+
228
+ ### The config tab
229
+
230
+ gr.Markdown("## Config options setup")
231
+
232
+ gr.Markdown("### API Keys")
233
+
234
+ with gr.Row():
235
+ openai_api_key = gr.Textbox(
236
+ label="OpenAI API Key",
237
+ type="password"
238
+ )
239
+
240
+ groq_api_key = gr.Textbox(
241
+ label="Groq API Key",
242
+ type='password'
243
+ )
244
+
245
+ langsmith_api_key = gr.Textbox(
246
+ label="LangSmith API Key",
247
+ type="password"
248
+ )
249
+
250
+ gr.Markdown('### Planner options')
251
+ with gr.Row():
252
+ main_llm = gr.Dropdown(
253
+ choices=["gpt-4o", "claude-3-5-sonnet", "mixtral-8x7b-32768"],
254
+ label="Main LLM",
255
+ info="Choose the LLM which will perform the generation",
256
+ value="gpt-4o"
257
+ )
258
+ with gr.Column(scale=1, min_width=600):
259
+ plan_method = gr.Dropdown(
260
+ choices=["generation", "modification"],
261
+ label="Planning method",
262
+ info="Choose how the main LLM will generate its plan",
263
+ value="modification"
264
+ )
265
+ use_detailed_query = gr.Checkbox(
266
+ label="Detail each plan step",
267
+ info="Detail each plan step before passing it for data query"
268
+ )
269
+
270
+ gr.Markdown("### Data query options")
271
+
272
+ # The options for the data processor
273
+ # TODO : remove the options for summarize and compress and let the user choose them when specifying processing steps
274
+ # (similarly to what is done for custom processing step)
275
+
276
+ with gr.Row():
277
+ with gr.Column(scale=1, min_width=300):
278
+ # Neo4j Server parameters
279
+
280
+ srv_addr = gr.Textbox(
281
+ label="Neo4j server address",
282
+ placeholder="localhost:7687"
283
+ )
284
+ srv_usr = gr.Textbox(
285
+ label="Neo4j username",
286
+ placeholder="neo4j"
287
+ )
288
+ srv_pwd = gr.Textbox(
289
+ label="Neo4j password",
290
+ placeholder="<Password>"
291
+ )
292
+
293
+ with gr.Column(scale=1, min_width=300):
294
+ cypher_gen_method = gr.Dropdown(
295
+ choices=["auto", "guided"],
296
+ label="Cypher generation method",
297
+ )
298
+ validate_cypher = gr.Checkbox(
299
+ label="Validate cypher using graph Schema"
300
+ )
301
+
302
+ summarization_model = gr.Dropdown(
303
+ choices=["gpt-4o", "claude-3-5-sonnet", "mixtral-8x7b-32768", "llama3-70b-8192"],
304
+ label="Summarization LLM",
305
+ info="Choose the LLM which will perform the summaries"
306
+ )
307
+
308
+ with gr.Column(scale=1, min_width=300):
309
+ eval_method = gr.Dropdown(
310
+ choices=["binary", "score"],
311
+ label="Retrieved docs evaluation method",
312
+ info="Evaluation method of retrieved docs"
313
+ )
314
+
315
+ eval_model = gr.Dropdown(
316
+ choices = ["gpt-4o", "mixtral-8x7b-32768"],
317
+ label = "Evaluation model",
318
+ info = "The LLM to use to evaluate the relevance of retrieved docs",
319
+ value = "mixtral-8x7b-32768"
320
+ )
321
+
322
+ eval_threshold = gr.Slider(
323
+ minimum=0,
324
+ maximum=1,
325
+ value=0.7,
326
+ label="Eval threshold",
327
+ info="Score above which a doc is considered relevant",
328
+ step=0.01,
329
+ visible=False
330
+ )
331
+
332
+ def eval_method_changed(selection):
333
+ if selection == "score":
334
+ return gr.update(visible=True)
335
+ return gr.update(visible=False)
336
+ eval_method.change(eval_method_changed, inputs=eval_method, outputs=eval_threshold)
337
+
338
+ max_docs= gr.Slider(
339
+ minimum=0,
340
+ maximum = 30,
341
+ value = 15,
342
+ label="Max docs",
343
+ info="Maximum number of docs to be retrieved at each query",
344
+ step=1
345
+ )
346
+
347
+ with gr.Column(scale=1, min_width=300):
348
+ compression_method = gr.Dropdown(
349
+ choices=["llm_lingua2", "llm_lingua"],
350
+ label="Compression method",
351
+ value="llm_lingua2"
352
+ )
353
+
354
+ with gr.Row():
355
+
356
+ # Add compression rate configuration with a gr.slider
357
+ compress_rate = gr.Slider(
358
+ minimum = 0,
359
+ maximum = 1,
360
+ value = 0.33,
361
+ label="Compression rate",
362
+ info="Target compression rate (fraction of tokens to keep)",
363
+ step = 0.01
364
+ )
365
+
366
+ # Add gr.CheckboxGroup to choose force_tokens
367
+ force_tokens = gr.CheckboxGroup(
368
+ choices=['\n', '?', '.', '!', ','],
369
+ value=[],
370
+ label="Force tokens",
371
+ info="Tokens to keep during compression",
372
+ )
373
+
374
+ with gr.Row():
375
+ btn_update_config = gr.Button(value="Update config")
376
+ load_config_json = gr.Button(value="Load config from JSON")
377
+
378
+ with gr.Row():
379
+ debug_info = gr.Button(value="Print debug info")
380
+
381
+ config_state = gr.State(value={})
382
+
383
+
384
+ btn_update_config.click(
385
+ build_config,
386
+ inputs={main_llm, plan_method, use_detailed_query, srv_addr, srv_pwd, srv_usr, compression_method, eval_model, \
387
+ compress_rate, force_tokens, cypher_gen_method, validate_cypher, summarization_model, eval_method, eval_threshold, max_docs},
388
+ outputs=config_state
389
+ )
390
+ load_config_json.click(
391
+ update_config_display,
392
+ outputs={main_llm, plan_method, use_detailed_query, cypher_gen_method, validate_cypher, summarization_model, eval_method, eval_threshold, \
393
+ max_docs, compress_rate, compression_method, force_tokens, eval_model, srv_addr, srv_usr, srv_pwd, openai_api_key, langsmith_api_key, groq_api_key}
394
+ ).then(
395
+ build_config,
396
+ inputs={main_llm, plan_method, use_detailed_query, srv_addr, srv_pwd, srv_usr, compression_method, eval_model, \
397
+ compress_rate, force_tokens, cypher_gen_method, validate_cypher, summarization_model, eval_method, eval_threshold, max_docs},
398
+ outputs=config_state
399
+ )
400
+
401
+ # Print config variable in the terminal
402
+ debug_info.click(lambda x : print(x), inputs=config_state)
403
+
404
+ with gr.Tab("Inference"):
405
+ ### Inference tab
406
+
407
+ graph_state = gr.State()
408
+ user_query = gr.Textbox(label = "Your query")
409
+ launch_inference = gr.Button(value="Generate plan")
410
+
411
+ with gr.Row():
412
+ dataframe_plan = gr.Dataframe(visible = False)
413
+ plan_display = gr.HTML(visible = False, label="Generated plan")
414
+
415
+ with gr.Column():
416
+
417
+ # Lets the user modify steps of the plan. Underlying logic not implemented yet
418
+ # TODO : implement this
419
+ with gr.Row():
420
+ select_step_to_modify = gr.Number(visible= False, label="Select a plan step to modify", value=0)
421
+ submit_new_step = gr.Button(visible = False, value="Submit new step")
422
+ enter_new_step = gr.Textbox(visible=False, label="Modify the plan step")
423
+
424
+ with gr.Row():
425
+ human_validation_title = gr.Markdown(visible=False)
426
+ retrieve_more_docs_btn = gr.Button(value="Retrieve more docs", visible=False)
427
+ continue_to_processing_btn = gr.Button(value="Proceed to data processing", visible=False)
428
+
429
+ with gr.Row():
430
+ with gr.Column():
431
+
432
+ process_steps_title = gr.Markdown("#### Data processing steps", visible=False)
433
+ process_steps_nb = gr.Number(label="Number of processing steps", value = 0, precision=0, step = 1, visible=False)
434
+
435
+ def get_process_step_names():
436
+ return ["summarize", "compress", "custom"]
437
+
438
+ # The gr.render decorator allows the code inside the following function to be rerun every time the 'inputs' variable is modified
439
+ # /!\ All event listeners that use variables defined inside a gr.render function must be defined inside that same function
440
+ # ref : https://www.gradio.app/docs/gradio/render
441
+ @gr.render(inputs=process_steps_nb)
442
+ def processing(nb):
443
+ with gr.Row():
444
+ process_step_names = get_process_step_names()
445
+ dropdowns = []
446
+ textboxes = []
447
+ usable_elements = []
448
+ processing_models = []
449
+ for i in range(nb):
450
+ with gr.Column():
451
+ dropdown = gr.Dropdown(key = f"d{i}", choices=process_step_names, label=f"Data processing step {i+1}")
452
+ dropdowns.append(dropdown)
453
+
454
+ textbox = gr.Textbox(
455
+ key = f"t{i}",
456
+ value="",
457
+ placeholder="Your custom prompt",
458
+ visible=True, min_width=300)
459
+ textboxes.append(textbox)
460
+
461
+ usable_element = gr.Dropdown(
462
+ key = f"u{i}",
463
+ choices = [(j) for j in range(i+1)],
464
+ label="Elements passed to the LLM for this process step",
465
+ multiselect=True,
466
+ )
467
+ usable_elements.append(usable_element)
468
+
469
+ processing_model = gr.Dropdown(
470
+ key = f"m{i}",
471
+ label="The LLM that will execute this step",
472
+ visible=True,
473
+ choices=["gpt-4o", "mixtral-8x7b-32768", "llama3-70b-8192"]
474
+ )
475
+ processing_models.append(processing_model)
476
+
477
+ dropdown.change(
478
+ fn=lambda process_name : [gr.update(visible=(process_name=="custom")), gr.update(visible=(process_name=='custom')), gr.update(visible=(process_name=='custom'))],
479
+ inputs=dropdown,
480
+ outputs=[textbox, usable_element, processing_model]
481
+ )
482
+
483
+ process_data_btn.click(
484
+ execute_processing,
485
+ inputs= dropdowns + textboxes + usable_elements + processing_models + [process_steps_nb, graph_state, config_state],
486
+ outputs={plan_result, processed_docs_state, graph_state, continue_inference_btn, process_steps_nb, process_steps_title, process_data_btn}
487
+ )
488
+
489
+ process_data_btn = gr.Button(value="Process retrieved docs", visible=False)
490
+
491
+ continue_inference_btn = gr.Button(value="Proceed to next plan step", visible=False)
492
+ plan_result = gr.Markdown(visible = True, label="Result of last plan step")
493
+
494
+ with gr.Tab("Retrieved Docs"):
495
+ retrieved_docs_state = gr.State([])
496
+ with gr.Row():
497
+ gr.Markdown("# Retrieved Docs")
498
+ retrieved_docs_btn = gr.Button("Display retrieved docs")
499
+ retrieved_docs_display = gr.Markdown()
500
+
501
+ processed_docs_state = gr.State([])
502
+ with gr.Row():
503
+ gr.Markdown("# Processed Docs")
504
+ processed_docs_btn = gr.Button("Display processed docs")
505
+ processed_docs_display = gr.Markdown()
506
+
507
+ continue_inference_btn.click(
508
+ continue_inference,
509
+ inputs={graph_state, config_state},
510
+ outputs={continue_inference_btn, graph_state, retrieve_more_docs_btn, continue_to_processing_btn, human_validation_title, plan_result, retrieved_docs_state}
511
+ )
512
+
513
+ launch_inference.click(
514
+ start_inference,
515
+ inputs={config_state, user_query, openai_api_key, groq_api_key, langsmith_api_key},
516
+ outputs=[dataframe_plan, graph_state]
517
+ ).then(
518
+ update_display,
519
+ inputs=dataframe_plan,
520
+ outputs={plan_display, select_step_to_modify, enter_new_step, submit_new_step, continue_inference_btn}
521
+ )
522
+
523
+ retrieve_more_docs_btn.click(
524
+ retrieve_more_docs,
525
+ inputs={graph_state, config_state},
526
+ outputs={graph_state, human_validation_title, retrieved_docs_display}
527
+ )
528
+ continue_to_processing_btn.click(
529
+ continue_to_processing,
530
+ outputs={retrieve_more_docs_btn, continue_to_processing_btn, human_validation_title, process_data_btn, process_steps_nb, process_steps_title}
531
+ )
532
+ retrieved_docs_btn.click(
533
+ fn=lambda docs : format_docs(docs),
534
+ inputs=retrieved_docs_state,
535
+ outputs=retrieved_docs_display
536
+ )
537
+ processed_docs_btn.click(
538
+ fn=lambda docs : format_docs(docs),
539
+ inputs=processed_docs_state,
540
+ outputs=processed_docs_display
541
+ )
542
+
543
+
544
+ test_process_steps = gr.Button(value="Test process steps")
545
+ test_process_steps.click(
546
+ lambda : [gr.update(visible = True), gr.update(visible=True)],
547
+ outputs=[process_steps_nb, process_steps_title]
548
+ )
549
+
550
+
551
+
552
+
553
+
554
+ demo.launch()
555
+
doc_explorer/embeddings_full.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf8ae23f82d734adab5810858bb55c2f13edb06795637b6fd85ada823d722527
3
+ size 55693440
doc_explorer/explorer.py ADDED
@@ -0,0 +1,335 @@
1
+ import gradio as gr
2
+ from vectorstore import FAISSVectorStore
3
+ from langchain_community.graphs import Neo4jGraph
4
+ import os
5
+ import json
6
+ import html
7
+ import pandas as pd
8
+ import time
9
+
10
+ time.sleep(30)
11
+
12
+ os.environ["http_proxy"] = "185.46.212.98:80"
13
+ os.environ["https_proxy"] = "185.46.212.98:80"
14
+ os.environ["NO_PROXY"] = "localhost"
15
+
16
+ neo4j_graph = Neo4jGraph(
17
+ url=os.getenv("NEO4J_URI", "bolt://localhost:7999"),
18
+ username=os.getenv("NEO4J_USERNAME", "neo4j"),
19
+ password=os.getenv("NEO4J_PASSWORD", "graph_test")
20
+ )
21
+
22
+ # Requires ~1GB RAM
23
+ vector_store = FAISSVectorStore(model_name='Alibaba-NLP/gte-large-en-v1.5', dimension=1024, trust_remote_code=True, embedding_file="/usr/src/app/doc_explorer/embeddings_full.npy")
24
+
25
+ # Get document types from Neo4j database
26
+ def get_document_types():
27
+ query = """
28
+ MATCH (n)
29
+ RETURN DISTINCT labels(n) AS document_type
30
+ """
31
+ result = neo4j_graph.query(query)
32
+ return [row["document_type"][0] for row in result]
33
+
34
+ def search(query, doc_types, use_mmr, lambda_param, top_k):
35
+ results, node_ids = vector_store.similarity_search(
36
+ query,
37
+ k=top_k,
38
+ use_mmr=use_mmr,
39
+ lambda_param=lambda_param if use_mmr else None,
40
+ doc_types=doc_types,
41
+ neo4j_graph=neo4j_graph
42
+ )
43
+
44
+ formatted_results = []
45
+ formatted_choices = []
46
+ for i, result in enumerate(results):
47
+ formatted_results.append(f"{i+1}. {result['document']} (Score: {result['score']:.4f})")
48
+ formatted_choices.append(f"{i+1}. {str(result['document'])[:100]} (Score: {result['score']:.4f})")
49
+ return formatted_results, gr.update(choices=formatted_choices, value=[]), node_ids
50
+
51
+ def get_docs_from_ids(graph_data : dict):
52
+ node_ids = [node["id"] for node in graph_data["nodes"]]
53
+ print(node_ids)
54
+ query = """
55
+ MATCH (n)
56
+ WHERE n.id IN $node_ids
57
+ RETURN n.id AS id, n AS doc, labels(n) AS category
58
+ """
59
+
60
+ return neo4j_graph.query(query, {"node_ids" : node_ids}), graph_data["edges"]
61
+
62
+ def get_neighbors_and_graph_data(selected_documents, node_ids, graph_data):
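+ # Expands the vis.js graph: for each selected document, its Neo4j neighbours are fetched and
+ # appended to graph_data ({"nodes": [...], "edges": [...]}), which is then rendered by js_code.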
63
+ if not selected_documents:
64
+ return "No documents selected.", json.dumps(graph_data), graph_data, gr.update(), node_ids
65
+
66
+ selected_indices = [int(doc.split('.')[0]) - 1 for doc in selected_documents]
67
+ selected_node_ids = [node_ids[i] for i in selected_indices]
68
+
69
+ query = """
70
+ MATCH (n)-[r]-(neighbor)
71
+ WHERE n.id IN $node_ids
72
+ RETURN n.id AS source_id, n AS source_text, labels(n) AS source_type,
73
+ neighbor.id AS neighbor_id, neighbor AS neighbor_text,
74
+ labels(neighbor) AS neighbor_type, type(r) AS relationship_type
75
+ """
76
+ results = neo4j_graph.query(query, {"node_ids": selected_node_ids})
77
+
78
+ if not results:
79
+ return "No neighbors found for the selected documents.", json.dumps(graph_data), graph_data, gr.update(), node_ids
80
+
81
+ neighbor_info = {}
82
+ node_set = set([node["id"] for node in graph_data["nodes"]])
83
+
84
+ for row in results:
85
+ source_id = row['source_id']
86
+ if source_id not in neighbor_info:
87
+ neighbor_info[source_id] = {
88
+ 'source_type': row["source_type"][0],
89
+ 'source_text': row['source_text'],
90
+ 'neighbors': []
91
+ }
92
+ if source_id not in node_set:
93
+ graph_data["nodes"].append({
94
+ "id": source_id,
95
+ "label": str(row['source_text'])[:30] + "...",
96
+ "group": row['source_type'][0],
97
+ "title": f"<div class='node-tooltip'><h3>{row['source_type'][0]}</h3><p>{row['source_text']}</p></div>",
98
+ })
99
+ node_set.add(source_id)
100
+
101
+ neighbor_info[source_id]['neighbors'].append(
102
+ f"[{row['relationship_type']}] [{row['neighbor_type'][0]}] {str(row['neighbor_text'])[:200]}"
103
+ )
104
+
105
+ if row['neighbor_id'] not in node_set:
106
+ graph_data["nodes"].append({
107
+ "id": row['neighbor_id'],
108
+ "label": str(row['neighbor_text'])[:30] + "...",
109
+ "group": row['neighbor_type'][0],
110
+ "title": f"<div class='node-tooltip'><h3>{row['neighbor_type'][0]}</h3><p>{html.escape(str(row['neighbor_text']))}</p></div>",
111
+ })
112
+ node_set.add(row['neighbor_id'])
113
+
114
+ edge = {
115
+ "from": source_id,
116
+ "to" : row['neighbor_id'],
117
+ "label": row['relationship_type']
118
+ }
119
+ if edge not in graph_data['edges']:
120
+ graph_data['edges'].append(edge)
121
+
122
+ output = []
123
+ for source_id, info in neighbor_info.items():
124
+ output.append(f"Neighbors for: [{info['source_type']}] {str(info['source_text'])[:100]}")
125
+ output.extend(info['neighbors'])
126
+ output.append("\n\n") # Empty line for separation
127
+
128
+ formatted_choices = []
129
+ node_ids = []
130
+ for i, node in enumerate(graph_data['nodes']):
131
+ formatted_choices.append(f"{i+1}. {str(node['label'])}")
132
+ node_ids.append(node['id'])
133
+
134
+ return "\n".join(output), json.dumps(graph_data), graph_data, gr.update(choices=formatted_choices, value=[]), node_ids
135
+
136
+ def save_docs_to_excel(exported_docs : list[dict], exported_relationships : list[dict]):
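+ # Flattens each exported Neo4j node into one spreadsheet row (node properties + id + first label),
+ # appends one "[REL_TYPE] target_id" line per relationship, and writes the result to an Excel file.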
137
+ cleaned_docs = [dict(doc['doc'], **{'id': doc['id'], 'category': doc['category'][0], "relationships" : ""}) for doc in exported_docs]
138
+ for relationship in exported_relationships:
139
+ for doc in cleaned_docs:
140
+ if doc['id'] == relationship['from']:
141
+ doc["relationships"] += f"[{relationship['label']}] {relationship['to']}\n"
142
+
143
+ df = pd.DataFrame(cleaned_docs)
144
+ df.to_excel("doc_explorer/exported_docs/docs.xlsx")
145
+ return gr.update(value="doc_explorer/exported_docs/docs.xlsx", visible=True)
146
+
147
+ # JavaScript code for graph visualization
148
+ js_code = """
149
+ function(graph_data_str) {
150
+ if (!graph_data_str) return;
151
+ const container = document.getElementById('graph-container');
152
+ container.innerHTML = '';
153
+ let data;
154
+ try {
155
+ data = JSON.parse(graph_data_str);
156
+ } catch (error) {
157
+ console.error("Failed to parse graph data:", error);
158
+ container.innerHTML = "Error: Failed to load graph data.";
159
+ return;
160
+ }
161
+
162
+ data.nodes.forEach(node => {
163
+ const div = document.createElement('div');
164
+ div.innerHTML = node.title;
165
+ node.title = div.firstChild;
166
+ });
167
+
168
+ const nodes = new vis.DataSet(data.nodes);
169
+ const edges = new vis.DataSet(data.edges);
170
+ const options = {
171
+ nodes: {
172
+ shape: 'dot',
173
+ size: 16,
174
+ font: {
175
+ size: 12,
176
+ color: '#000000'
177
+ },
178
+ borderWidth: 2
179
+ },
180
+ edges: {
181
+ width: 1,
182
+ font: {
183
+ size: 10,
184
+ align: 'middle'
185
+ },
186
+ color: { color: '#7A7A7A', hover: '#2B7CE9' }
187
+ },
188
+ physics: {
189
+ forceAtlas2Based: {
190
+ gravitationalConstant: -26,
191
+ centralGravity: 0.005,
192
+ springLength: 230,
193
+ springConstant: 0.18
194
+ },
195
+ maxVelocity: 146,
196
+ solver: 'forceAtlas2Based',
197
+ timestep: 0.35,
198
+ stabilization: { iterations: 150 }
199
+ },
200
+ interaction: {
201
+ hover: true,
202
+ tooltipDelay: 200
203
+ }
204
+ };
205
+ const network = new vis.Network(container, { nodes: nodes, edges: edges }, options);
206
+ }
207
+ """
208
+
209
+ head = """
210
+ <script type="text/javascript" src="https://unpkg.com/vis-network/standalone/umd/vis-network.min.js"></script>
211
+ <link href="https://unpkg.com/vis-network/styles/vis-network.min.css" rel="stylesheet" type="text/css" />
212
+ """
213
+
214
+ custom_css = """
215
+ #graph-container {
216
+ border: 1px solid #ddd;
217
+ border-radius: 4px;
218
+ }
219
+ .vis-tooltip {
220
+ font-family: Arial, sans-serif;
221
+ padding: 10px;
222
+ border-radius: 4px;
223
+ background-color: rgba(255, 255, 255, 0.9);
224
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
225
+ max-width: 300px;
226
+ color: #333;
227
+ word-wrap: break-word;
228
+ overflow-wrap: break-word;
229
+ }
230
+ .node-tooltip {
231
+ width: 100%;
232
+ }
233
+ .node-tooltip h3 {
234
+ margin: 0 0 5px 0;
235
+ font-size: 14px;
236
+ color: #333;
237
+ }
238
+ .node-tooltip p {
239
+ margin: 0;
240
+ font-size: 12px;
241
+ color: #666;
242
+ white-space: normal;
243
+ }
244
+ """
245
+
246
+
247
+ with gr.Blocks(head=head, css=custom_css) as demo:
248
+ with gr.Tab("Search"):
249
+
250
+ gr.Markdown("# Document Search Engine")
251
+ gr.Markdown("Enter a query to search for similar documents. You can filter by document type and use MMR for diverse results.")
252
+
253
+ with gr.Row():
254
+ with gr.Column(scale=3):
255
+ query_input = gr.Textbox(label="Enter your query")
256
+ doc_type_input = gr.Dropdown(choices=get_document_types(), label="Select document type", multiselect=True)
257
+ with gr.Column(scale=2):
258
+ mmr_input = gr.Checkbox(label="Use MMR for diverse results")
259
+ lambda_input = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.5, label="Lambda parameter (MMR diversity)", visible=False)
260
+ top_k_input = gr.Slider(minimum=1, maximum=20, step=1, value=5, label="Number of results")
261
+
262
+ search_button = gr.Button("Search")
263
+ results_output = gr.Textbox(label="Search Results")
264
+
265
+ selected_documents = gr.Dropdown(label="Select documents to view their neighbors", choices=[], multiselect=True, interactive=True)
266
+
267
+ with gr.Row():
268
+ neighbor_search_button = gr.Button("Find Neighbors")
269
+ send_to_export = gr.Button("Send docs to export Tab")
270
+
271
+ neighbors_output = gr.Textbox(label="Document Neighbors")
272
+
273
+ graph_data_state = gr.State({"nodes": [], "edges": []})
274
+ graph_data_str = gr.Textbox(visible=False)
275
+ graph_container = gr.HTML('<div id="graph-container" style="height: 600px;"> Hey ! </div>')
276
+
277
+ node_ids = gr.State([])
278
+ exported_docs = gr.State([])
279
+ exported_relationships = gr.State([])
280
+
281
+ def update_lambda_visibility(use_mmr):
282
+ return gr.update(visible=use_mmr)
283
+
284
+ mmr_input.change(fn=update_lambda_visibility, inputs=mmr_input, outputs=lambda_input)
285
+
286
+ search_button.click(
287
+ fn=search,
288
+ inputs=[query_input, doc_type_input, mmr_input, lambda_input, top_k_input],
289
+ outputs=[results_output, selected_documents, node_ids]
290
+ )
291
+
292
+ neighbor_search_button.click(
293
+ fn=get_neighbors_and_graph_data,
294
+ inputs=[selected_documents, node_ids, graph_data_state],
295
+ outputs=[neighbors_output, graph_data_str, graph_data_state, selected_documents, node_ids]
296
+ ).then(
297
+ fn=None,
298
+ inputs=graph_data_str,
299
+ outputs=None,
300
+ js=js_code,
301
+ )
302
+
303
+ send_to_export.click(
304
+ fn=get_docs_from_ids,
305
+ inputs=graph_data_state,
306
+ outputs=[exported_docs, exported_relationships]
307
+ )
308
+ # gr.Examples(
309
+ # examples=[
310
+ # ["What is machine learning?", "Article", True, 0.5, 5],
311
+ # ["How to implement a neural network?", "Tutorial", False, 0.5, 3],
312
+ # ["Latest advancements in NLP", "Research Paper", True, 0.7, 10]
313
+ # ],
314
+ # inputs=[query_input, doc_type_input, mmr_input, lambda_input, top_k_input]
315
+ # )
316
+ with gr.Tab("Export"):
317
+ with gr.Row():
318
+ exported_docs_btn = gr.Button("Display exported docs")
319
+ exported_excel_btn = gr.Button("Export to excel")
320
+ exported_excel = gr.File(visible=False)
321
+
322
+ exported_docs_display = gr.Markdown(visible=False)
323
+
324
+ exported_docs_btn.click(
325
+ fn= lambda docs: gr.update(value='\n\n'.join([f"[{doc['category'][0]}]\n{doc['doc']}\n\n" for doc in docs]), visible=True),
326
+ inputs=exported_docs,
327
+ outputs=exported_docs_display
328
+ )
329
+ exported_excel_btn.click(
330
+ fn=save_docs_to_excel,
331
+ inputs=[exported_docs, exported_relationships],
332
+ outputs=exported_excel
333
+ )
334
+
335
+ demo.launch()
doc_explorer/exported_docs/.gitkeep ADDED
@@ -0,0 +1 @@
1
+ a
doc_explorer/vectorstore.py ADDED
@@ -0,0 +1,151 @@
1
+ import faiss
2
+ import numpy as np
3
+ from sentence_transformers import SentenceTransformer
4
+ from typing import List, Optional, Tuple
5
+ from langchain_community.graphs import Neo4jGraph
6
+ import pickle
7
+
8
+ class FAISSVectorStore:
9
+ def __init__(self, model_name: str = None, dimension: int = 384, embedding_file: str = None, trust_remote_code = False):
10
+ self.model = SentenceTransformer(model_name, trust_remote_code=trust_remote_code) if model_name is not None else None
11
+ self.index = faiss.IndexFlatIP(dimension)
12
+ self.dimension = dimension
13
+ if embedding_file:
14
+ self.load_embeddings(embedding_file)
15
+
16
+ def load_embeddings(self, file_path: str):
17
+ if file_path.endswith('.pkl'):
18
+ with open(file_path, 'rb') as f:
19
+ embeddings = pickle.load(f)
20
+ elif file_path.endswith('.npy'):
21
+ embeddings = np.load(file_path)
22
+ else:
23
+ raise ValueError("Unsupported file format. Use .pkl or .npy")
24
+
25
+ self.add_embeddings(embeddings)
26
+
27
+ def add_embeddings(self, embeddings: np.ndarray):
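+ # L2-normalising the embeddings before adding them to an inner-product index (IndexFlatIP)
+ # makes the returned scores equivalent to cosine similarity.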
28
+ faiss.normalize_L2(embeddings)
29
+ self.index.add(embeddings)
30
+
31
+ def similarity_search(self, query: str, k: int = 5, use_mmr: bool = False, lambda_param: float = 0.5, doc_types: list[str] = None, neo4j_graph: Neo4jGraph = None):
32
+ query_vector = self.model.encode([query])
33
+ faiss.normalize_L2(query_vector)
34
+
35
+ if use_mmr:
36
+ return self._mmr_search(query_vector, k, lambda_param, neo4j_graph, doc_types)
37
+ else:
38
+ return self._simple_search(query_vector, k, neo4j_graph, doc_types)
39
+
40
+ def _simple_search(self, query_vector: np.ndarray, k: int, neo4j_graph: Neo4jGraph, doc_types : list[str] = None) -> List[dict]:
41
+ distances, indices = self.index.search(query_vector, k)
42
+
43
+ results = []
44
+ results_idx = []
45
+ for i, idx in enumerate(indices[0]):
46
+ document = self._get_text_by_index(neo4j_graph, idx, doc_types)
47
+ if document is not None:
48
+ results.append({
49
+ 'document': document,
50
+ 'score': distances[0][i]
51
+ })
52
+ results_idx.append(idx)
53
+
54
+ return results, results_idx
55
+
56
+ def _mmr_search(self, query_vector: np.ndarray, k: int, lambda_param: float, neo4j_graph: Neo4jGraph, doc_types: list[str] = None) -> Tuple[List[dict], List[int]]:
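+ # Maximal Marginal Relevance: fetch 2*k candidates, then greedily select k of them by maximising
+ # lambda * similarity(query, doc) - (1 - lambda) * max_similarity(doc, already_selected).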
57
+ initial_k = min(k * 2, self.index.ntotal)
58
+ distances, indices = self.index.search(query_vector, initial_k)
59
+
60
+ # Reconstruct embeddings for the initial results
61
+ initial_embeddings = self._reconstruct_embeddings(indices[0])
62
+
63
+ selected_indices = []
64
+ unselected_indices = list(range(len(indices[0])))
65
+
66
+ for _ in range(min(k, len(indices[0]))):
67
+ mmr_scores = []
68
+ for i in unselected_indices:
69
+ if not selected_indices:
70
+ mmr_scores.append((i, distances[0][i]))
71
+ else:
72
+ embedding_i = initial_embeddings[i]
73
+ redundancy = max(self._cosine_similarity(embedding_i, initial_embeddings[j]) for j in selected_indices)
74
+ mmr_scores.append((i, lambda_param * distances[0][i] - (1 - lambda_param) * redundancy))
75
+
76
+ selected_idx = max(mmr_scores, key=lambda x: x[1])[0]
77
+ selected_indices.append(selected_idx)
78
+ unselected_indices.remove(selected_idx)
79
+
80
+ results = []
81
+ results_idx = []
82
+ for idx in selected_indices:
83
+ document = self._get_text_by_index(neo4j_graph, indices[0][idx], doc_types)
84
+ if document is not None:
85
+ results.append({
86
+ 'document': document,
87
+ 'score': distances[0][idx]
88
+ })
89
+ results_idx.append(idx)
90
+
91
+ return results, results_idx
92
+
93
+ def _reconstruct_embeddings(self, indices: np.ndarray) -> np.ndarray:
94
+ return self.index.reconstruct_batch(indices)
95
+
96
+ def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> float:
97
+ return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
98
+
99
+ def _get_text_by_index(self, neo4j_graph, index, doc_types):
100
+ if not doc_types:
101
+ query = f"""
102
+ MATCH (n)
103
+ WHERE n.id = $index
104
+ RETURN n AS document, labels(n) AS document_type, n.id AS node_id
105
+ """
106
+ result = neo4j_graph.query(query, {"index": index})
107
+ else:
108
+ for doc_type in doc_types:
109
+ query = f"""
110
+ MATCH (n:{doc_type})
111
+ WHERE n.id = $index
112
+ RETURN n AS document, labels(n) AS document_type, n.id AS node_id
113
+ """
114
+ result = neo4j_graph.query(query, {"index": index})
115
+ if result:
116
+ break
117
+
118
+ if result:
119
+ return f"[{result[0]['document_type'][0]}] {result[0]['document']}"
120
+ return None
121
+
122
+
123
+ # Example usage
124
+ if __name__ == "__main__":
125
+ # Initialize the vector store with embedding file
126
+ vector_store = FAISSVectorStore(dimension=384, embedding_file="path/to/your/embeddings.pkl") # or .npy file
127
+
128
+ # Initialize Neo4jGraph (replace with your actual Neo4j connection details)
129
+ neo4j_graph = Neo4jGraph(
130
+ url="bolt://localhost:7687",
131
+ username="neo4j",
132
+ password="password"
133
+ )
134
+
135
+ # Perform a similarity search with and without MMR
136
+ query = "How to start a long journey"
137
+ results_simple, _ = vector_store.similarity_search(query, k=5, use_mmr=False, neo4j_graph=neo4j_graph)
138
+ results_mmr, _ = vector_store.similarity_search(query, k=5, use_mmr=True, lambda_param=0.5, neo4j_graph=neo4j_graph)
139
+
140
+ # Print the results
141
+ print(f"Top 5 similar texts for query: '{query}' (without MMR)")
142
+ for i, result in enumerate(results_simple, 1):
143
+ print(f"{i}. Text: {result['document']}")
144
+ print(f" Score: {result['score']}")
145
+ print()
146
+
147
+ print(f"Top 5 similar texts for query: '{query}' (with MMR)")
148
+ for i, result in enumerate(results_mmr, 1):
149
+ print(f"{i}. Text: {result['document']}")
150
+ print(f" Score: {result['score']}")
151
+ print()
flagged/log.csv ADDED
@@ -0,0 +1,39 @@
1
+ output,flag,username,timestamp
2
+ "'
3
+ <style>
4
+ table {
5
+ border-collapse: collapse;
6
+ width: 100%;
7
+ }
8
+ th, td {
9
+ border: 1px solid black;
10
+ padding: 8px;
11
+ text-align: left;
12
+ vertical-align: top;
13
+ white-space: pre-wrap;
14
+ max-width: 300px;
15
+ max-height: 100px;
16
+ overflow-y: auto;
17
+ }
18
+ th {
19
+ background-color: #f2f2f2;
20
+ }
21
+ </style>
22
+ <table border=""1"" class=""dataframe"">
23
+ <thead>
24
+ <tr style=""text-align: right;"">
25
+ <th>Column1</th>
26
+ <th>Column2</th>
27
+ </tr>
28
+ </thead>
29
+ <tbody>
30
+ <tr>
31
+ <td>Line 1\nLine 2\nLine 3</td>
32
+ <td>Short text</td>
33
+ </tr>
34
+ <tr>
35
+ <td>Single line</td>
36
+ <td>Very long text that goes on and on and might need scrolling in the cell</td>
37
+ </tr>
38
+ </tbody>
39
+ </table>",,,2024-07-29 15:18:50.387842
images/flowchart_graphrag.png ADDED
images/flowchart_graphrag_dark.png ADDED
images/flowchart_graphrag_final.png ADDED
images/graph_png.png ADDED
ki_gen/data_processor.py ADDED
@@ -0,0 +1,183 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ from langchain_openai import ChatOpenAI
5
+ from langchain_core.output_parsers import StrOutputParser
6
+ from langchain_core.prompts import ChatPromptTemplate
7
+ from langchain_groq import ChatGroq
8
+ from langgraph.graph import StateGraph
9
+ from llmlingua import PromptCompressor
10
+
11
+ from ki_gen.utils import ConfigSchema, DocProcessorState, get_model, format_doc
12
+
13
+
14
+
15
+
16
+ # compressed_prompt = llm_lingua.compress_prompt(prompt, instruction="", question="", target_token=200)
17
+
18
+ ## Or use the quantation model, like TheBloke/Llama-2-7b-Chat-GPTQ, only need <8GB GPU memory.
19
+ ## Before that, you need to pip install optimum auto-gptq
20
+ # llm_lingua = PromptCompressor("TheBloke/Llama-2-7b-Chat-GPTQ", model_config={"revision": "main"})
21
+
22
+
23
+
24
+ # Requires ~2GB of RAM
25
+ def get_llm_lingua(compress_method:str = "llm_lingua2"):
26
+
27
+ # Requires ~2GB memory
28
+ if compress_method == "llm_lingua2":
29
+ llm_lingua2 = PromptCompressor(
30
+ model_name="microsoft/llmlingua-2-xlm-roberta-large-meetingbank",
31
+ use_llmlingua2=True,
32
+ device_map="cpu"
33
+ )
34
+ return llm_lingua2
35
+
36
+ # Requires ~8GB memory
37
+ elif compress_method == "llm_lingua":
38
+ llm_lingua = PromptCompressor(
39
+ model_name="microsoft/phi-2",
40
+ device_map="cpu"
41
+ )
42
+ return llm_lingua
43
+ raise ValueError("Incorrect compression method, should be 'llm_lingua' or 'llm_lingua2'")
44
+
45
+
46
+
47
+ def compress(state: DocProcessorState, config: ConfigSchema):
48
+ """
49
+ This node compresses last processing result for each doc using llm_lingua
50
+ """
51
+ doc_process_histories = state["docs_in_processing"]
52
+ llm_lingua = get_llm_lingua(config["configurable"].get("compression_method") or "llm_lingua2")
53
+ for doc_process_history in doc_process_histories:
54
+ doc_process_history.append(llm_lingua.compress_prompt(
55
+ str(doc_process_history[-1]),
56
+ rate=config["configurable"].get("compress_rate") or 0.33,
57
+ force_tokens=config["configurable"].get("force_tokens") or ['\n', '?', '.', '!', ',']
58
+ )["compressed_prompt"]
59
+ )
60
+
61
+ return {"docs_in_processing": doc_process_histories, "current_process_step" : state["current_process_step"] + 1}
62
+
63
+ def summarize_docs(state: DocProcessorState, config: ConfigSchema):
64
+ """
65
+ This node summarizes all docs in state["valid_docs"]
66
+ """
67
+
68
+ prompt = """You are a 3GPP standardization expert.
69
+ Summarize the provided document in simple technical English for other experts in the field.
70
+
71
+ Document:
72
+ {document}"""
73
+ sysmsg = ChatPromptTemplate.from_messages([
74
+ ("system", prompt)
75
+ ])
76
+ model = config["configurable"].get("summarize_model") or "mixtral-8x7b-32768"
77
+ doc_process_histories = state["docs_in_processing"]
78
+ if model == "gpt-4o":
79
+ llm_summarize = ChatOpenAI(model='gpt-4o', base_url="https://llm.synapse.thalescloud.io/")
80
+ else:
81
+ llm_summarize = ChatGroq(model=model)
82
+ summarize_chain = sysmsg | llm_summarize | StrOutputParser()
83
+
84
+ for doc_process_history in doc_process_histories:
85
+ doc_process_history.append(summarize_chain.invoke({"document" : str(doc_process_history[-1])}))
86
+
87
+ return {"docs_in_processing": doc_process_histories, "current_process_step": state["current_process_step"] + 1}
88
+
89
+ def custom_process(state: DocProcessorState):
90
+ """
91
+ Custom processing step, params are stored in a dict in state["process_steps"][state["current_process_step"]]
92
+ processing_model : the LLM which will perform the processing
93
+ context : the previous processing results to send as context to the LLM
94
+ prompt : the prompt/task which will be appended to the context before sending to the LLM
95
+ """
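+ # Illustrative step dict (as built in app.py): {"prompt": "...", "context": [0, 1], "processing_model": "gpt-4o"}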
96
+
97
+ processing_params = state["process_steps"][state["current_process_step"]]
98
+ model = processing_params.get("processing_model") or "mixtral-8x7b-32768"
99
+ user_prompt = processing_params["prompt"]
100
+ context = processing_params.get("context") or [0]
101
+ doc_process_histories = state["docs_in_processing"]
102
+ if not isinstance(context, list):
103
+ context = [context]
104
+
105
+ processing_chain = get_model(model=model) | StrOutputParser()
106
+
107
+ for doc_process_history in doc_process_histories:
108
+ context_str = ""
109
+ for i, context_element in enumerate(context):
110
+ context_str += f"### TECHNICAL INFORMATION {i+1} \n {doc_process_history[context_element]}\n\n"
111
+ doc_process_history.append(processing_chain.invoke(context_str + user_prompt))
112
+
113
+ return {"docs_in_processing" : doc_process_histories, "current_process_step" : state["current_process_step"] + 1}
114
+
115
+ def final(state: DocProcessorState):
116
+ """
117
+ A node to store the final results of processing in the 'valid_docs' field
118
+ """
119
+ return {"valid_docs" : [doc_process_history[-1] for doc_process_history in state["docs_in_processing"]]}
120
+
121
+ # TODO : remove this node and use conditional entry point instead
122
+ def get_process_steps(state: DocProcessorState, config: ConfigSchema):
123
+ """
124
+ Dummy node
125
+ """
126
+ # if not process_steps:
127
+ # process_steps = eval(input("Enter processing steps: "))
128
+ return {"current_process_step": 0, "docs_in_processing" : [[format_doc(doc)] for doc in state["valid_docs"]]}
129
+
130
+
131
+ def next_processor_step(state: DocProcessorState):
132
+ """
133
+ Conditional edge function to go to next processing step
134
+ """
135
+ process_steps = state["process_steps"]
136
+ if state["current_process_step"] < len(process_steps):
137
+ step = process_steps[state["current_process_step"]]
138
+ if isinstance(step, dict):
139
+ step = "custom"
140
+ else:
141
+ step = "final"
142
+
143
+ return step
144
+
145
+
146
+ def build_data_processor_graph(memory):
147
+ """
148
+ Builds the data processor graph
149
+ """
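+ # Flow: __start__ -> get_process_steps -> (summarize | compress | custom)* -> final -> __end__,
+ # with the next node chosen at each step by next_processor_step from state["process_steps"].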
150
+
151
+ graph_builder_doc_processor = StateGraph(DocProcessorState)
152
+
153
+ graph_builder_doc_processor.add_node("get_process_steps", get_process_steps)
154
+ graph_builder_doc_processor.add_node("summarize", summarize_docs)
155
+ graph_builder_doc_processor.add_node("compress", compress)
156
+ graph_builder_doc_processor.add_node("custom", custom_process)
157
+ graph_builder_doc_processor.add_node("final", final)
158
+
159
+ graph_builder_doc_processor.add_edge("__start__", "get_process_steps")
160
+ graph_builder_doc_processor.add_conditional_edges(
161
+ "get_process_steps",
162
+ next_processor_step,
163
+ {"compress" : "compress", "final": "final", "summarize": "summarize", "custom" : "custom"}
164
+ )
165
+ graph_builder_doc_processor.add_conditional_edges(
166
+ "summarize",
167
+ next_processor_step,
168
+ {"compress" : "compress", "final": "final", "custom" : "custom"}
169
+ )
170
+ graph_builder_doc_processor.add_conditional_edges(
171
+ "compress",
172
+ next_processor_step,
173
+ {"summarize" : "summarize", "final": "final", "custom" : "custom"}
174
+ )
175
+ graph_builder_doc_processor.add_conditional_edges(
176
+ "custom",
177
+ next_processor_step,
178
+ {"summarize" : "summarize", "final": "final", "compress" : "compress", "custom" : "custom"}
179
+ )
180
+ graph_builder_doc_processor.add_edge("final", "__end__")
181
+
182
+ graph_doc_processor = graph_builder_doc_processor.compile(checkpointer=memory)
183
+ return graph_doc_processor
ki_gen/data_retriever.py ADDED
@@ -0,0 +1,351 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ import re
5
+ from random import shuffle, sample
6
+
7
+ from langchain_groq import ChatGroq
8
+ from langchain_openai import ChatOpenAI
9
+ from langchain_core.messages import HumanMessage
10
+ from langchain_community.graphs import Neo4jGraph
11
+ from langchain_community.chains.graph_qa.cypher_utils import CypherQueryCorrector, Schema
12
+ from langchain_core.output_parsers import StrOutputParser
13
+ from langchain_core.prompts import ChatPromptTemplate
14
+ from langchain_core.pydantic_v1 import BaseModel, Field
15
+ from langchain_groq import ChatGroq
16
+
17
+ from langgraph.graph import StateGraph
18
+
19
+ from llmlingua import PromptCompressor
20
+
21
+ from ki_gen.prompts import (
22
+ CYPHER_GENERATION_PROMPT,
23
+ CONCEPT_SELECTION_PROMPT,
24
+ BINARY_GRADER_PROMPT,
25
+ SCORE_GRADER_PROMPT,
26
+ RELEVANT_CONCEPTS_PROMPT,
27
+ )
28
+ from ki_gen.utils import ConfigSchema, DocRetrieverState, get_model, format_doc
29
+
30
+
31
+
32
+
33
+ def extract_cypher(text: str) -> str:
34
+ """Extract Cypher code from a text.
35
+
36
+ Args:
37
+ text: Text to extract Cypher code from.
38
+
39
+ Returns:
40
+ A list of candidate Cypher snippets extracted from the text (falling back to the raw text).
41
+ """
42
+ # The pattern to find Cypher code enclosed in triple backticks
43
+ pattern_1 = r"```cypher\n(.*?)```"
44
+ pattern_2 = r"```\n(.*?)```"
45
+
46
+ # Find all matches in the input text
47
+ matches_1 = re.findall(pattern_1, text, re.DOTALL)
48
+ matches_2 = re.findall(pattern_2, text, re.DOTALL)
49
+ return [
50
+ matches_1[0] if matches_1 else text,
51
+ matches_2[0] if matches_2 else text,
52
+ text
53
+ ]
54
+
55
+ def get_cypher_gen_chain(model: str = "openai"):
56
+ """
57
+ Returns cypher gen chain using specified model for generation
58
+ This is used when the 'auto' cypher generation method has been configured
59
+ """
60
+
61
+ if model=="openai":
62
+ llm_cypher_gen = ChatOpenAI(model='gpt-4o', base_url="https://llm.synapse.thalescloud.io/")
63
+ else:
64
+ llm_cypher_gen = ChatGroq(model = "mixtral-8x7b-32768")
65
+ cypher_gen_chain = CYPHER_GENERATION_PROMPT | llm_cypher_gen | StrOutputParser() | extract_cypher
66
+ return cypher_gen_chain
67
+
68
+ def get_concept_selection_chain(model: str = "openai"):
69
+ """
70
+ Returns a chain to select the most relevant topic using specified model for generation.
71
+ This is used when the 'guided' cypher generation method has been configured
72
+ """
73
+
74
+ if model == "openai":
75
+ llm_topic_selection = ChatOpenAI(model='gpt-4o', base_url="https://llm.synapse.thalescloud.io/")
76
+ else:
77
+ llm_topic_selection = ChatGroq(model="llama3-70b-8192", max_tokens=8192)
78
+ print(f"FOUND LLM TOPIC SELECTION FOR THE CONCEPT SELECTION PROMPT : {llm_topic_selection}")
79
+ topic_selection_chain = CONCEPT_SELECTION_PROMPT | llm_topic_selection | StrOutputParser()
80
+ return topic_selection_chain
81
+
82
+ def get_concepts(graph: Neo4jGraph):
83
+ concept_cypher = "MATCH (c:Concept) return c"
84
+ if isinstance(graph, Neo4jGraph):
85
+ concepts = graph.query(concept_cypher)
86
+ else:
87
+ user_input = input("Topics : ")
88
+ concepts = eval(user_input)
89
+
90
+ concepts_name = [concept['c']['name'] for concept in concepts]
91
+ return concepts_name
92
+
93
+ def get_related_concepts(graph: Neo4jGraph, question: str):
94
+ concepts = get_concepts(graph)
95
+ llm = get_model(model='gpt-4o')
96
+ print(f"this is the llm variable : {llm}")
97
+ def parse_answer(llm_answer : str):
98
+ print(f"This the llm_answer : {llm_answer}")
99
+ return re.split(r"\n(?:\d)+\.\s", llm_answer.split("Concepts:")[1])[1:]
100
+ related_concepts_chain = RELEVANT_CONCEPTS_PROMPT | llm | StrOutputParser() | parse_answer
101
+
102
+ related_concepts_raw = related_concepts_chain.invoke({"user_query" : question, "concepts" : '\n'.join(concepts)})
103
+
104
+ # We clean up the list we received from the LLM in case there were some hallucinations
105
+ related_concepts_cleaned = []
106
+ for related_concept in related_concepts_raw:
107
+ # If the concept returned from the LLM is in the list we keep it
108
+ if related_concept in concepts:
109
+ related_concepts_cleaned.append(related_concept)
110
+ else:
111
+ # The LLM sometimes only forgets a few words from the concept name
112
+ # We check if the generated concept is a substring of an existing one and if it is the case add it to the list
113
+ for concept in concepts:
114
+ if related_concept in concept:
115
+ related_concepts_cleaned.append(concept)
116
+ break
117
+
118
+ # TODO : Add concepts found via similarity search
119
+ return related_concepts_cleaned
120
+
121
+ def build_concept_string(graph: Neo4jGraph, concept_list: list[str]):
122
+ concept_string = ""
123
+ for concept in concept_list:
124
+ concept_description_query = f"""
125
+ MATCH (c:Concept {{name: "{concept}" }}) RETURN c.description
126
+ """
127
+ concept_description = graph.query(concept_description_query)[0]['c.description']
128
+ concept_string += f"name: {concept}\ndescription: {concept_description}\n\n"
129
+ return concept_string
130
+
131
+ def get_global_concepts(graph: Neo4jGraph):
132
+ concept_cypher = "MATCH (gc:GlobalConcept) return gc"
133
+ if isinstance(graph, Neo4jGraph):
134
+ concepts = graph.query(concept_cypher)
135
+ else:
136
+ user_input = input("Topics : ")
137
+ concepts = eval(user_input)
138
+
139
+ concepts_name = [concept['gc']['name'] for concept in concepts]
140
+ return concepts_name
141
+
142
+ def generate_cypher(state: DocRetrieverState, config: ConfigSchema):
143
+ """
144
+ The node where the cypher is generated
145
+ """
146
+
147
+ graph = config["configurable"].get("graph")
148
+ question = state['query']
149
+ related_concepts = get_related_concepts(graph, question)
150
+ cyphers = []
151
+
152
+ if config["configurable"].get("cypher_gen_method") == 'auto':
153
+ cypher_gen_chain = get_cypher_gen_chain()
154
+ cyphers = cypher_gen_chain.invoke({
155
+ "schema": graph.schema,
156
+ "question": question,
157
+ "concepts": related_concepts
158
+ })
159
+
160
+
161
+ if config["configurable"].get("cypher_gen_method") == 'guided':
162
+ concept_selection_chain = get_concept_selection_chain()
163
+ print(f"Concept selection chain is : {concept_selection_chain}")
164
+ selected_topic = concept_selection_chain.invoke({"question" : question, "concepts": get_concepts(graph)})
165
+ print(f"Selected topic are : {selected_topic}")
166
+ cyphers = [generate_cypher_from_topic(selected_topic, state['current_plan_step'])]
167
+ print(f"Cyphers are : {cyphers}")
168
+
169
+
170
+ if config["configurable"].get("validate_cypher"):
171
+ corrector_schema = [Schema(el["start"], el["type"], el["end"]) for el in graph.structured_schema.get("relationships")]
172
+ cypher_corrector = CypherQueryCorrector(corrector_schema)
173
+ cyphers = [cypher_corrector(cypher) for cypher in cyphers]
174
+
175
+ return {"cyphers" : cyphers}
176
+
177
+ def generate_cypher_from_topic(selected_concept: str, plan_step: int):
178
+ """
179
+ Helper function used when the 'guided' cypher generation method has been configured
180
+ """
181
+
182
+ print(f"L.176 PLAN STEP : {plan_step}")
183
+ cypher_el = "(n) return n.title, n.description"
184
+ match plan_step:
185
+ case 0:
186
+ cypher_el = "(ts:TechnicalSpecification) RETURN ts.title, ts.scope, ts.description"
187
+ case 1:
188
+ cypher_el = "(rp:ResearchPaper) RETURN rp.title, rp.abstract"
189
+ case 2:
190
+ cypher_el = "(ki:KeyIssue) RETURN ki.description"
191
+ return f"MATCH (c:Concept {{name:'{selected_concept}'}})-[:RELATED_TO]-{cypher_el}"
192
+
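As a quick worked example (the concept name is made up), generate_cypher_from_topic("Network Slicing", 1) returns the following query, since plan step 1 targets research papers:

# MATCH (c:Concept {name:'Network Slicing'})-[:RELATED_TO]-(rp:ResearchPaper) RETURN rp.title, rp.abstract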
193
+ def get_docs(state:DocRetrieverState, config:ConfigSchema):
194
+ """
195
+ This node retrieves docs from the graph using the generated cypher
196
+ """
197
+ graph = config["configurable"].get("graph")
198
+ output = []
199
+ if graph is not None:
200
+ for cypher in state["cyphers"]:
201
+ try:
202
+ output = graph.query(cypher)
203
+ break
204
+ except Exception as e:
205
+ print("Failed to retrieve docs : {e}")
206
+
207
+ # Clean up the docs we received as there may be duplicates depending on the cypher query
208
+ all_docs = []
209
+ for doc in output:
210
+ unwinded_doc = {}
211
+ for key in doc:
212
+ if isinstance(doc[key], dict):
213
+ all_docs.append(doc[key])
214
+ else:
215
+ unwinded_doc.update({key: doc[key]})
216
+ if unwinded_doc:
217
+ all_docs.append(unwinded_doc)
218
+
219
+
220
+ filtered_docs = []
221
+ for doc in all_docs:
222
+ if doc not in filtered_docs:
223
+ filtered_docs.append(doc)
224
+
225
+ return {"docs": filtered_docs}
226
+
227
+
228
+
229
+
230
+
231
+ # Data model
232
+ class GradeDocumentsBinary(BaseModel):
233
+ """Binary score for relevance check on retrieved documents."""
234
+
235
+ binary_score: str = Field(
236
+ description="Documents are relevant to the question, 'yes' or 'no'"
237
+ )
238
+
239
+ # LLM with function call
240
+ # llm_grader_binary = ChatGroq(model="mixtral-8x7b-32768", temperature=0)
241
+
242
+ def get_binary_grader(model="mixtral-8x7b-32768"):
243
+ """
244
+ Returns a binary grader to evaluate the relevance of documents, using the specified model for generation
245
+ This is used when the 'binary' evaluation method has been configured
246
+ """
247
+
248
+
249
+ if model == "gpt-4o":
250
+ llm_grader_binary = ChatOpenAI(model='gpt-4o', base_url="https://llm.synapse.thalescloud.io/", temperature=0)
251
+ else:
252
+ llm_grader_binary = ChatGroq(model="mixtral-8x7b-32768", temperature=0)
253
+ structured_llm_grader_binary = llm_grader_binary.with_structured_output(GradeDocumentsBinary)
254
+ retrieval_grader_binary = BINARY_GRADER_PROMPT | structured_llm_grader_binary
255
+ return retrieval_grader_binary
256
+
257
+
258
+ class GradeDocumentsScore(BaseModel):
259
+ """Score for relevance check on retrieved documents."""
260
+
261
+ score: float = Field(
262
+ description="Documents are relevant to the question, score between 0 (completely irrelevant) and 1 (perfectly relevant)"
263
+ )
264
+
265
+ def get_score_grader(model="mixtral-8x7b-32768"):
266
+ """
267
+ Returns a score grader to evaluate the relevance of documents, using the specified model for generation
268
+ This is used when the 'score' evaluation method has been configured
269
+ """
270
+ if model == "gpt-4o":
271
+ llm_grader_score = ChatOpenAI(model='gpt-4o', base_url="https://llm.synapse.thalescloud.io/", temperature=0)
272
+ else:
273
+ llm_grader_score = ChatGroq(model="mixtral-8x7b-32768", temperature = 0)
274
+ structured_llm_grader_score = llm_grader_score.with_structured_output(GradeDocumentsScore)
275
+ retrieval_grader_score = SCORE_GRADER_PROMPT | structured_llm_grader_score
276
+ return retrieval_grader_score
277
+
278
+
279
+ def eval_doc(doc, query, method="binary", threshold=0.7, eval_model="mixtral-8x7b-32768"):
280
+ '''
281
+ doc : the document to evaluate
282
+ query : the query to which the doc should be relevant
283
+ method : "binary" or "score"
284
+ threshold : for "score" method, score above which a doc is considered relevant
285
+ '''
286
+ if method == "binary":
287
+ retrieval_grader_binary = get_binary_grader(model=eval_model)
288
+ return 1 if (retrieval_grader_binary.invoke({"question": query, "document":doc}).binary_score == 'yes') else 0
289
+ elif method == "score":
290
+ retrieval_grader_score = get_score_grader(model=eval_model)
291
+ score = retrieval_grader_score.invoke({"query": query, "document":doc}).score or None
292
+ if score is not None:
293
+ return score if score >= threshold else 0
294
+ else:
295
+ # Couldn't parse score, marking document as relevant by default
296
+ return 1
297
+ else:
298
+ raise ValueError("Invalid method")
299
+
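A minimal usage sketch of eval_doc; the document text and query are invented:

relevance = eval_doc(
    doc="Title: Study on positioning enhancements ...",
    query="key issues for 5G positioning accuracy",
    method="score",
    threshold=0.7,
)
# With method="score" the call returns the raw score when it is >= threshold and 0 otherwise;
# with method="binary" it returns 1 for a 'yes' grade and 0 for a 'no' grade.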
300
+ def eval_docs(state: DocRetrieverState, config: ConfigSchema):
301
+ """
302
+ This node evaluates the retrieved docs and keeps only those deemed relevant to the query
303
+ """
304
+
305
+ eval_method = config["configurable"].get("eval_method") or "binary"
306
+ MAX_DOCS = config["configurable"].get("max_docs") or 15
307
+ valid_doc_scores = []
308
+
309
+ for doc in sample(state["docs"], min(25, len(state["docs"]))):
310
+ score = eval_doc(
311
+ doc=format_doc(doc),
312
+ query=state["query"],
313
+ method=eval_method,
314
+ threshold=config["configurable"].get("eval_threshold") or 0.7,
315
+ eval_model = config["configurable"].get("eval_model") or "mixtral-8x7b-32768"
316
+ )
317
+ if score:
318
+ valid_doc_scores.append((doc, score))
319
+
320
+ if eval_method == 'score':
321
+ # Get at most MAX_DOCS items with the highest score if score method was used
322
+ valid_docs = sorted(valid_doc_scores, key=lambda x: x[1], reverse=True)
323
+ valid_docs = [valid_doc[0] for valid_doc in valid_docs[:MAX_DOCS]]
324
+ else:
325
+ # Get at most MAX_DOCS items at random if binary method was used
326
+ shuffle(valid_doc_scores)
327
+ valid_docs = [valid_doc[0] for valid_doc in valid_doc_scores[:MAX_DOCS]]
328
+
329
+ return {"valid_docs": valid_docs + (state["valid_docs"] or [])}
330
+
331
+
332
+
333
+ def build_data_retriever_graph(memory):
334
+ """
335
+ Builds the data_retriever graph
336
+ """
337
+ graph_builder_doc_retriever = StateGraph(DocRetrieverState)
338
+
339
+ graph_builder_doc_retriever.add_node("generate_cypher", generate_cypher)
340
+ graph_builder_doc_retriever.add_node("get_docs", get_docs)
341
+ graph_builder_doc_retriever.add_node("eval_docs", eval_docs)
342
+
343
+
344
+ graph_builder_doc_retriever.add_edge("__start__", "generate_cypher")
345
+ graph_builder_doc_retriever.add_edge("generate_cypher", "get_docs")
346
+ graph_builder_doc_retriever.add_edge("get_docs", "eval_docs")
347
+ graph_builder_doc_retriever.add_edge("eval_docs", "__end__")
348
+
349
+ graph_doc_retriever = graph_builder_doc_retriever.compile(checkpointer=memory)
350
+
351
+ return graph_doc_retriever
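A hedged sketch of how the compiled retriever subgraph could be exercised on its own; the checkpointer, graph object, query and config values below are assumptions, and a populated Neo4j instance plus the relevant API keys would be required:

from langgraph.checkpoint.memory import MemorySaver

retriever = build_data_retriever_graph(MemorySaver())
result = retriever.invoke(
    {"query": "network slicing isolation", "valid_docs": [], "current_plan_step": 0},
    config={"configurable": {"graph": my_neo4j_graph, "cypher_gen_method": "guided", "thread_id": "1"}},
)
print(result["valid_docs"])  # documents that passed the relevance grading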
ki_gen/planner.py ADDED
@@ -0,0 +1,253 @@
1
+ import os
2
+ import re
3
+
4
+ from typing import Annotated
5
+ from typing_extensions import TypedDict
6
+
7
+ from langchain_groq import ChatGroq
8
+ from langchain_openai import ChatOpenAI
9
+ from langchain_core.messages import SystemMessage, HumanMessage
10
+ from langchain_community.graphs import Neo4jGraph
11
+
12
+ from langgraph.graph import StateGraph
13
+ from langgraph.graph import add_messages
14
+
15
+ from ki_gen.prompts import PLAN_GEN_PROMPT, PLAN_MODIFICATION_PROMPT
16
+ from ki_gen.data_retriever import build_data_retriever_graph
17
+ from ki_gen.data_processor import build_data_processor_graph
18
+ from ki_gen.utils import ConfigSchema, State, HumanValidationState, DocProcessorState, DocRetrieverState
19
+
20
+
21
+ ##########################################################################
22
+ ###### NODES DEFINITION ######
23
+ ##########################################################################
24
+
25
+ def validate_node(state: State):
26
+ """
27
+ This node inserts the plan validation prompt.
28
+ """
29
+ prompt = """System : You only need to focus on Key Issues, no need to focus on solutions or stakeholders yet and your plan should be concise.
30
+ If needed, give me an updated plan to follow this instruction. If your plan already follows the instruction just say "My plan is correct"."""
31
+ output = HumanMessage(content=prompt)
32
+ return {"messages" : [output]}
33
+
34
+
35
+ # Wrappers to call LLMs on the state messages field
36
+ def chatbot_llama(state: State):
37
+ llm_llama = ChatGroq(model="llama3-70b-8192")
38
+ return {"messages" : [llm_llama.invoke(state["messages"])]}
39
+
40
+ def chatbot_mixtral(state: State):
41
+ llm_mixtral = ChatGroq(model="mixtral-8x7b-32768")
42
+ return {"messages" : [llm_mixtral.invoke(state["messages"])]}
43
+
44
+ def chatbot_openai(state: State):
45
+ llm_openai = ChatOpenAI(model='gpt-4o', base_url="https://llm.synapse.thalescloud.io/")
46
+ return {"messages" : [llm_openai.invoke(state["messages"])]}
47
+
48
+ chatbots = {"gpt-4o" : chatbot_openai,
49
+ "mixtral-8x7b-32768" : chatbot_mixtral,
50
+ "llama3-70b-8192" : chatbot_llama
51
+ }
52
+
53
+
54
+ def parse_plan(state: State):
55
+ """
56
+ This node parses the generated plan and writes in the 'store_plan' field of the state
57
+ """
58
+ plan = state["messages"][-3].content
59
+ store_plan = re.split(r"\d\.", plan.split("Plan:\n")[1])[1:]
60
+ try:
61
+ store_plan[len(store_plan) - 1] = store_plan[len(store_plan) - 1].split("<END_OF_PLAN>")[0]
62
+ except Exception as e:
63
+ print(f"Error while removing <END_OF_PLAN> : {e}")
64
+
65
+ return {"store_plan" : store_plan}
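A hedged illustration of the message content parse_plan expects; the plan text is invented:

raw_plan = "Plan:\n1. Review the relevant specifications.\n2. Gather recent research papers.\n3. Identify new key issues.\n<END_OF_PLAN>"
# Splitting on "Plan:\n" and then on the step numbers yields one string per step,
# and the trailing <END_OF_PLAN> marker is stripped from the last step.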
66
+
67
+ def detail_step(state: State, config: ConfigSchema):
68
+ """
69
+ This node updates the value of the 'current_plan_step' field and defines the query to be used for the data_retriever.
70
+ """
71
+ print("test")
72
+ print(state)
73
+
74
+ if 'current_plan_step' in state.keys():
75
+ print("all good chief")
76
+ else:
77
+ state["current_plan_step"] = None
78
+
79
+ current_plan_step = state["current_plan_step"] + 1 if state["current_plan_step"] is not None else 0 # We just began a new step so we will increase current_plan_step at the end
80
+ if config["configurable"].get("use_detailed_query"):
81
+ prompt = HumanMessage(f"""Specify what additional information you need to proceed with the next step of your plan :
82
+ Step {current_plan_step + 1} : {state['store_plan'][current_plan_step]}""")
83
+ query = get_detailed_query(context = state["messages"] + [prompt], model=config["configurable"].get("main_llm"))
84
+ return {"messages" : [prompt, query], "current_plan_step": current_plan_step, 'query' : query}
85
+
86
+ return {"current_plan_step": current_plan_step, 'query' : state["store_plan"][current_plan_step], "valid_docs" : []}
87
+
88
+ def get_detailed_query(context : list, model : str = "mixtral-8x7b-32768"):
89
+ """
90
+ Simple helper function for the detail_step node
91
+ """
92
+ if model == 'gpt-4o':
93
+ llm = ChatOpenAI(model=model, base_url="https://llm.synapse.thalescloud.io/")
94
+ else:
95
+ llm = ChatGroq(model=model)
96
+ return llm.invoke(context)
97
+
98
+ def concatenate_data(state: State):
99
+ """
100
+ This node concatenates all the data that was processed by the data_processor and inserts it in the state's messages
101
+ """
102
+ prompt = f"""#########TECHNICAL INFORMATION ############
103
+ {str(state["valid_docs"])}
104
+
105
+ ########END OF TECHNICAL INFORMATION#######
106
+
107
+ Using the information provided above, proceed with step {state['current_plan_step'] + 1} of your plan :
108
+ {state['store_plan'][state['current_plan_step']]}
109
+ """
110
+
111
+ return {"messages": [HumanMessage(content=prompt)]}
112
+
113
+
114
+ def human_validation(state: HumanValidationState) -> HumanValidationState:
115
+ """
116
+ Dummy node used as an interruption point so the user can validate the retrieved documents
117
+ """
118
+ return {'process_steps' : []}
119
+
120
+ def generate_ki(state: State):
121
+ """
122
+ This node inserts the prompt to begin Key Issues generation
123
+ """
124
+ print(f"THIS IS THE STATE FOR CURRENT PLAN STEP IN GENERATE_KI : {state}")
125
+
126
+ prompt = f"""Using the information provided above, proceed with step 4 of your plan to provide the user with NEW and INNOVATIVE Key Issues :
127
+ {state['store_plan'][state['current_plan_step'] + 1]}"""
128
+
129
+ return {"messages" : [HumanMessage(content=prompt)]}
130
+
131
+ def detail_ki(state: State):
132
+ """
133
+ This node inserts the last prompt to detail the generated Key Issues
134
+ """
135
+ prompt = f"""Using the information provided above, proceed with step 5 of your plan to provide the user with NEW and INNOVATIVE Key Issues :
136
+ {state['store_plan'][state['current_plan_step'] + 2]}"""
137
+
138
+ return {"messages" : [HumanMessage(content=prompt)]}
139
+
140
+ ##########################################################################
141
+ ###### CONDITIONAL EDGE FUNCTIONS ######
142
+ ##########################################################################
143
+
144
+ def validate_plan(state: State):
145
+ """
146
+ Whether to regenerate the plan or to parse it
147
+ """
148
+ if "messages" in state and state["messages"][-1].content in ["My plan is correct.","My plan is correct"]:
149
+ return "parse"
150
+ return "validate"
151
+
152
+ def next_plan_step(state: State, config: ConfigSchema):
153
+ """
154
+ Proceed to next plan step (either generate KI or retrieve more data)
155
+ """
156
+ if (state["current_plan_step"] == 2) and (config["configurable"].get('plan_method') == "modification"):
157
+ return "generate_key_issues"
158
+ if state["current_plan_step"] == len(state["store_plan"]) - 1:
159
+ return "generate_key_issues"
160
+ else:
161
+ return "detail_step"
162
+
163
+ def detail_or_data_retriever(state: State, config: ConfigSchema):
164
+ """
165
+ Detail the query to use for data retrieval or not
166
+ """
167
+ if config["configurable"].get("use_detailed_query"):
168
+ return "chatbot_detail"
169
+ else:
170
+ return "data_retriever"
171
+
172
+ def retrieve_or_process(state: State):
173
+ """
174
+ Process the retrieved docs or keep retrieving
175
+ """
176
+ if state['human_validated']:
177
+ return "process"
178
+ return "retrieve"
179
+ # while True:
180
+ # user_input = input(f"{len(state['valid_docs'])} were retreived. Do you want more documents (y/[n]) : ")
181
+ # if user_input.lower() == "y":
182
+ # return "retrieve"
183
+ # if not user_input or user_input.lower() == "n":
184
+ # return "process"
185
+ # print("Please answer with 'y' or 'n'.\n")
186
+
187
+
188
+ def build_planner_graph(memory, config):
189
+ """
190
+ Builds the planner graph
191
+ """
192
+ graph_builder = StateGraph(State)
193
+
194
+ graph_doc_retriever = build_data_retriever_graph(memory)
195
+ graph_doc_processor = build_data_processor_graph(memory)
196
+ graph_builder.add_node("chatbot_planner", chatbots[config["main_llm"]])
197
+ graph_builder.add_node("validate", validate_node)
198
+ graph_builder.add_node("chatbot_detail", chatbot_llama)
199
+ graph_builder.add_node("parse", parse_plan)
200
+ graph_builder.add_node("detail_step", detail_step)
201
+ graph_builder.add_node("data_retriever", graph_doc_retriever, input=DocRetrieverState)
202
+ graph_builder.add_node("human_validation", human_validation)
203
+ graph_builder.add_node("data_processor", graph_doc_processor, input=DocProcessorState)
204
+ graph_builder.add_node("concatenate_data", concatenate_data)
205
+ graph_builder.add_node("chatbot_exec_step", chatbots[config["main_llm"]])
206
+ graph_builder.add_node("generate_ki", generate_ki)
207
+ graph_builder.add_node("chatbot_ki", chatbots[config["main_llm"]])
208
+ graph_builder.add_node("detail_ki", detail_ki)
209
+ graph_builder.add_node("chatbot_final", chatbots[config["main_llm"]])
210
+
211
+ graph_builder.add_edge("validate", "chatbot_planner")
212
+ graph_builder.add_edge("parse", "detail_step")
213
+
214
+
215
+ # graph_builder.add_edge("detail_step", "chatbot2")
216
+ graph_builder.add_edge("chatbot_detail", "data_retriever")
217
+ graph_builder.add_edge("data_retriever", "human_validation")
218
+
219
+
220
+ graph_builder.add_edge("data_processor", "concatenate_data")
221
+ graph_builder.add_edge("concatenate_data", "chatbot_exec_step")
222
+ graph_builder.add_edge("generate_ki", "chatbot_ki")
223
+ graph_builder.add_edge("chatbot_ki", "detail_ki")
224
+ graph_builder.add_edge("detail_ki", "chatbot_final")
225
+ graph_builder.add_edge("chatbot_final", "__end__")
226
+
227
+ graph_builder.add_conditional_edges(
228
+ "detail_step",
229
+ detail_or_data_retriever,
230
+ {"chatbot_detail": "chatbot_detail", "data_retriever": "data_retriever"}
231
+ )
232
+ graph_builder.add_conditional_edges(
233
+ "human_validation",
234
+ retrieve_or_process,
235
+ {"retrieve" : "data_retriever", "process" : "data_processor"}
236
+ )
237
+ graph_builder.add_conditional_edges(
238
+ "chatbot_planner",
239
+ validate_plan,
240
+ {"parse" : "parse", "validate": "validate"}
241
+ )
242
+ graph_builder.add_conditional_edges(
243
+ "chatbot_exec_step",
244
+ next_plan_step,
245
+ {"generate_key_issues" : "generate_ki", "detail_step": "detail_step"}
246
+ )
247
+
248
+ graph_builder.set_entry_point("chatbot_planner")
249
+ graph = graph_builder.compile(
250
+ checkpointer=memory,
251
+ interrupt_after=["parse", "chatbot_exec_step", "chatbot_final", "data_retriever"],
252
+ )
253
+ return graph
ki_gen/prompts.py ADDED
@@ -0,0 +1,155 @@
1
+ from langchain_core.prompts.prompt import PromptTemplate
2
+ from langchain_core.prompts import ChatPromptTemplate
3
+ from langchain_core.messages import SystemMessage, HumanMessage
4
+ from ki_gen.utils import ConfigSchema
5
+
6
+ CYPHER_GENERATION_TEMPLATE = """Task:Generate Cypher statement to query a graph database.
7
+ Instructions:
8
+ Use only the provided relationship types and properties in the schema.
9
+ Do not use any other relationship types or properties that are not provided.
10
+ Schema:
11
+ {schema}
12
+
13
+
14
+ Concepts:
15
+ {concepts}
16
+
17
+
18
+ Concept names can ONLY be selected from the above list
19
+
20
+ Note: Do not include any explanations or apologies in your responses.
21
+ Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.
22
+ Do not include any text except the generated Cypher statement.
23
+
24
+ The question is:
25
+ {question}"""
26
+ CYPHER_GENERATION_PROMPT = PromptTemplate(
27
+ input_variables=["schema", "question", "concepts"], template=CYPHER_GENERATION_TEMPLATE
28
+ )
29
+
30
+ CYPHER_QA_TEMPLATE = """You are an assistant that helps to form nice and human understandable answers.
31
+ The information part contains the provided information that you must use to construct an answer.
32
+ The provided information is authoritative, you must never doubt it or try to use your internal knowledge to correct it.
33
+ Make the answer sound as a response to the question. Do not mention that you based the result on the given information.
34
+ Here is an example:
35
+
36
+ Question: Which managers own Neo4j stocks?
37
+ Context:[manager:CTL LLC, manager:JANE STREET GROUP LLC]
38
+ Helpful Answer: CTL LLC, JANE STREET GROUP LLC owns Neo4j stocks.
39
+
40
+ Follow this example when generating answers.
41
+ If the provided information is empty, say that you don't know the answer.
42
+ Information:
43
+ {context}
44
+
45
+ Question: {question}
46
+ Helpful Answer:"""
47
+ CYPHER_QA_PROMPT = PromptTemplate(
48
+ input_variables=["context", "question"], template=CYPHER_QA_TEMPLATE
49
+ )
50
+
51
+ PLAN_GEN_PROMPT = """System : You are a standardization expert working for 3GPP. You are given a specific technical requirement regarding the deployment of 5G services. Your goal is to specify NEW and INNOVATIVE Key Issues that could occur while trying to fulfill this requirement
52
+
53
+ System : Let's first understand the problem and devise a plan to solve the problem.
54
+ Output the plan starting with the header 'Plan:' and then followed by a numbered list of steps.
55
+ Make the plan the minimum number of steps required to accurately provide the user with NEW and INNOVATIVE Key Issues related to the technical requirement.
56
+ At the end of your plan, say '<END_OF_PLAN>'"""
57
+
58
+ PLAN_MODIFICATION_PROMPT = """You are a standardization expert working for 3GPP. You are given a specific technical requirement regarding the deployment of 5G services. Your goal is to specify NEW and INNOVATIVE Key Issues that could occur while trying to fulfill this requirement.
59
+ To achieve this goal we are going to follow this generic plan :
60
+
61
+ ###PLAN TEMPLATE###
62
+
63
+ Plan:
64
+
65
+ 1. **Understanding the Problem**: Gather information from existing specifications and standards to thoroughly understand the technical requirement. This should help you understand the key aspects of the problem.
66
+ 2. **Gather information about latest innovations** : Gather information about the latest innovations related to the problem by looking at the most relevant research papers.
67
+ 3. **Researching current challenges** : Research the current challenges in this area by looking at the existing similar key issues that have been identified by 3GPP.
68
+ 4. **Identifying NEW and INNOVATIVE Key Issues**: Based on the understanding of the problem and the current challenges, identify new and innovative key issues that could occur while trying to fulfill this requirement. These key issues should be relevant, significant, and not yet addressed by existing solutions.
69
+ 5. **Develop Detailed Descriptions for Each Key Issue**: For each identified key issue, provide a detailed description, including the specific challenges and areas requiring further study.
70
+
71
+ <END_OF_PLAN>
72
+
73
+ ###END OF PLAN TEMPLATE###
74
+
75
+ Let's devise a plan to solve the problem by adapting the PLAN TEMPLATE.
76
+ Output the plan starting with the header 'Plan:' and then followed by a numbered list of steps.
77
+ Make the plan the minimum number of steps required to accurately provide the user with NEW and INNOVATIVE Key Issues related to the technical requirement.
78
+ At the end of your plan, say '<END_OF_PLAN>' """
79
+
80
+ CONCEPT_SELECTION_TEMPLATE = """Task: Select the most relevant topic to the user question
81
+ Instructions:
82
+ Select the most relevant Concept to the user's question.
83
+ Concepts can ONLY be selected from the list below.
84
+
85
+ Concepts:
86
+ {concepts}
87
+
88
+ Note: Do not include any explanations or apologies in your responses.
89
+ Do not include any text except the selected concept.
90
+
91
+ The question is:
92
+ {question}"""
93
+ CONCEPT_SELECTION_PROMPT = PromptTemplate(
94
+ input_variables=["concepts", "question"], template=CONCEPT_SELECTION_TEMPLATE
95
+ )
96
+
97
+ RELEVANT_CONCEPTS_TEMPLATE = """
98
+ ## CONCEPTS ##
99
+ {concepts}
100
+ ## END OF CONCEPTS ##
101
+
102
+ Select the 20 most relevant concepts to the user query.
103
+ Output your answer as a numbered list preceded by the header 'Concepts:'.
104
+
105
+ User query :
106
+ {user_query}
107
+ """
108
+ RELEVANT_CONCEPTS_PROMPT = ChatPromptTemplate.from_messages([
109
+ ("human", RELEVANT_CONCEPTS_TEMPLATE)
110
+ ])
111
+
112
+ SUMMARIZER_TEMPLATE = """You are a 3GPP standardization expert.
113
+ Summarize the provided document in simple technical English for other experts in the field.
114
+
115
+ Document:
116
+ {document}"""
117
+ SUMMARIZER_PROMPT = ChatPromptTemplate.from_messages([
118
+ ("system", SUMMARIZER_TEMPLATE)
119
+ ])
120
+
121
+
122
+ BINARY_GRADER_TEMPLATE = """You are a grader assessing relevance of a retrieved document to a user question. \n
123
+ It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
124
+ If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n
125
+ Give a binary 'yes' or 'no' score to indicate whether the document is relevant to the question."""
126
+ BINARY_GRADER_PROMPT = ChatPromptTemplate.from_messages(
127
+ [
128
+ ("system", BINARY_GRADER_TEMPLATE),
129
+ ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
130
+ ]
131
+ )
132
+
133
+
134
+ SCORE_GRADER_TEMPLATE = """Grasp and understand both the query and the document before score generation.
135
+ Then, based on your understanding and analysis quantify the relevance between the document and the query.
136
+ Give the rationale before answering.
137
+ Output your answer as a score ranging between 0 (irrelevant document) and 1 (completely relevant document)"""
138
+
139
+ SCORE_GRADER_PROMPT = ChatPromptTemplate.from_messages(
140
+ [
141
+ ("system", SCORE_GRADER_TEMPLATE),
142
+ ("human", "Passage: \n\n {document} \n\n User query: {query}")
143
+ ]
144
+ )
145
+
146
+ def get_initial_prompt(config: ConfigSchema, user_query : str):
147
+ if config["configurable"].get("plan_method") == "generation":
148
+ prompt = PLAN_GEN_PROMPT
149
+ elif config["configurable"].get("plan_method") == "modification":
150
+ prompt = PLAN_MODIFICATION_PROMPT
151
+ else:
152
+ raise ValueError("Incorrect plan_method, should be 'generation' or 'modification'")
153
+
154
+ user_input = user_query or input("User :")
155
+ return {"messages" : [SystemMessage(content=prompt), HumanMessage(content=user_input)]}
ki_gen/utils.py ADDED
@@ -0,0 +1,155 @@
1
+ import os
2
+ import getpass
3
+ import html
4
+
5
+
6
+ from typing import Annotated, Union
7
+ from typing_extensions import TypedDict
8
+
9
+ from langchain_community.graphs import Neo4jGraph
10
+ from langchain_groq import ChatGroq
11
+ from langchain_openai import ChatOpenAI
12
+
13
+ from langgraph.checkpoint.sqlite import SqliteSaver
14
+ from langgraph.checkpoint import base
15
+ from langgraph.graph import add_messages
16
+
17
+ with SqliteSaver.from_conn_string(":memory:") as mem :
18
+ memory = mem
19
+
20
+
21
+ def format_df(df):
22
+ """
23
+ Used to display the generated plan in a nice format
24
+ Returns html code in a string
25
+ """
26
+ def format_cell(cell):
27
+ if isinstance(cell, str):
28
+ # Encode special characters, but preserve line breaks
29
+ return html.escape(cell).replace('\n', '<br>')
30
+ return cell
31
+ # Convert the DataFrame to HTML with custom CSS
32
+ formatted_df = df.map(format_cell)
33
+ html_table = formatted_df.to_html(escape=False, index=False)
34
+
35
+ # Add custom CSS to allow multiple lines and scrolling in cells
36
+ css = """
37
+ <style>
38
+ table {
39
+ border-collapse: collapse;
40
+ width: 100%;
41
+ }
42
+ th, td {
43
+ border: 1px solid black;
44
+ padding: 8px;
45
+ text-align: left;
46
+ vertical-align: top;
47
+ white-space: pre-wrap;
48
+ max-width: 300px;
49
+ max-height: 100px;
50
+ overflow-y: auto;
51
+ }
52
+ th {
53
+ background-color: #f2f2f2;
54
+ }
55
+ </style>
56
+ """
57
+
58
+ return css + html_table
59
+
60
+ def format_doc(doc: dict) -> str :
61
+ formatted_string = ""
62
+ for key in doc:
63
+ formatted_string += f"**{key}**: {doc[key]}\n"
64
+ return formatted_string
65
+
66
+
67
+
68
+ def _set_env(var: str, value: str = None):
69
+ if not os.environ.get(var):
70
+ if value:
71
+ os.environ[var] = value
72
+ else:
73
+ os.environ[var] = getpass.getpass(f"{var}: ")
74
+
75
+
76
+ def init_app(openai_key : str = None, groq_key : str = None, langsmith_key : str = None):
77
+ """
78
+ Initialize app with user api keys and sets up proxy settings
79
+ """
80
+ _set_env("GROQ_API_KEY", value=groq_key)
81
+ _set_env("LANGSMITH_API_KEY", value=langsmith_key)
82
+ _set_env("OPENAI_API_KEY", value=openai_key)
83
+ os.environ["LANGSMITH_TRACING_V2"] = "true"
84
+ os.environ["LANGCHAIN_PROJECT"] = "3GPP Test"
85
+ os.environ["http_proxy"] = "185.46.212.98:80"
86
+ os.environ["https_proxy"] = "185.46.212.98:80"
87
+ os.environ["NO_PROXY"] = "thalescloud.io"
88
+
89
+ def clear_memory(memory, thread_id: str) -> None:
90
+ """
91
+ Clears checkpointer state for a given thread_id, broken for now
92
+ TODO : fix this
93
+ """
94
+ with SqliteSaver.from_conn_string(":memory:") as mem :
95
+ memory = mem
96
+ checkpoint = base.empty_checkpoint()
97
+ memory.put(config={"configurable": {"thread_id": thread_id}}, checkpoint=checkpoint, metadata={})
98
+
99
+ def get_model(model : str = "mixtral-8x7b-32768"):
100
+ """
101
+ Wrapper to return the correct llm object depending on the 'model' param
102
+ """
103
+ if model == "gpt-4o":
104
+ llm = ChatOpenAI(model=model, base_url="https://llm.synapse.thalescloud.io/")
105
+ else:
106
+ llm = ChatGroq(model=model)
107
+ return llm
108
+
109
+
110
+ class ConfigSchema(TypedDict):
111
+ graph: Neo4jGraph
112
+ plan_method: str
113
+ use_detailed_query: bool
114
+
115
+ class State(TypedDict):
116
+ messages : Annotated[list, add_messages]
117
+ store_plan : list[str]
118
+ current_plan_step : int
119
+ valid_docs : list[str]
120
+
121
+ class DocRetrieverState(TypedDict):
122
+ messages: Annotated[list, add_messages]
123
+ query: str
124
+ docs: list[dict]
125
+ cyphers: list[str]
126
+ current_plan_step : int
127
+ valid_docs: list[Union[str, dict]]
128
+
129
+ class HumanValidationState(TypedDict):
130
+ human_validated : bool
131
+ process_steps : list[str]
132
+
133
+ def update_doc_history(left : list | None, right : list | None) -> list:
134
+ """
135
+ Reducer for the 'docs_in_processing' field.
136
+ Doesn't work currently because of bad handling of duplicates
137
+ TODO : make this work (reference : https://langchain-ai.github.io/langgraph/how-tos/subgraph/#custom-reducer-functions-to-manage-state)
138
+ """
139
+ if not left:
140
+ # This shouldn't happen
141
+ left = [[]]
142
+ if not right:
143
+ right = []
144
+
145
+ for i in range(len(right)):
146
+ left[i].append(right[i])
147
+ return left
148
+
149
+
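A small sketch of the intended reducer behaviour (values invented); each inner list tracks the processing history of one document:

# left  = [["docA"], ["docB"]]
# right = ["docA summary", "docB summary"]
# update_doc_history(left, right) -> [["docA", "docA summary"], ["docB", "docB summary"]]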
150
+ class DocProcessorState(TypedDict):
151
+ valid_docs : list[Union[str, dict]]
152
+ docs_in_processing : list
153
+ process_steps : list[Union[str,dict]]
154
+ current_process_step : int
155
+