# kig_core/graph_operations.py
import re
import logging
import time
from typing import List, Dict, Any, Optional, Tuple
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_core.runnables import Runnable, RunnablePassthrough
from langchain_core.pydantic_v1 import Field, BaseModel as V1BaseModel # For grader models if needed
from .config import settings
from .graph_client import neo4j_client # Use the central client
from .llm_interface import get_llm, invoke_llm
from .prompts import (
    CYPHER_GENERATION_PROMPT, CONCEPT_SELECTION_PROMPT,
    BINARY_GRADER_PROMPT, SCORE_GRADER_PROMPT
)
from .schemas import KeyIssue  # Import if needed here, maybe not

logger = logging.getLogger(__name__)


# --- Helper Functions ---

def extract_cypher(text: str) -> str:
    """Extracts the first Cypher code block or returns the text itself."""
    pattern = r"```(?:cypher)?\s*(.*?)\s*```"
    match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
    return match.group(1).strip() if match else text.strip()
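
# Illustrative behavior (hypothetical inputs, not from the original module):
#   extract_cypher("```cypher\nMATCH (n) RETURN n\n```")  ->  "MATCH (n) RETURN n"
#   extract_cypher("MATCH (n) RETURN n")                   ->  "MATCH (n) RETURN n"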


def format_doc_for_llm(doc: Dict[str, Any]) -> str:
    """Formats a document dictionary into a string for LLM context."""
    return "\n".join(f"**{key}**: {value}" for key, value in doc.items() if value)


# --- Cypher Generation ---

def generate_cypher_auto(question: str) -> str:
    """Generates Cypher using the 'auto' method."""
    logger.info("Generating Cypher using 'auto' method.")
    # Schema fetching needs implementation if required by the prompt/LLM
    # schema_info = neo4j_client.get_schema() # Placeholder
    schema_info = "Schema not available."  # Default if not implemented
    cypher_llm = get_llm(settings.main_llm_model)  # Or a specific cypher model
    chain = (
        {"question": RunnablePassthrough(), "schema": lambda x: schema_info}
        | CYPHER_GENERATION_PROMPT
        | cypher_llm
        | StrOutputParser()
        | extract_cypher
    )
    return invoke_llm(chain, question)
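

# A minimal sketch of the schema fetch hinted at above (not part of the original module).
# It assumes neo4j_client.query() accepts a plain Cypher string and that the standard
# Neo4j introspection procedures db.labels() and db.relationshipTypes() are available;
# adapt to whatever the central client actually exposes.
def _fetch_schema_summary() -> str:
    """Returns a rough, human-readable schema summary for prompt context."""
    try:
        labels = [row["label"] for row in neo4j_client.query(
            "CALL db.labels() YIELD label RETURN label")]
        rel_types = [row["relationshipType"] for row in neo4j_client.query(
            "CALL db.relationshipTypes() YIELD relationshipType RETURN relationshipType")]
        return f"Node labels: {', '.join(labels)}\nRelationship types: {', '.join(rel_types)}"
    except Exception as e:
        # Fall back to the same placeholder used by generate_cypher_auto
        logger.warning(f"Schema introspection failed: {e}")
        return "Schema not available."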


def generate_cypher_guided(question: str, plan_step: int) -> str:
    """Generates Cypher using the 'guided' method based on concepts."""
    logger.info(f"Generating Cypher using 'guided' method for plan step {plan_step}.")
    try:
        concepts = neo4j_client.get_concepts()
        if not concepts:
            logger.warning("No concepts found in Neo4j for guided cypher generation.")
            return ""  # Or raise error

        concept_llm = get_llm(settings.main_llm_model)  # Or a specific concept model
        concept_chain = (
            CONCEPT_SELECTION_PROMPT
            | concept_llm
            | StrOutputParser()
        )
        selected_concept = invoke_llm(concept_chain, {
            "question": question,
            "concepts": "\n".join(concepts)
        }).strip()
        logger.info(f"Concept selected by LLM: {selected_concept}")

        # Basic check that the selected concept is valid
        if selected_concept not in concepts:
            logger.warning(f"LLM selected concept '{selected_concept}' not in the known list. Attempting fallback or ignoring.")
            # Optional: add fuzzy matching or similarity search here.
            # For now, fall back to a simple case-insensitive substring check.
            found_match = None
            for c in concepts:
                if selected_concept.lower() in c.lower():
                    found_match = c
                    logger.info(f"Found potential match: '{found_match}'")
                    break
            if not found_match:
                logger.error(f"Could not validate selected concept: {selected_concept}")
                return ""  # Return empty query if concept is invalid
            selected_concept = found_match

        # Determine the target node type based on plan step (example logic).
        # This mapping might need adjustment based on the actual plan structure.
        if plan_step <= 1:  # Steps 0 and 1: context gathering
            target = "(ts:TechnicalSpecification)"
            fields = "ts.title, ts.scope, ts.description"
        elif plan_step == 2:  # Step 2: research papers
            target = "(rp:ResearchPaper)"
            fields = "rp.title, rp.abstract"
        else:  # Later steps might involve KeyIssues themselves or other types
            target = "(n)"  # Generic fallback
            fields = "n.title, n.description"  # Assuming common fields

        # Construct the Cypher query. Parameter binding would be safer, e.g.
        #   MATCH (c:Concept {name: $conceptName})-[:RELATED_TO]-<target> RETURN <fields>
        # but the planner currently expects a plain query string, so the concept name is
        # inlined with basic escaping. Be cautious about injection if concept names can
        # contain special characters.
        escaped_concept = selected_concept.replace("'", "\\'")  # Basic escaping
        cypher = f"MATCH (c:Concept {{name: '{escaped_concept}'}})-[:RELATED_TO]-{target} RETURN {fields}"
        logger.info(f"Generated guided Cypher: {cypher}")
        return cypher

    except Exception as e:
        logger.error(f"Error during guided cypher generation: {e}", exc_info=True)
        time.sleep(60)  # Crude backoff on failure
        return ""  # Return empty on error


# --- Document Retrieval ---

def retrieve_documents(cypher_query: str) -> List[Dict[str, Any]]:
    """Retrieves documents from Neo4j using a Cypher query."""
    if not cypher_query:
        logger.warning("Received empty Cypher query, skipping retrieval.")
        return []

    logger.info(f"Retrieving documents with Cypher: {cypher_query} limit 10")
    try:
        # Use the centralized client's query method
        raw_results = neo4j_client.query(cypher_query + " limit 10")

        # Basic cleaning/deduplication (can be enhanced)
        processed_results = []
        seen = set()
        for doc in raw_results:
            # Create a frozenset of items for hashable representation to detect duplicates
            doc_items = frozenset(doc.items())
            if doc_items not in seen:
                processed_results.append(doc)
                seen.add(doc_items)

        logger.info(f"Retrieved {len(processed_results)} unique documents.")
        return processed_results
    except (ConnectionError, ValueError, RuntimeError) as e:
        # Errors already logged in neo4j_client
        logger.error(f"Document retrieval failed: {e}")
        return []  # Return empty list on failure


# --- Document Evaluation ---

# Define Pydantic models for structured LLM grader output (if not using built-in LCEL structured output)
class GradeDocumentsBinary(V1BaseModel):
    """Binary score for relevance check."""
    binary_score: str = Field(description="Relevant? 'yes' or 'no'")


class GradeDocumentsScore(V1BaseModel):
    """Score for relevance check."""
    rationale: str = Field(description="Rationale for the score.")
    score: float = Field(description="Relevance score (0.0 to 1.0)")
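
# Sketch of how these models could drive LCEL structured output instead of string/JSON
# parsing; assumes the configured chat model supports .with_structured_output(). Not
# wired in; evaluate_documents below currently parses plain strings and JSON.
#   binary_grader = BINARY_GRADER_PROMPT | eval_llm.with_structured_output(GradeDocumentsBinary)
#   grade = binary_grader.invoke({"question": query, "document": formatted_doc})
#   is_relevant = grade.binary_score.strip().lower() == "yes"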


def evaluate_documents(
    docs: List[Dict[str, Any]],
    query: str
) -> List[Dict[str, Any]]:
    """Evaluates document relevance to a query using the configured method."""
    if not docs:
        return []

    logger.info(f"Evaluating {len(docs)} documents for relevance to query: '{query}' using method: {settings.eval_method}")
    eval_llm = get_llm(settings.eval_llm_model)
    valid_docs_with_scores: List[Tuple[Dict[str, Any], float]] = []

    # Consider using LCEL's structured output capabilities directly if the model supports it well.
    # This simplifies parsing. Example for binary:
    # binary_grader = BINARY_GRADER_PROMPT | eval_llm.with_structured_output(GradeDocumentsBinary)
    if settings.eval_method == "binary":
        binary_grader = BINARY_GRADER_PROMPT | eval_llm | StrOutputParser()  # Fallback to string parsing
        for doc in docs:
            formatted_doc = format_doc_for_llm(doc)
            if not formatted_doc.strip():
                continue
            try:
                result = invoke_llm(binary_grader, {"question": query, "document": formatted_doc})
                logger.debug(f"Binary grader result for doc '{doc.get('title', 'N/A')}': {result}")
                if result and 'yes' in result.lower():
                    valid_docs_with_scores.append((doc, 1.0))  # Score 1.0 for relevant
            except Exception as e:
                logger.warning(f"Binary grading failed for a document: {e}", exc_info=True)

    elif settings.eval_method == "score":
        # Using JSON parser as a robust fallback for score extraction
        score_grader = SCORE_GRADER_PROMPT | eval_llm | JsonOutputParser(pydantic_object=GradeDocumentsScore)
        for doc in docs:
            formatted_doc = format_doc_for_llm(doc)
            if not formatted_doc.strip():
                continue
            try:
                # JsonOutputParser yields a plain dict, so read the graded fields by key
                result = invoke_llm(score_grader, {"query": query, "document": formatted_doc})
                score = float(result.get("score", 0.0))
                rationale = result.get("rationale", "")
                logger.debug(f"Score grader result for doc '{doc.get('title', 'N/A')}': Score={score}, Rationale={rationale}")
                if score >= settings.eval_threshold:
                    valid_docs_with_scores.append((doc, score))
            except Exception as e:
                logger.warning(f"Score grading failed for a document: {e}", exc_info=True)
                # Optionally treat as relevant on failure? Or skip? Skipping for now.

    # Sort by score if applicable, then limit to max_docs
    if settings.eval_method == 'score':
        valid_docs_with_scores.sort(key=lambda item: item[1], reverse=True)
    final_docs = [doc for doc, _score in valid_docs_with_scores[:settings.max_docs]]
    logger.info(f"Found {len(final_docs)} relevant documents after evaluation and filtering.")
    return final_docs
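

# Minimal manual smoke test (illustrative only, not part of the original module). It
# assumes settings, the Neo4j connection, and the LLM backends are already configured;
# the question and plan step below are placeholders.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    sample_question = "Which technical specifications relate to network slicing?"  # Hypothetical
    cypher = generate_cypher_guided(sample_question, plan_step=0)
    documents = retrieve_documents(cypher)
    relevant = evaluate_documents(documents, sample_question)
    for d in relevant:
        print(format_doc_for_llm(d))
        print("---")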