File size: 14,372 Bytes
7f683f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
# analysis_service_anthropic.py
"""Async paragraph-relevance analysis backed by the Anthropic Claude API.

At import time this module reads ANTHROPIC_API_KEY from the environment
and, when present, constructs a module-level AsyncAnthropic client that
analyze_source_relevance_async() uses. When the key is missing or client
construction fails, `client` stays None and check_analyzer_status()
reports the service as unavailable.
"""

import anthropic
import os
import json
import re
import traceback

# --- Configuration ---
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")
# --- MODEL NAME CHANGED TO SPECIFIC VERSION, LATEST AS COMMENT ---
# Use the specific dated version for potentially more stable results in production.
# 'latest' alias points to this or newer snapshots.
# Original: ANALYSIS_MODEL = "claude-3-7-sonnet-latest"
ANALYSIS_MODEL = "claude-3-7-sonnet-20250219" # Or use "claude-3-7-sonnet-latest"
# --- End Configuration ---

# Module-level client: None means "not configured / init failed".
client = None
if ANTHROPIC_API_KEY:
    try:
        # Use the specific model name in initialization log
        client = anthropic.AsyncAnthropic(api_key=ANTHROPIC_API_KEY)
        print(
            f"Anthropic ASYNC client initialized for analysis (Model: {ANALYSIS_MODEL})."
        )
    except Exception as e:
        print(f"Error initializing Anthropic ASYNC client for analysis: {e}")
else:
    print("ANTHROPIC_API_KEY not found. Analysis service will not function.")

# --- Helper functions (clean_json_string, check_analyzer_status, clean_source_text) ---
def clean_json_string(json_string):
    """Clean common LLM artifacts from a candidate JSON string.

    Removes trailing commas before closing brackets/braces and strips
    surrounding markdown code fences, then trims whitespace.

    Args:
        json_string (str): Raw text that should contain JSON. Any
            non-string input yields "".

    Returns:
        str: The cleaned string (possibly empty).
    """
    if not isinstance(json_string, str):
        return ""
    # Remove trailing commas before closing brackets/braces
    cleaned = re.sub(r",(\s*[}\]])", r"\1", json_string)
    # Remove markdown code block fences. The opener may be either
    # "```json" or a bare "```"; the old pattern only handled the former,
    # leaving a bare fence in place and breaking the downstream
    # startswith("{") check.
    cleaned = re.sub(r"^\s*```(?:json)?\s*", "", cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r"\s*```$", "", cleaned)
    return cleaned.strip()

def check_analyzer_status():
    """Report whether the analysis service can accept requests.

    Returns:
        tuple[bool, str]: A readiness flag and a human-readable status
        message (the ready message includes the configured model name).
    """
    if client is None:
        return False, "Anthropic client not initialized (check API key)."
    return True, f"Analysis service ready (Model: {ANALYSIS_MODEL})." # Show model in status

def clean_source_text(text):
    """Normalize raw source text before sending it for analysis.

    Removes @<number> reference markers, <HAL> tags, and HTML <br>
    line breaks, then collapses all whitespace runs into single spaces.

    Args:
        text (str): The raw paragraph text; falsy input yields "".

    Returns:
        str: The cleaned, single-spaced text.
    """
    if not text:
        return ""
    cleaned = re.sub(r'@\d+', '', text)
    cleaned = re.sub(r'<HAL>', '', cleaned, flags=re.IGNORECASE)
    # One case-insensitive pattern covers <br>, <br/>, <br /> etc.;
    # the old literal .replace() calls missed <br/> and uppercase forms.
    cleaned = re.sub(r'<br\s*/?>', ' ', cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned
# --- End Helper Functions ---


def _extract_json_payload(raw_response_text):
    """Extract and clean a JSON object string from a raw model response.

    Looks for the outermost {...} span first; if none is found, falls
    back to cleaning the whole response (less reliable).

    Args:
        raw_response_text (str): The full text returned by the model.

    Returns:
        str or None: A cleaned string that starts with "{" and ends with
        "}", or None when no plausible JSON structure survives cleaning.
    """
    json_match = re.search(r"\{.*\}", raw_response_text, re.DOTALL)
    if json_match:
        # Further clean the extracted block (fences, trailing commas).
        json_to_parse = clean_json_string(json_match.group(0))
        print(f"  -- Extracted JSON block: {json_to_parse[:100]}...")
    else:
        json_to_parse = clean_json_string(raw_response_text)
        print(f"  -- No JSON block found, attempting parse on cleaned full response: {json_to_parse[:100]}...")

    if not json_to_parse or not json_to_parse.startswith("{") or not json_to_parse.endswith("}"):
        print(f"  <- Analysis failed: Could not extract valid JSON structure after cleaning. Cleaned data: '{json_to_parse[:100]}...'")
        # Log more context on failure
        print(f"  -- Original raw response was: {raw_response_text}")
        return None
    return json_to_parse


def _validate_analysis_structure(analysis_result):
    """Return True when the parsed analysis dict matches the expected schema.

    Expected shape (checked field-by-field so failures log precisely):
      relevance:  {is_relevant: bool, relevance_score: int, explanation: str}
      headline:   {hebrew: str}
      conclusion: {hebrew: str}

    Args:
        analysis_result: The object produced by json.loads().

    Returns:
        bool: True when the structure and types are all correct.
    """
    if not isinstance(analysis_result, dict):
        print(f"  <- Analysis failed: Parsed result is not a dictionary. Type: {type(analysis_result)}")
        return False

    # Check top-level keys
    if not all(key in analysis_result for key in ['relevance', 'headline', 'conclusion']):
        print(f"  <- Analysis failed: Missing top-level keys. Found: {list(analysis_result.keys())}")
        return False

    # Check nested structure and types
    relevance_data = analysis_result.get('relevance')
    headline_data = analysis_result.get('headline')
    conclusion_data = analysis_result.get('conclusion')

    if not isinstance(relevance_data, dict) or \
       not all(k in relevance_data for k in ['is_relevant', 'relevance_score', 'explanation']) or \
       not isinstance(relevance_data.get('is_relevant'), bool) or \
       not isinstance(relevance_data.get('relevance_score'), int) or \
       not isinstance(relevance_data.get('explanation'), str):
        print(f"  <- Analysis failed: Incorrect structure or types in 'relevance' field. Data: {relevance_data}")
        return False

    if not isinstance(headline_data, dict) or \
       'hebrew' not in headline_data or \
       not isinstance(headline_data.get('hebrew'), str):
        print(f"  <- Analysis failed: Incorrect structure or types in 'headline' field. Data: {headline_data}")
        return False

    if not isinstance(conclusion_data, dict) or \
       'hebrew' not in conclusion_data or \
       not isinstance(conclusion_data.get('hebrew'), str):
        print(f"  <- Analysis failed: Incorrect structure or types in 'conclusion' field. Data: {conclusion_data}")
        return False

    return True


async def analyze_source_relevance_async(paragraph_hebrew, paragraph_english,
                                         user_question):
    """
    Analyzes a Hebrew text paragraph for relevance to a user's question using Anthropic.

    Args:
        paragraph_hebrew (str): The Hebrew text snippet to analyze.
        paragraph_english (str): The English translation (currently unused but kept
            for signature consistency).
        user_question (str): The user's question in Hebrew.

    Returns:
        dict or None: The validated analysis result (relevance score, headline,
        conclusion) on success; None on any configuration, API, parsing, or
        validation failure.
    """
    ready, msg = check_analyzer_status()
    if not ready or client is None:
        print(f"Analyzer not ready: {msg}")
        return None

    # Coerce inputs to strings so slicing/formatting below is always safe.
    paragraph_hebrew = str(paragraph_hebrew) if paragraph_hebrew is not None else ""
    user_question = str(user_question) if user_question is not None else ""

    if not paragraph_hebrew or not user_question:
        print(
            "Warning: Missing Hebrew paragraph or user question for analysis.")
        # Callers treat None as "no analysis available" for this paragraph.
        return None

    cleaned_hebrew = clean_source_text(paragraph_hebrew)
    cleaned_snippet = cleaned_hebrew[:60].replace('\n', ' ')

    # System prompt pins the model to a strict JSON-only output contract
    # matching _validate_analysis_structure().
    system_prompt = """You are an expert analyst specializing in Chassidic texts, particularly the works of the Satmar Rebbe, Rabbi Yoel Teitelbaum (Divrei Yoel). Your task is to evaluate a single Hebrew paragraph provided by the user based *only* on its relevance to the user's specific Hebrew question.

You MUST output your analysis STRICTLY as a single, valid JSON object, with no other text before or after the JSON structure.

The JSON object must have the following structure:

{
  "relevance": {
    "is_relevant": boolean, // True if the paragraph directly discusses or provides significant information related to the user's question. False otherwise.
    "relevance_score": integer, // A score from 1 (completely irrelevant) to 10 (directly and fully answers a key aspect of the question). Assess based *only* on the provided paragraph content.
    "explanation": string // A concise explanation IN HEBREW justifying the score and relevance assessment, referring only to the content of the paragraph. Explain *why* it is or is not relevant.
  },
  "headline": {
    "hebrew": string // A very brief (3-7 words) headline IN HEBREW summarizing the paragraph's main topic *as it relates to the question*. If irrelevant, summarize the paragraph's general topic.
  },
  "conclusion": {
    "hebrew": string // A single sentence IN HEBREW summarizing the key takeaway or information *from the paragraph* that is relevant to the question. If the paragraph is irrelevant, state clearly in Hebrew that it does not address the question (e.g., "הפסקה אינה עוסקת בשאלה.").
  }
}

Base your entire analysis SOLELY on the Hebrew text paragraph provided. Do not use external knowledge. Ensure the output is valid JSON.
"""

    # User turn carries the question plus the cleaned paragraph wrapped in
    # <paragraph> tags so the model can't confuse the two.
    user_message_content = f"""Please analyze the following Hebrew text passage based *only* on its content and relevance to the specific Hebrew question provided below. Adhere strictly to the JSON output format specified in the system prompt.

**User Question (Hebrew):**
{user_question}

**Hebrew Text Passage to Analyze:**
<paragraph>
{cleaned_hebrew}
</paragraph>
"""

    print(
        f"  -> Sending cleaned paragraph (Snippet: '{cleaned_snippet}...') for Claude analysis (Model: {ANALYSIS_MODEL}) regarding question: '{user_question[:60]}...'"
    )

    try:
        # --- API Call (Messages API format) ---
        message = await client.messages.create(
            model=ANALYSIS_MODEL,
            max_tokens=1024,               # Upper bound on the expected JSON response size.
            system=system_prompt,
            messages=[{
                "role": "user",
                "content": user_message_content
            }],
            temperature=0.1,               # Lower temperature for deterministic analysis
            # thinking={"type": "enabled", "budget_tokens": 16000}  # Enable if complex analysis needs extended reasoning.
        )

        # --- Process the response ---
        if not message or not message.content or not isinstance(message.content, list) or not message.content:
            print(
                f"  <- Analysis failed: Invalid message object or empty content list received from API for snippet '{cleaned_snippet}...'. Message: {message}"
            )
            return None

        first_block = message.content[0]
        if not first_block or not hasattr(first_block, 'text') or not first_block.text:
            print(
                f"  <- Analysis failed: First content block is invalid or has no text for snippet '{cleaned_snippet}...'. First block: {first_block}"
            )
            return None

        raw_response_text = first_block.text
        print(f"  <- Raw analysis response snippet: {raw_response_text[:200]}...") # Log raw response

        # --- JSON extraction, parsing, and schema validation ---
        json_to_parse = _extract_json_payload(raw_response_text)
        if json_to_parse is None:
            return None

        try:
            analysis_result = json.loads(json_to_parse)
        except json.JSONDecodeError as json_err:
            print(f"  <- Analysis failed: JSONDecodeError - {json_err}. Problematic JSON string (cleaned): '{json_to_parse}'")
            # Log more context on failure
            print(f"  -- Original raw response was: {raw_response_text}")
            return None

        if not _validate_analysis_structure(analysis_result):
            return None

        # If all checks pass
        print(
            f"  <- Analysis successful for snippet '{cleaned_snippet}...'. Score: {analysis_result.get('relevance', {}).get('relevance_score', 'N/A')}"
        )
        return analysis_result

    # --- Error Handling (library exception hierarchy) ---
    except anthropic.APIStatusError as e:
        print(
            f"  <- Anthropic API Status Error (Analysis): Status={e.status_code} Response={e.response} for snippet '{cleaned_snippet}...'. Model: {ANALYSIS_MODEL}"
        )
        if e.status_code == 400:
            print(
                f"  <- NOTE: 400 Bad Request. Possible causes: Model name '{ANALYSIS_MODEL}' invalid/unavailable OR API parameters incorrect OR input/output token limits exceeded."
            )
        # Log relevant parts of the request if possible (be careful with sensitive data)
        print(f"  -- Failing request details: Question='{user_question[:60]}...', Paragraph Snippet='{cleaned_snippet}...', System Prompt Length={len(system_prompt)}, User Content Length={len(user_message_content)}")
        return None
    except Exception as e:
        print(
            f"  <- Unexpected error during Claude analysis API call ({type(e).__name__}) for snippet '{cleaned_snippet}...': {e}"
        )
        traceback.print_exc()
        # Log relevant parts of the request if possible
        print(f"  -- Failing request details: Question='{user_question[:60]}...', Paragraph Snippet='{cleaned_snippet}...', System Prompt Length={len(system_prompt)}, User Content Length={len(user_message_content)}")
        return None

# --- Example Usage (No changes needed) ---
# Consider adding a small async test function here if needed
# import asyncio
# async def main():
#     test_q = "מהי חשיבות השמחה בעבודת ה'?"
#     test_p = "ועיקר עבודת ה' היא בשמחה, כמו שכתוב 'עבדו את ה' בשמחה', כי השמחה פותחת הלב ומאירה הנשמה, ומביאה לידי דביקות בהשי\"ת. ועל ידי העצבות ח\"ו נסתם הלב ואינו יכול לקבל אור הקדושה."
#     result = await analyze_source_relevance_async(test_p, "", test_q)
#     print("\n--- Test Analysis Result ---")
#     print(json.dumps(result, indent=2, ensure_ascii=False))
# if __name__ == "__main__":
#     if ANTHROPIC_API_KEY:
#         asyncio.run(main())
#     else:
#         print("Cannot run test: ANTHROPIC_API_KEY not set.")