import os
import json

from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()

# --------------------------------------------------------------------------------
# PROMPTS & CLIENT UTILS
# --------------------------------------------------------------------------------

COVERAGE_PROMPT = '''
Here is an academic survey about the topic "[TOPIC]":
---
[SURVEY]
---

Please evaluate this survey about the topic "[TOPIC]" based on the criterion provided below and give a score from 1 to 5 according to the score description:
---
Criterion Description: Coverage assesses the extent to which the survey encapsulates all relevant aspects of the topic.
---
Score 1 Description: The survey has very limited coverage, only touching on a small portion of the topic and lacking discussion on key areas.
Score 2 Description: The survey covers some parts of the topic but has noticeable omissions, with significant areas either underrepresented or missing.
Score 3 Description: The survey is generally comprehensive but still misses a few key points.
Score 4 Description: The survey covers most key areas comprehensively, with only very minor topics left out.
Score 5 Description: The survey comprehensively covers all key and peripheral topics, providing detailed discussions and extensive information.
---
Return the score without any other information:
'''

STRUCTURE_PROMPT = '''
Here is an academic survey about the topic "[TOPIC]":
---
[SURVEY]
---

Please evaluate this survey about the topic "[TOPIC]" based on the criterion provided below and give a score from 1 to 5 according to the score description:
---
Criterion Description: Structure evaluates the logical organization and coherence of sections and subsections.
---
Score 1 Description: The survey lacks logic, with no clear connections between sections.
Score 2 Description: The survey has weak logical flow with some disordered content.
Score 3 Description: The survey has a generally reasonable logical structure.
Score 4 Description: The survey has good logical consistency, with content well arranged.
Score 5 Description: The survey is tightly structured and logically clear.
---
Return the score without any other information:
'''

RELEVANCE_PROMPT = '''
Here is an academic survey about the topic "[TOPIC]":
---
[SURVEY]
---

Please evaluate this survey about the topic "[TOPIC]" based on the criterion provided below and give a score from 1 to 5 according to the score description:
---
Criterion Description: Relevance measures how well the content aligns with the research topic.
---
Score 1 Description: The content is outdated or unrelated to the field.
Score 2 Description: The survey is somewhat on topic but with several digressions.
Score 3 Description: The survey is generally on topic, despite a few unrelated details.
Score 4 Description: The survey is mostly on topic and focused.
Score 5 Description: The survey is exceptionally focused and entirely on topic.
---
Return the score without any other information:
'''


def getQwenClient():
    """Build an OpenAI-compatible client from environment variables."""
    openai_api_key = os.getenv("OPENAI_API_KEY")
    openai_api_base = os.getenv("OPENAI_API_BASE")
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )
    return client


def generateResponse(client, prompt):
    """Stream a chat completion and return the concatenated response text."""
    chat_response = client.chat.completions.create(
        model=os.environ.get("MODEL"),
        max_tokens=128,
        temperature=0.5,
        stop="<|im_end|>",
        stream=True,
        messages=[{"role": "user", "content": prompt}],
    )
    text = ""
    for chunk in chat_response:
        # Some backends emit keep-alive chunks with an empty choices list,
        # so guard before indexing.
        if chunk.choices and chunk.choices[0].delta.content:
            text += chunk.choices[0].delta.content
    return text


def evaluate_survey(topic, survey_content, client, prompt_template):
    """Fill the prompt template and return the model's (stripped) score reply."""
    prompt = prompt_template.replace("[TOPIC]", topic).replace("[SURVEY]", survey_content)
    response = generateResponse(client, prompt)
    return response.strip()


def evaluate_coverage(topic, survey_content, client):
    return evaluate_survey(topic, survey_content, client, COVERAGE_PROMPT)


def evaluate_structure(topic, survey_content, client):
    return evaluate_survey(topic, survey_content, client, STRUCTURE_PROMPT)


def evaluate_relevance(topic, survey_content, client):
    return evaluate_survey(topic, survey_content, client, RELEVANCE_PROMPT)


# --------------------------------------------------------------------------------
# MAIN LOGIC
# --------------------------------------------------------------------------------

if __name__ == "__main__":
    client = getQwenClient()

    category_folders = [
        "Computer Science",
        "Mathematics",
        "Physics",
        "Statistics",
        "Electrical Engineering and Systems Science",
        "Quantitative Biology",
        "Quantitative Finance",
        "Economics",
    ]

    evaluation_results = {}

    for category in category_folders:
        # If the folder doesn't exist, skip it.
        if not os.path.isdir(category):
            print(f"Skipping: '{category}' - directory not found.")
            continue

        # Initialize a dict for this category in the results.
        evaluation_results[category] = {}

        # For each file in this category folder, we only want .md files that
        # follow the naming pattern "survey_{topic}.md".
        for filename in os.listdir(category):
            if filename.lower().endswith(".md") and filename.startswith("survey_"):
                # Extract the topic from the filename, e.g.
                # "survey_LLM for In-Context Learning.md" -> "LLM for In-Context Learning"
                topic = filename[len("survey_"):-len(".md")]
                md_file_path = os.path.join(category, filename)
                if not os.path.isfile(md_file_path):
                    continue

                # Read the content of the survey file.
                with open(md_file_path, "r", encoding="utf-8") as f:
                    survey_content = f.read()

                # Evaluate on all three criteria. Scores are stored as the
                # model's raw string replies (expected "1" through "5").
                try:
                    coverage_score = evaluate_coverage(topic, survey_content, client)
                    structure_score = evaluate_structure(topic, survey_content, client)
                    relevance_score = evaluate_relevance(topic, survey_content, client)

                    # Store in nested dictionary: results[category][topic] = ...
                    evaluation_results[category][topic] = {
                        "coverage": coverage_score,
                        "structure": structure_score,
                        "relevance": relevance_score,
                    }
                    print(f"Evaluated: {category} / {topic}")
                except Exception as e:
                    print(f"Error evaluating '{category} / {topic}': {e}")

    # Write everything to a single JSON file.
    output_file = "evaluation_results.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(evaluation_results, f, indent=4, ensure_ascii=False)

    print(f"Evaluation completed. Results saved to: {output_file}")
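
# --------------------------------------------------------------------------------
# CONFIGURATION NOTES
# --------------------------------------------------------------------------------
# The script reads three environment variables (via .env): OPENAI_API_KEY,
# OPENAI_API_BASE, and MODEL. A minimal example .env, assuming an
# OpenAI-compatible endpoint; the base URL and model name below are
# placeholders, not values prescribed by this script:
#
#   OPENAI_API_KEY=sk-...
#   OPENAI_API_BASE=http://localhost:8000/v1
#   MODEL=Qwen2.5-72B-Instruct
#
# Surveys are expected in category folders next to the script, e.g.
# "Computer Science/survey_LLM for In-Context Learning.md"; the three 1-5
# scores per topic are written to evaluation_results.json.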