import os
import json
from dotenv import load_dotenv
from openai import OpenAI
load_dotenv()
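# Expected environment variables (loaded from .env):
#   OPENAI_API_KEY  - API key for the OpenAI-compatible endpoint
#   OPENAI_API_BASE - base URL of the endpoint
#   MODEL           - model name passed to chat.completions.create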
# --------------------------------------------------------------------------------
# PROMPTS & CLIENT UTILS
# --------------------------------------------------------------------------------
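# Each prompt template below contains [TOPIC] and [SURVEY] placeholders, which
# evaluate_survey() fills in before sending the prompt to the model.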
COVERAGE_PROMPT = '''
Here is an academic survey about the topic "[TOPIC]":
---
[SURVEY]
---
<instruction>
Please evaluate this survey about the topic "[TOPIC]" based on the criterion provided below and give a score from 1 to 5 according to the score description:
---
Criterion Description: Coverage assesses the extent to which the survey encapsulates all relevant aspects of the topic.
---
Score 1 Description: The survey has very limited coverage, only touching on a small portion of the topic and lacking discussion on key areas.
Score 2 Description: The survey covers some parts of the topic but has noticeable omissions, with significant areas either underrepresented or missing.
Score 3 Description: The survey is generally comprehensive but still misses a few key points.
Score 4 Description: The survey covers most key areas comprehensively, with only very minor topics left out.
Score 5 Description: The survey comprehensively covers all key and peripheral topics, providing detailed discussions and extensive information.
---
Return the score without any other information:
'''
STRUCTURE_PROMPT = '''
Here is an academic survey about the topic "[TOPIC]":
---
[SURVEY]
---
<instruction>
Please evaluate this survey about the topic "[TOPIC]" based on the criterion provided below and give a score from 1 to 5 according to the score description:
---
Criterion Description: Structure evaluates the logical organization and coherence of sections and subsections.
---
Score 1 Description: The survey lacks logic, with no clear connections between sections.
Score 2 Description: The survey has weak logical flow with some disordered content.
Score 3 Description: The survey has a generally reasonable logical structure.
Score 4 Description: The survey has good logical consistency, with content well arranged.
Score 5 Description: The survey is tightly structured and logically clear.
---
Return the score without any other information:
'''
RELEVANCE_PROMPT = '''
Here is an academic survey about the topic "[TOPIC]":
---
[SURVEY]
---
<instruction>
Please evaluate this survey about the topic "[TOPIC]" based on the criterion provided below and give a score from 1 to 5 according to the score description:
---
Criterion Description: Relevance measures how well the content aligns with the research topic.
---
Score 1 Description: The content is outdated or unrelated to the field.
Score 2 Description: The survey is somewhat on topic but with several digressions.
Score 3 Description: The survey is generally on topic, despite a few unrelated details.
Score 4 Description: The survey is mostly on topic and focused.
Score 5 Description: The survey is exceptionally focused and entirely on topic.
---
Return the score without any other information:
'''
def getQwenClient():
    """Build an OpenAI-compatible client from environment variables."""
    openai_api_key = os.getenv("OPENAI_API_KEY")
    openai_api_base = os.getenv("OPENAI_API_BASE")
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )
    return client
def generateResponse(client, prompt):
    """Stream a chat completion for `prompt` and return the concatenated text."""
    chat_response = client.chat.completions.create(
        model=os.environ.get("MODEL"),
        max_tokens=128,
        temperature=0.5,
        stop="<|im_end|>",
        stream=True,
        messages=[{"role": "user", "content": prompt}]
    )
    text = ""
    for chunk in chat_response:
        if chunk.choices[0].delta.content:
            text += chunk.choices[0].delta.content
    return text
def evaluate_survey(topic, survey_content, client, prompt_template):
    """Fill the [TOPIC] and [SURVEY] placeholders and return the model's score reply."""
    prompt = prompt_template.replace("[TOPIC]", topic).replace("[SURVEY]", survey_content)
    response = generateResponse(client, prompt)
    return response.strip()
def evaluate_coverage(topic, survey_content, client):
    return evaluate_survey(topic, survey_content, client, COVERAGE_PROMPT)
def evaluate_structure(topic, survey_content, client):
    return evaluate_survey(topic, survey_content, client, STRUCTURE_PROMPT)
def evaluate_relevance(topic, survey_content, client):
    return evaluate_survey(topic, survey_content, client, RELEVANCE_PROMPT)
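# Optional sketch (not part of the original flow): the prompts ask the model to
# return only a score, yet the raw reply is stored as a string. A helper like
# this could normalize replies such as "4" or "Score: 4" to an int; taking the
# first digit 1-5 in the reply as the score is an assumption, not guaranteed.
def parse_score(raw_response):
    for ch in raw_response:
        if ch in "12345":
            return int(ch)
    return None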
# --------------------------------------------------------------------------------
# MAIN LOGIC
# --------------------------------------------------------------------------------
if __name__ == "__main__":
    client = getQwenClient()
    category_folders = [
        "Computer Science",
        "Mathematics",
        "Physics",
        "Statistics",
        "Electrical Engineering and Systems Science",
        "Quantitative Biology",
        "Quantitative Finance",
        "Economics"
    ]
    evaluation_results = {}
    for category in category_folders:
        if not os.path.isdir(category):
            # If the folder doesn't exist, skip
            print(f"Skipping: '{category}' - directory not found.")
            continue
        # Initialize a dict for this category in the results
        evaluation_results[category] = {}
        # For each .md file found in this category folder
        for filename in os.listdir(category):
            # We only want .md files that follow the naming pattern "survey_{topic}.md"
            if filename.lower().endswith(".md") and filename.startswith("survey_"):
                # Extract the topic from the filename
                # e.g., "survey_LLM for In-Context Learning.md" -> "LLM for In-Context Learning"
                topic = filename[len("survey_") : -len(".md")]
                md_file_path = os.path.join(category, filename)
                if not os.path.isfile(md_file_path):
                    continue
                # Read the content of the survey file
                with open(md_file_path, "r", encoding="utf-8") as f:
                    survey_content = f.read()
                # Evaluate
                try:
                    coverage_score = evaluate_coverage(topic, survey_content, client)
                    structure_score = evaluate_structure(topic, survey_content, client)
                    relevance_score = evaluate_relevance(topic, survey_content, client)
                    # Store in nested dictionary: results[category][topic] = ...
                    evaluation_results[category][topic] = {
                        "coverage": coverage_score,
                        "structure": structure_score,
                        "relevance": relevance_score
                    }
                    print(f"Evaluated: {category} / {topic}")
                except Exception as e:
                    print(f"Error evaluating '{category} / {topic}': {e}")
    # Write everything to a single JSON file
    output_file = "evaluation_results.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(evaluation_results, f, indent=4, ensure_ascii=False)
    print(f"Evaluation completed. Results saved to: {output_file}")