import os
import json
from dotenv import load_dotenv
from openai import OpenAI
load_dotenv()
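
# Example .env contents this script expects (the values below are illustrative
# placeholders, not real credentials or endpoints):
#
#   OPENAI_API_KEY=sk-...
#   OPENAI_API_BASE=http://localhost:8000/v1
#   MODEL=Qwen2.5-72B-Instruct
#
# Any OpenAI-compatible endpoint should work, as long as these three variables are set.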
# --------------------------------------------------------------------------------
# PROMPTS & CLIENT UTILS
# --------------------------------------------------------------------------------
COVERAGE_PROMPT = '''
Here is an academic survey about the topic "[TOPIC]":
---
[SURVEY]
---
<instruction>
Please evaluate this survey about the topic "[TOPIC]" based on the criterion provided below and give a score from 1 to 5 according to the score description:
---
Criterion Description: Coverage assesses the extent to which the survey encapsulates all relevant aspects of the topic.
---
Score 1 Description: The survey has very limited coverage, only touching on a small portion of the topic and lacking discussion on key areas.
Score 2 Description: The survey covers some parts of the topic but has noticeable omissions, with significant areas either underrepresented or missing.
Score 3 Description: The survey is generally comprehensive but still misses a few key points.
Score 4 Description: The survey covers most key areas comprehensively, with only very minor topics left out.
Score 5 Description: The survey comprehensively covers all key and peripheral topics, providing detailed discussions and extensive information.
---
Return the score without any other information:
'''
STRUCTURE_PROMPT = '''
Here is an academic survey about the topic "[TOPIC]":
---
[SURVEY]
---
<instruction>
Please evaluate this survey about the topic "[TOPIC]" based on the criterion provided below and give a score from 1 to 5 according to the score description:
---
Criterion Description: Structure evaluates the logical organization and coherence of sections and subsections.
---
Score 1 Description: The survey lacks logic, with no clear connections between sections.
Score 2 Description: The survey has weak logical flow with some disordered content.
Score 3 Description: The survey has a generally reasonable logical structure.
Score 4 Description: The survey has good logical consistency, with content well arranged.
Score 5 Description: The survey is tightly structured and logically clear.
---
Return the score without any other information:
'''
RELEVANCE_PROMPT = '''
Here is an academic survey about the topic "[TOPIC]":
---
[SURVEY]
---
<instruction>
Please evaluate this survey about the topic "[TOPIC]" based on the criterion provided below and give a score from 1 to 5 according to the score description:
---
Criterion Description: Relevance measures how well the content aligns with the research topic.
---
Score 1 Description: The content is outdated or unrelated to the field.
Score 2 Description: The survey is somewhat on topic but with several digressions.
Score 3 Description: The survey is generally on topic, despite a few unrelated details.
Score 4 Description: The survey is mostly on topic and focused.
Score 5 Description: The survey is exceptionally focused and entirely on topic.
---
Return the score without any other information:
'''
def getQwenClient():
    # Build an OpenAI-compatible client from environment variables.
    openai_api_key = os.getenv("OPENAI_API_KEY")
    openai_api_base = os.getenv("OPENAI_API_BASE")
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )
    return client
def generateResponse(client, prompt):
    # Stream the chat completion and accumulate the chunks into a single string.
    chat_response = client.chat.completions.create(
        model=os.environ.get("MODEL"),
        max_tokens=128,
        temperature=0.5,
        stop="<|im_end|>",
        stream=True,
        messages=[{"role": "user", "content": prompt}]
    )
    text = ""
    for chunk in chat_response:
        if chunk.choices[0].delta.content:
            text += chunk.choices[0].delta.content
    return text
def evaluate_survey(topic, survey_content, client, prompt_template):
    # Fill the prompt template with the topic and survey text, then query the model.
    prompt = prompt_template.replace("[TOPIC]", topic).replace("[SURVEY]", survey_content)
    response = generateResponse(client, prompt)
    return response.strip()

def evaluate_coverage(topic, survey_content, client):
    return evaluate_survey(topic, survey_content, client, COVERAGE_PROMPT)

def evaluate_structure(topic, survey_content, client):
    return evaluate_survey(topic, survey_content, client, STRUCTURE_PROMPT)

def evaluate_relevance(topic, survey_content, client):
    return evaluate_survey(topic, survey_content, client, RELEVANCE_PROMPT)
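
# Optional sketch: the prompts ask for a bare score, but a model reply can still
# include stray text (e.g., "Score: 4"). A helper like the one below (illustrative;
# the main loop stores the raw strings as-is) coerces the reply to an integer in
# [1, 5], returning None when no score digit is found.
def parse_score(raw_response):
    for ch in raw_response:
        if ch in "12345":
            return int(ch)
    return None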
# --------------------------------------------------------------------------------
# MAIN LOGIC
# --------------------------------------------------------------------------------
if __name__ == "__main__":
    client = getQwenClient()
    category_folders = [
        "Computer Science",
        "Mathematics",
        "Physics",
        "Statistics",
        "Electrical Engineering and Systems Science",
        "Quantitative Biology",
        "Quantitative Finance",
        "Economics"
    ]
    evaluation_results = {}
    for category in category_folders:
        if not os.path.isdir(category):
            # If the folder doesn't exist, skip it
            print(f"Skipping: '{category}' - directory not found.")
            continue
        # Initialize a dict for this category in the results
        evaluation_results[category] = {}
        # For each .md file found in this category folder
        for filename in os.listdir(category):
            # We only want .md files that follow the naming pattern "survey_{topic}.md"
            if filename.lower().endswith(".md") and filename.startswith("survey_"):
                # Extract the topic from the filename,
                # e.g., "survey_LLM for In-Context Learning.md" -> "LLM for In-Context Learning"
                topic = filename[len("survey_") : -len(".md")]
                md_file_path = os.path.join(category, filename)
                if not os.path.isfile(md_file_path):
                    continue
                # Read the content of the survey file
                with open(md_file_path, "r", encoding="utf-8") as f:
                    survey_content = f.read()
                # Evaluate the survey on all three criteria
                try:
                    coverage_score = evaluate_coverage(topic, survey_content, client)
                    structure_score = evaluate_structure(topic, survey_content, client)
                    relevance_score = evaluate_relevance(topic, survey_content, client)
                    # Store in nested dictionary: results[category][topic] = ...
                    evaluation_results[category][topic] = {
                        "coverage": coverage_score,
                        "structure": structure_score,
                        "relevance": relevance_score
                    }
                    print(f"Evaluated: {category} / {topic}")
                except Exception as e:
                    print(f"Error evaluating '{category} / {topic}': {e}")
    # Write everything to a single JSON file
    output_file = "evaluation_results.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(evaluation_results, f, indent=4, ensure_ascii=False)
    print(f"Evaluation completed. Results saved to: {output_file}")