# -*- coding: utf-8 -*-
"""Untitled37.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1FbaYZ7tAm87yWo_lf87t2ynNPMR5olB-
# Prep
"""
## load helper functions
import json
import openai
import copy
import random
from openai import OpenAI
## parsing functions
from bs4 import BeautifulSoup
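## MultiAgentDebate: several LLM evaluator agents debate whether a (bio)medical claim
## is supported, optionally grounded in an evidence document; adjudicator calls break
## ties when the agents do not reach a consensus.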
class MultiAgentDebate:
def __init__(self, client=None):
if client is not None:
self.client = client
else:
self.client = self.get_client()
def get_prompt_direct_eval(self, claim):
prompt = '''
You are given a claim in the <claim></claim> tags. Your job is to analyze a given claim and decide whether the claim is supported or not. You should also consider the provided guidelines.
<guidelines>
1. Evaluate the claim's plausibility based on general medical knowledge.
2. Consider the specificity and credibility of any numbers or percentages.
3. Analyze the context and scope of the claim.
4. Assess any potential biases or limitations.
</guidelines>
<claim>
# %s
</claim>
Determine if the claim is supported or not. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your explanations in <explanation></explanation> XML tags. Skip the preamble.
'''%(claim)
return prompt
def get_prompt_direct_eval_w_doc(self, doc, claim):
prompt = '''
You are given a claim in the <claim></claim> tags and a document as evidence in <doc></doc> tags. Your job is to analyze a given claim with respect to the given evidence and decide whether the claim is supported or not. You should also consider the provided guidelines.
<guidelines>
1. Evaluate the claim's plausibility based on general medical knowledge.
2. Consider the specificity and credibility of any numbers or percentages.
3. Analyze the context and scope of the claim.
4. Assess any potential biases or limitations.
</guidelines>
<doc>
# %s
</doc>
<claim>
# %s
</claim>
Determine if the claim is supported or not given the document as the evidence. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your explanations in <explanation></explanation> XML tags. Skip the preamble.
'''%(doc,claim)
return prompt
def get_prompt_debate(self, claim, chat_history, mediator_feedback):
prompt = '''
You are given a claim in the <claim></claim> tags. Your job is to analyze a given claim and decide whether the claim is supported or not. You should also consider the provided guidelines. There are also other evaluator agents assigned the same task as you and you can also see the discussion history in <chat_history></chat_history> tags below.
<guidelines>
1. Evaluate the claim's plausibility based on general medical knowledge.
2. Consider the specificity and credibility of any numbers or percentages.
3. Analyze the context and scope of the claim.
4. Assess any potential biases or limitations.
</guidelines>
<claim>
# %s
</claim>
<chat_history>
# %s
</chat_history>
The <chat_history></chat_history> tag might be empty if this is the first round of evaluation. You can see your previous responses as well as other agents responses. Continue the discussion with other evaluator agents, talk to them and state why you agree/disagree with each other bringing as many arguments as you can. %s Determine if the claim is supported or not. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your arguments in <argument ></argument> XML tags. Skip the preamble.
'''%(claim,chat_history,mediator_feedback)
return prompt
def get_adjudicator_prompt(self, claim, chat_history):
prompt = '''
You are given a claim in the <claim></claim> tags and multiple judgments from evaluator agents. You go over the discussion between the agents and their arguments shown in between <chat_history></chat_history> tags. Your job is to analyze a given claim and decide whether the claim is supported or not. You should also consider the provided guidelines.
<guidelines>
1. Evaluate the claim's plausibility based on general medical knowledge.
2. Consider the specificity and credibility of any numbers or percentages.
3. Analyze the context and scope of the claim.
4. Assess any potential biases or limitations.
</guidelines>
<claim>
# %s
</claim>
<chat_history>
# %s
</chat_history>
Go over the agents responses, summarize them by saying who agrees/disagrees. Then looking at the agents responses, how well they are associated with the guidelines and finally your own judgement, determine if the claim is supported or not. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your arguments in <argument ></argument> XML tags. Skip the preamble.
'''%(claim,chat_history)
#Go over the agents responses, summarize them by saying who agrees/disagrees and make sure the arguments correctly used the provided guidelines. Then based on the correctness of agents responses and your own judegment of the summary using the provided guidelines, determine if the sentence is factually consistent with the document. A summary is factually inconsistent if there is a correct argument describing an error or discrepancy in the summary. Provide your evaluation using a JSON format with keys as "label" with values 1 (consistent) or 0 (inconsistent) and "explanation" and put your response between <response></response> tags. Skip the preamble.
return prompt
def get_prompt_debate_w_doc(self, doc, claim, chat_history, mediator_feedback):
prompt = '''
You are given a claim in the <claim></claim> tags and a document as evidence in <doc></doc> tags. Your job is to analyze a given claim and decide whether the claim is supported or not with respect to the given evidence. You should also consider the provided guidelines. There are also other evaluator agents assigned the same task as you and you can also see the discussion history in <chat_history></chat_history> tags below.
<guidelines>
1. Evaluate the claim's plausibility based on general medical knowledge.
2. Consider the specificity and credibility of any numbers or percentages.
3. Analyze the context and scope of the claim.
4. Assess any potential biases or limitations.
</guidelines>
<doc>
# %s
</doc>
<claim>
# %s
</claim>
<chat_history>
# %s
</chat_history>
The <chat_history></chat_history> tag might be empty if this is the first round of evaluation. You can see your previous responses as well as other agents responses. Continue the discussion with other evaluator agents, talk to them and state why you agree/disagree with each other bringing as many arguments as you can. %s Determine if the claim is supported or not. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your arguments in <argument ></argument> XML tags. Skip the preamble.
'''%(doc, claim,chat_history,mediator_feedback)
return prompt
def get_adjudicator_prompt_w_doc(self, doc, claim, chat_history):
prompt = '''
You are given a claim in the <claim></claim> tags, a document as evidence in <doc></doc> tags and multiple judgments from evaluator agents. You go over the discussion between the agents and their arguments shown in between <chat_history></chat_history> tags. Your job is to analyze a given claim and decide whether the claim is supported or not with respect to the given evidence. You should also consider the provided guidelines.
<guidelines>
1. Evaluate the claim's plausibility based on general medical knowledge.
2. Consider the specificity and credibility of any numbers or percentages.
3. Analyze the context and scope of the claim.
4. Assess any potential biases or limitations.
</guidelines>
<doc>
# %s
</doc>
<claim>
# %s
</claim>
<chat_history>
# %s
</chat_history>
Go over the agents responses, summarize them by saying who agrees/disagrees. Then looking at the agents responses, how well they are associated with the guidelines and finally your own judgement, determine if the claim is supported or not. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your arguments in <argument ></argument> XML tags. Skip the preamble.
'''%(doc, claim,chat_history)
#Go over the agents responses, summarize them by saying who agrees/disagrees and make sure the arguments correctly used the provided guidelines. Then based on the correctness of agents responses and your own judegment of the summary using the provided guidelines, determine if the sentence is factually consistent with the document. A summary is factually inconsistent if there is a correct argument describing an error or discrepancy in the summary. Provide your evaluation using a JSON format with keys as "label" with values 1 (consistent) or 0 (inconsistent) and "explanation" and put your response between <response></response> tags. Skip the preamble.
return prompt
def get_prompt_direct_w_causal_sub_claims(self, claim):
prompt = '''
You are given a claim in the <claim></claim> tags. Your job is to first break the claim into causal sub-claims and then analyze each sub-claim and decide whether the whole claim is supported or not.
A causal sub-claim represents a causal relation between two entities from the claim. This is not simply a statement but rather an explicit causal relation in which one entity (or its change) can have an effect on the other entity.
If all sub-claims are supported, then the claim is also supported. You should also consider the provided guidelines.
<guidelines>
1. Evaluate the claim's plausibility based on general medical knowledge.
2. Consider the specificity and credibility of any numbers or percentages.
3. Analyze the context and scope of the claim.
4. Assess any potential biases or limitations.
</guidelines>
<claim>
40mg/day dosage of folic acid and 2mg/day dosage of vitamin B12 does not affect chronic kidney disease (CKD) progression.
</claim>
Break the claim into any possible number of causal sub-claims with explicit causal relations and place them in <sub-claims></sub-claims> tags. Determine if the claim is supported or not given the document as the evidence by analyzing its causal sub-claims. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your explanations in <argument></argument> XML tags. Skip the preamble.
<sub-claims>
40mg/day dosage of folic acid causes chronic kidney disease (CKD) progression changes.
2mg/day dosage of vitamin B12 causes chronic kidney disease (CKD) progression changes.
</sub-claims>
<label>
1
</label>
<argument>
Yes. There is a study that indicates that treatment with high doses of folic acid (40 mg/day) did not improve survival or reduce the incidence of vascular disease in patients with advanced chronic kidney disease or end-stage renal disease. The study found no significant effect on mortality or secondary outcomes related to CKD progression, such as myocardial infarction, stroke, amputation, time to dialysis, or thrombosis in hemodialysis patients. Therefore, the claim that these dosages do not affect CKD progression is supported by the study's findings. It is also mentioned that patients who received vitamin B12 (2 mg/day), also did not show any significant effects on outcomes.
</argument>
You are given a claim in the <claim></claim> tags. Your job is to first break the claim into causal sub-claims and then analyze each sub-claim and decide whether the whole claim is supported or not.
A causal sub-claim represents a causal relation between two entities from the claim. This is not simply a statement but rather an explicit causal relation in which one entity (or its change) can have an effect on the other entity.
If all sub-claims are supported, then the claim is also supported. You should also consider the provided guidelines.
<guidelines>
1. Evaluate the claim's plausibility based on general medical knowledge.
2. Consider the specificity and credibility of any numbers or percentages.
3. Analyze the context and scope of the claim.
4. Assess any potential biases or limitations.
</guidelines>
<claim>
# %s
</claim>
Break the claim into causal sub-claims and place them in <sub-claims></sub-claims> tags. Determine if the claim is supported or not given the document as the evidence by analyzing its causal sub-claims. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your explanations in <argument></argument> XML tags. Skip the preamble.
'''%(claim)
return prompt
def get_prompt_direct_w_doc_w_causal_sub_claims(self, doc, claim):
prompt = '''
You are given a claim in the <claim></claim> tags and a document as evidence in <doc></doc> tags. Your job is to first break the claim into causal sub-claims and then analyze each sub-claim with respect to the given evidence and decide whether the whole claim is supported or not.
A causal sub-claim represents a causal relation between two entities from the claim. This is not simply a statement but rather an explicit causal relation in which one entity (or its change) can have an effect on the other entity.
If all sub-claims are supported, then the claim is also supported. You should also consider the provided guidelines.
<guidelines>
1. Evaluate the claim's plausibility based on general medical knowledge.
2. Consider the specificity and credibility of any numbers or percentages.
3. Analyze the context and scope of the claim.
4. Assess any potential biases or limitations.
</guidelines>
<doc>
High plasma homocysteine levels are a risk factor for mortality and vascular disease in observational studies of patients with chronic kidney disease.", "Folic acid and B vitamins decrease homocysteine levels in this population but whether they lower mortality is unknown. \n", "OBJECTIVE To determine whether high doses of folic acid and B vitamins administered daily reduce mortality in patients with chronic kidney disease. \n", "DESIGN, SETTING, AND PARTICIPANTS Double-blind randomized controlled trial (2001-2006) in 36 US Department of Veterans Affairs medical centers.", "Median follow-up was 3.2 years for 2056 participants aged 21 years or older with advanced chronic kidney disease (estimated creatinine clearance < or =30 mL/min) (n = 1305) or end-stage renal disease (n = 751) and high homocysteine levels (> or = 15 micromol/L). \n", "INTERVENTION Participants received a daily capsule containing 40 mg of folic acid, 100 mg of pyridoxine hydrochloride (vitamin B6), and 2 mg of cyanocobalamin (vitamin B12) or a placebo. \n", "MAIN OUTCOME MEASURES The primary outcome was all-cause mortality.", "Secondary outcomes included myocardial infarction (MI), stroke, amputation of all or part of a lower extremity, a composite of these 3 plus all-cause mortality, time to initiation of dialysis, and time to thrombosis of arteriovenous access in hemodialysis patients. \n", "RESULTS Mean baseline homocysteine level was 24.0 micromol/L in the vitamin group and 24.2 micromol/L in the placebo group.", "It was lowered 6.3 micromol/L (25.8%%, P < .001) in the vitamin group and 0.4 micromol/L (1.7%%, P = .14) in the placebo group at 3 months, but there was no significant effect on mortality (448 vitamin group deaths vs 436 placebo group deaths) (hazard ratio [HR], 1.04, 95%% CI, 0.91-1.18).", "No significant effects were demonstrated for secondary outcomes or adverse events: there were 129 MIs in the vitamin group vs 150 for placebo (HR, 0.86, 95%% CI, 0.67-1.08), 37 strokes in the vitamin group vs 41 for placebo (HR, 0.90, 95%% CI, 0.58-1.40), and 60 amputations in the vitamin group vs 53 for placebo (HR, 1.14, 95%% CI, 0.79-1.64).", "In addition, the composite of MI, stroke, and amputations plus mortality (P = .85), time to dialysis (P = .38), and time to thrombosis in hemodialysis patients (P = .97) did not differ between the vitamin and placebo groups. \n", "CONCLUSION Treatment with high doses of folic acid and B vitamins did not improve survival or reduce the incidence of vascular disease in patients with advanced chronic kidney disease or end-stage renal disease. \n", "TRIAL REGISTRATION clinicaltrials.gov Identifier: NCT00032435."
</doc>
<claim>
40mg/day dosage of folic acid and 2mg/day dosage of vitamin B12 does not affect chronic kidney disease (CKD) progression.
</claim>
Break the claim into any possible number of causal sub-claims with explicit causal relations and place them in <sub-claims></sub-claims> tags. Determine if the claim is supported or not given the document as the evidence by analyzing its causal sub-claims. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your explanations in <argument></argument> XML tags. Skip the preamble.
<sub-claims>
40mg/day dosage of folic acid causes chronic kidney disease (CKD) progression changes.
2mg/day dosage of vitamin B12 causes chronic kidney disease (CKD) progression changes.
</sub-claims>
<label>
1
</label>
<argument>
Yes. The information provided indicates that treatment with high doses of folic acid (40 mg/day) did not improve survival or reduce the incidence of vascular disease in patients with advanced chronic kidney disease or end-stage renal disease. The study found no significant effect on mortality or secondary outcomes related to CKD progression, such as myocardial infarction, stroke, amputation, time to dialysis, or thrombosis in hemodialysis patients. Therefore, the claim that these dosages do not affect CKD progression is supported by the study's findings. It is also mentioned that patients who received vitamin B12 (2 mg/day), also did not show any significant effects on outcomes.
</argument>
You are given a claim in the <claim></claim> tags and a document as evidence in <doc></doc> tags. Your job is to first break the claim into causal sub-claims and then analyze each sub-claim with respect to the given evidence and decide whether the whole claim is supported or not.
A causal sub-claim represents a causal relation between two entities from the claim. This is not simply a statement but rather an explicit causal relation in which one entity (or its change) can have an effect on the other entity.
If all sub-claims are supported, then the claim is also supported. You should also consider the provided guidelines.
<guidelines>
1. Evaluate the claim's plausibility based on general medical knowledge.
2. Consider the specificity and credibility of any numbers or percentages.
3. Analyze the context and scope of the claim.
4. Assess any potential biases or limitations.
</guidelines>
<doc>
# %s
</doc>
<claim>
# %s
</claim>
Break the claim into causal sub-claims and place them in <sub-claims></sub-claims> tags. Determine if the claim is supported or not given the document as the evidence by analyzing its causal sub-claims. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your explanations in <argument></argument> XML tags. Skip the preamble.
'''%(doc,claim)
return prompt
def parse_output_response(self, response):
soup = BeautifulSoup(response, 'html.parser')
explanation_list = soup.find_all("explanation")
explanation_text = ""
for exp in explanation_list:
if exp.string != None:
explanation_text += exp.string + ' '
else:
explanation_text = response
explanation_text = ' '.join(explanation_text.split())
if len(soup.find_all("label")) > 0:
labels = soup.find_all("label")[-1].string.strip()
else:
labels = "Unknown"
return labels, explanation_text
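## Illustrative example: parse_output_response('<label>1</label><explanation>ok</explanation>')
## returns ('1', 'ok'); when no <label> tag is found, the label defaults to "Unknown".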
def parse_output_response_w_category(self, response):
soup = BeautifulSoup(response, 'html.parser')
explanation_list = soup.find_all("explanation")
explanation_text = ""
for exp in explanation_list:
if exp.string != None:
explanation_text += exp.string + ' '
else:
explanation_text = response
explanation_text = ' '.join(explanation_text.split())
category_list = soup.find_all("category")
category_text = ""
for exp in category_list:
if exp.string != None:
category_text += exp.string + ' '
else:
category_text = ""
category_text = ' '.join(category_text.split())
if len(soup.find_all("label")) > 0:
labels = soup.find_all("label")[-1].string.strip()
else:
labels = "Unknown"
return labels, category_text, explanation_text
def parse_output_w_chat_label(self, response):
soup = BeautifulSoup(response, 'html.parser')
argument_list = soup.find_all("argument")
argument_text = ""
for argument in argument_list:
if argument.string != None:
argument_text += argument.string + ' '
else:
argument_text = response
argument_text = ' '.join(argument_text.split())
if len(soup.find_all("label")) > 0:
guidelines = soup.find_all("label")[0].string.strip()
else:
guidelines = "Unknown"
return argument_text, guidelines
def parse_output_response_w_causal_subclaims(self, response):
soup = BeautifulSoup(response, 'html.parser')
argument_list = soup.find_all("argument")
argument_text = ""
for argument in argument_list:
if argument.string != None:
argument_text += argument.string + ' '
else:
argument_text = response
argument_text = ' '.join(argument_text.split())
if len(soup.find_all("label")) > 0:
label = soup.find_all("label")[0].string.strip()
else:
label = "Unknown"
sub_claims_text = ""
if len(soup.find_all("sub-claims")) > 0:
sub_claims_list = soup.find_all("sub-claims")
for claim in sub_claims_list:
if claim.string != None:
sub_claims_text += claim.string + '\n'
return label, argument_text, sub_claims_text
"""# OpenAI Prep"""
def get_client(self):
## the API key is read from the OPENAI_API_KEY environment variable rather than hard-coded here
self.client = OpenAI()
return self.client
#client = get_client()
def parse_chatgpt_api_response(self, response):
choices = response.choices
# choices = response["choices"]
main_response_message_list = []
if len(choices) > 1:
for choice in choices:
main_response = choice.message
# main_response_message, main_response_role = main_response["content"], main_response["role"]
main_response_message, main_response_role = main_response.content, main_response.role
main_response_message_list.append(main_response_message)
return main_response_message_list, response
else:
main_response = choices[0].message
# main_response_message, main_response_role = main_response["content"], main_response["role"]
main_response_message, main_response_role = main_response.content, main_response.role
return main_response_message, response
def make_openai_api_call(self, prompt, model_name, temperature):
if 'gpt-3' in model_name or 'gpt-4' in model_name:
# openai.ChatCompletion.create
response = self.client.chat.completions.create(
model=model_name,
messages=[{'role': 'user', 'content': prompt}],
temperature=temperature,
max_tokens=4096,
top_p=1.0,
frequency_penalty=0.0,
presence_penalty=0.0,
n=1,
)
return self.parse_chatgpt_api_response(response)
def make_openai_api_call_o3_mini(self, prompt, model_name, temperature):
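## Note: the temperature argument is kept for interface parity with make_openai_api_call
## but is intentionally not forwarded, since reasoning models such as o3-mini do not
## accept a sampling temperature.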
response = self.client.chat.completions.create(
model=model_name,
messages=[{'role': 'user', 'content': prompt}],
response_format={
"type": "text"
},
reasoning_effort="medium"
)
return self.parse_chatgpt_api_response(response)
def read_file(self, file_path):
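## Expects a JSONL file: one JSON object per line. The exact keys (e.g. "doc", "claim")
## are dataset-specific and only illustrative here.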
all_data = []
with open(file_path, 'r') as input_file:
for line in input_file:
line = line.strip()
data = json.loads(line)
all_data.append(data)
return all_data
def safe_print(self, x, *args):
print(x)
def __call__(self, doc, claim, initialization=True, model_name='gpt-4o-mini',
initial_agent_responses=None,
writer=None):
## default to the bound safe_print helper so that writer(message, extra) prints the message
if writer is None:
writer = self.safe_print
# number of simultaneous debates for evaluation
num_debates = 1
eval_repeat_max = 0
## initialize a dictionary to save the outputs of each separate debate
debates_dict = dict.fromkeys([0],None)
overall_ambiguity = False
initialization = initialization
## keep starting debates until you reach the max number of debates
while eval_repeat_max != num_debates:
ambiguous = False
results = {}
doc = doc
sent = claim
## initial stance assignment. We use the following list of utterances as the first response of each agent and then use
## them as the chat history to start the debate. The default number of agents is 4; you can change it by adding
## more utterances.
if initialization:
if initial_agent_responses is None:
agents_responses = ["The claim is not refuted by evidence.", "The claim is refuted by evidence.", "The claim is not refuted by evidence.", "The claim is refuted by evidence."]
else:
agents_responses = []
for n in range(4):
if n < len(initial_agent_responses):
agents_responses.append(initial_agent_responses[n])
else:
if n % 2 == 0:
agents_responses.append("The claim is not refuted by evidence.")
else:
agents_responses.append("The claim is refuted by evidence.")
else:
agents_responses = ["","","",""]
updated_responses = []
## to keep track of previous responses of agents and provide them in each round
message_board = ['','','','']
## initialize a label list to keep track of the agents' judgements
label_list = [[1],[0],[1],[0]]
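## label convention: 1 = supported, 0 = refuted; the initial entries mirror the default alternating stances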
all_chats = []
## number of rounds of debates
turns = 3
mediator_feedback = ""
## the first round of random assessment is not included in the history.
round_counter = 0
if initialization:
print("ROUND %s: (This is the initialization round where agents are assigned initial stance as their beliefs.)\n"%str(round_counter+1))
for n in range(len(agents_responses)):
writer("Agent %s: "%str(n+1) + agents_responses[n] + "\n",
"This is my initial belief.")
print("----------------------------------------------------")
round_counter += 1
print("ROUND %s:\n"%str(round_counter+1))
for n in range(len(agents_responses)):
chat_history = ""
chat_history_prompt = ''
chat_history_prompt += message_board[n] + "You (Agent %s): "%str(n+1) + agents_responses[n] + "\n"
chat_history += "You (Agent %s): "%str(n+1) + agents_responses[n] + "\n"
other_agents_response = ""
for nn in range(len(agents_responses)):
if nn != n:
other_agents_response += "Agent %s: "%str(nn+1) + agents_responses[nn] + "\n"
chat_history += "Agent %s: "%str(nn+1) + agents_responses[nn] + "\n"
message_board[n] += chat_history
chat_history_prompt += other_agents_response
## for experiments without initial stance assignment, the chat history is cleared below
if not initialization:
chat_history_prompt = ""
## the prompt builder takes the document, the claim sentence, the previous chat history and the mediator feedback,
## which can be used to modify the goals of the agents
if doc != "":
prompt = self.get_prompt_debate_w_doc(doc, sent, chat_history_prompt, mediator_feedback)
else:
prompt = self.get_prompt_debate(sent, chat_history_prompt, mediator_feedback)
argument = ""
rep_ctr = 0
label = -1
label_val = -1
## to make sure we have enough initial diversity in the responses, we repeat the following step: if the immediate
## response differs from the assigned stance, the agent is asked to regenerate. The rep_ctr caps the number of
## retries before moving on to the next stage
while label!="Unknown" and label_val != label_list[n][0] and rep_ctr != 1:
llm_response, _ = self.make_openai_api_call(prompt, model_name, 1)
argument, label = self.parse_output_w_chat_label(llm_response)
print(f">>>>>>>\n\t{label}\n")
strlabel = "Support" if label == "1" else "Refute"
writer("Agent %s's Assessment:\n"%str(n+1) + '%s. \n'%strlabel, 'Explanation: %s'%argument + "\n")
print("***************")
rep_ctr += 1
## the generated label might not be in the correct format, so normalize it before use
if label != "Unknown":
if len(label.split()) != 0 and ',' not in label.split()[0]:
label_val = float(label.split()[0])
elif len(label.split()) == 0 or ',' in label.split()[0]:
if len(label.split(',')) != 0:
label_val = float(label.split(',')[0])
else:
label_val = float(label)
if label_val >= 0.5:
label_val = 1
else:
label_val = 0
if label != "Unknown":
if len(label.split()) != 0 and ',' not in label.split()[0]:
label_val = float(label.split()[0])
elif len(label.split()) == 0 or ',' in label.split()[0]:
if len(label.split(',')) != 0:
label_val = float(label.split(',')[0])
else:
label_val = float(label)
if label_val >= 0.5:
label_list[n].append(1)
else:
label_list[n].append(0)
else:
label_list[n].append(label_list[n][-1])
argument = argument.strip()
updated_responses.append(argument)
agents_responses = copy.deepcopy(updated_responses)
## Once the first round is generated, we start the debate among agents
message_board = ['','','','']
for ag, ag_resp in enumerate(agents_responses):
all_chats.append("Agent %s:\n"%str(ag+1) + ag_resp)
mediator_feedback = ""
## The debate continues for at most "turns" rounds.
for cnt in range(turns):
if len(set([lbl_list[-1] for lbl_list in label_list])) == 1:
break
print("----------------------------------------------------")
round_counter += 1
print("ROUND %s:\n"%str(round_counter+1))
updated_responses = []
for n in range(len(agents_responses)):
chat_history = ""
chat_history_prompt = ''
chat_history_prompt += message_board[n] + "You (Agent %s): "%str(n+1) + agents_responses[n] + "\n"
chat_history += "You (Agent %s): "%str(n+1) + agents_responses[n] + "\n"
other_agents_response = ""
for nn in range(len(agents_responses)):
if nn != n:
other_agents_response += "Agent %s: "%str(nn+1) + agents_responses[nn] + "\n"
chat_history += "Agent %s: "%str(nn+1) + agents_responses[nn] + "\n"
message_board[n] += chat_history
chat_history_prompt += other_agents_response
## shuffle the chat history within each round to remove any bias caused by the order of the chats
new_chat_history_list = []
chat_history_prompt_list = chat_history_prompt.split('\n')
chat_history_prompt_list = [chat_hist for chat_hist in chat_history_prompt_list if chat_hist != ""]
for pq in range(0,len(chat_history_prompt_list),len(agents_responses)):
shuffled_list = chat_history_prompt_list[pq:pq+len(agents_responses)]
random.shuffle(shuffled_list)
new_chat_history_list += shuffled_list
chat_history_prompt = '\n'.join(new_chat_history_list)
## you can add any type of mediator feedback here and append it to the prompt to improve debate consensus;
## an example (applied after the first round) is commented out below
# if cnt >= 1:
# mediator_feedback = " Look back at the guidelines and how you have used them. Make sure all guidelines (and not only a subset of them) are satisfied in your assessment. Change your stance if you have made an error or if the other agents are more convincing."
mediator_feedback = ""
if doc != "":
prompt = self.get_prompt_debate_w_doc(doc, sent, chat_history_prompt, mediator_feedback)
else:
prompt = self.get_prompt_debate(sent, chat_history_prompt, mediator_feedback)
llm_response, _ = self.make_openai_api_call(prompt, model_name, 1)
# print(llm_response)
# print("***************")
argument, label = self.parse_output_w_chat_label(llm_response)
strlabel = "Support" if label == "1" else "Refute"
writer("Agent %s's Assessment: \n"%str(n+1) + '%s. \n'%strlabel, 'Explanation: %s'%argument + "\n")
print("***************")
if label != "Unknown":
if len(label.split()) != 0 and ',' not in label.split()[0]:
label_val = float(label.split()[0])
elif len(label.split()) == 0 or ',' in label.split()[0]:
if len(label.split(',')) != 0:
label_val = float(label.split(',')[0])
else:
label_val = float(label)
if label_val >= 0.5:
label_list[n].append(1)
else:
label_list[n].append(0)
else:
label_list[n].append(label_list[n][-1])
argument = argument.strip()
updated_responses.append(argument)
all_chats.append('Agent %s:\n'%str(n+1) + argument)
agents_responses = copy.deepcopy(updated_responses)
if len(set([lbl_list[-1] for lbl_list in label_list])) == 1:
break
#print(label_list)
label_list_text = [["Supported" if item == 1 else "Refuted" for item in lbl] for lbl in label_list]
print('----------------------------------------------------')
for lbl in range(len(label_list_text)):
print("Agent %s trajectory:\n%s\n"%(str(lbl+1), label_list_text[lbl]))
pn_list = [lbl[-1] for lbl in label_list]
debate_arguments = copy.deepcopy(all_chats[-len(agents_responses):])
## we record the outputs of the debate in a dictionary that was previously initialized.
## the "change" key keeps track of the number of agents who changes their stance during debate.
## this can be used to identify the ambiguous cases directly.
if pn_list.count(0) == pn_list.count(1):
debates_dict[eval_repeat_max] = {'change': 0, 'label': -1,'arguments': debate_arguments,'labels': label_list}
all_chats_dict = {}
for n_agents in range(len(debate_arguments)):
all_chats_dict['Agent %s:'%str(n_agents+1)] = ""
for cht_counter, cht in enumerate(debate_arguments):
all_chats_dict['Agent %s:'%str(cht_counter+1)] += ' '.join(cht.split('\n')[1:]) + ' '
## if there is no winning label, we use adjudicators to decide on the final label.
## you can use multiple adjudicators if you want to do majority voting among them.
adjudicator_input = [str(item) + ' ' + all_chats_dict[item] for item in all_chats_dict]
if doc != "":
adjudicator_prompt = self.get_adjudicator_prompt_w_doc(doc, sent, '\n'.join(adjudicator_input))
else:
adjudicator_prompt = self.get_adjudicator_prompt(sent, '\n'.join(adjudicator_input))
rep_counter = 0
adjudicator_label_list = []
label = ""
explanation_list = []
for i in range(1):
while label == "" and rep_counter != 5:
adjudicator_response, _ = self.make_openai_api_call(adjudicator_prompt, model_name, 1.0)
label , explanation = self.parse_output_response(adjudicator_response)
explanation_list.append(explanation)
writer(label, explanation)
print('********')
if label != "Unknown":
if len(label.split()) != 0 and ',' not in label.split()[0]:
label_val = float(label.split()[0])
elif len(label.split()) == 0 or ',' in label.split()[0]:
if len(label.split(',')) != 0:
label_val = float(label.split(',')[0])
else:
label_val = float(label)
if label_val >= 0.5:
label = 1
else:
label = 0
else:
label = -1
rep_counter += 1
adjudicator_label_list.append(label)
label = ""
if adjudicator_label_list.count(1) >= adjudicator_label_list.count(0):
label = 1
else:
label = 0
debates_dict[eval_repeat_max]['label'] = label
## if there is a winning label, we return it as the final label of the claim
elif pn_list.count(0) != pn_list.count(1):
if pn_list.count(1) >= pn_list.count(0):
label = 1
else:
label = 0
if len(set(pn_list)) == 1:
change = len(agents_responses)//2
else:
change = len(agents_responses)//2 - 1
debates_dict[eval_repeat_max] = {'change': change, 'label': label,'arguments': debate_arguments,'labels': label_list}
explanation_list = debate_arguments
eval_repeat_max += 1
all_label_lists = [debates_dict[item]['labels'] for item in debates_dict]
## majority vote across debates: each debate produces a winner and the final label is the one with the most votes
debates_majority_vote_list = [debates_dict[item]['label'] for item in debates_dict]
print(debates_majority_vote_list)
if debates_majority_vote_list.count(1) == num_debates or debates_majority_vote_list.count(0) == num_debates:
debate_ambiguity = False
else:
debate_ambiguity = True
if debates_majority_vote_list.count(1)> debates_majority_vote_list.count(0):
debates_majority_vote = 1
elif debates_majority_vote_list.count(1) < debates_majority_vote_list.count(0):
debates_majority_vote = 0
print(debates_majority_vote)
changes_in_debates_list = [debates_dict[item]['change'] for item in debates_dict]
if changes_in_debates_list.count(0) == num_debates:
ambiguous = "Full"
elif changes_in_debates_list.count(0) == 0:
ambiguous = "None"
else:
ambiguous = "Partial"
# if changes_in_debates_list.count(0) != num_debates:
overall_majority_list = []
for label_list in all_label_lists:
change = 0
pn_list = []
for lbl in label_list:
if lbl[0] != lbl[-1]:
change += 1
pn_list.append(lbl[-1])
overall_majority_list += pn_list
## majority vote over all individual agents regardless of which debate they belong to
if overall_majority_list.count(1)> overall_majority_list.count(0):
overall_majority_vote = 1
elif overall_majority_list.count(1) < overall_majority_list.count(0):
overall_majority_vote = 0
else:
overall_ambiguity = True
## if there is a winner among the agents' responses, we report the majority vote
if changes_in_debates_list.count(0) != num_debates and overall_ambiguity == False:
label = overall_majority_vote
explanation_list = [debates_dict[item]['arguments'] for item in debates_dict]
adjudicator_list = []
all_arguments = [debates_dict[item]['arguments'] for item in debates_dict]
## if there is NOT a winner among the agents' responses, we use adjudicators to make the final call
elif changes_in_debates_list.count(0) == num_debates or overall_ambiguity == True:
all_arguments = [debates_dict[item]['arguments'] for item in debates_dict]
all_arguments = [x for xs in all_arguments for x in xs]
all_chats_dict = {}
for n_agents in range(len(all_arguments)):
all_chats_dict['Agent %s:'%str(n_agents+1)] = ""
for cht_counter, cht in enumerate(all_arguments):
all_chats_dict['Agent %s:'%str(cht_counter+1)] += ' '.join(cht.split('\n')[1:]) + ' '
adjudicator_input = [str(item) + ' ' + all_chats_dict[item] for item in all_chats_dict]
label_list = []
label = ""
explanation_list = []
for rep in range(3):
random.shuffle(adjudicator_input)
if doc != "":
adjudicator_prompt = self.get_adjudicator_prompt_w_doc(doc, sent, '\n'.join(adjudicator_input))
else:
adjudicator_prompt = self.get_adjudicator_prompt(sent, '\n'.join(adjudicator_input))
rep_counter = 0
while label == "" and rep_counter != 5:
adjudicator_response, _ = self.make_openai_api_call(adjudicator_prompt, model_name, 1.0)
label , explanation = self.parse_output_response(adjudicator_response)
explanation_list.append(explanation)
writer(label, explanation)
print('********')
if label != "Unknown":
if len(label.split()) != 0 and ',' not in label.split()[0]:
label_val = float(label.split()[0])
elif len(label.split()) == 0 or ',' in label.split()[0]:
if len(label.split(',')) != 0:
label_val = float(label.split(',')[0])
else:
label_val = float(label)
if label_val >= 0.5:
label = 1
else:
label = 0
else:
label = -1
rep_counter += 1
label_list.append(label)
label = ""
print(label_list)
results['adjudicators'] = label_list
results['adjudicators_agree'] = len(set(label_list)) == 1
if label_list.count(1) >= label_list.count(0):
label = 1
else:
label = 0
overall_majority_vote = label
adjudicator_list = label_list
label_text = ["contradict" if debates_majority_vote == 0 else "support"]
return label_text[0]
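
if __name__ == "__main__":
    ## Minimal usage sketch (illustrative only): the claim below is a placeholder and
    ## OPENAI_API_KEY is assumed to be set in the environment so the client can authenticate.
    ## Passing doc="" makes the agents debate the claim without an evidence document.
    debate = MultiAgentDebate()
    verdict = debate(
        doc="",
        claim="High doses of folic acid slow chronic kidney disease (CKD) progression.",
        model_name="gpt-4o-mini",
    )
    print(verdict)  # prints "support" or "contradict"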