ferraro committed
Commit d04d9db · 1 Parent(s): 8513ee1

including multi-agent debate

Files changed (2)
  1. app.py +43 -19
  2. multi_agent_debate.py +828 -0
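In app.py, the new "Multi Agent Debate" reasoner option wraps the app's existing OpenAI client in the MultiAgentDebate class and streams each agent's turn back to the chat UI through a writer callback. A minimal sketch of that wiring, assuming an llm_client with a .client attribute as in the diff below (the helper name run_debate is illustrative):

from multi_agent_debate import MultiAgentDebate

def run_debate(llm_client, query, documents, writer):
    # Wrap the already-configured OpenAI client from the app's LLMReasoner.
    debate = MultiAgentDebate(client=llm_client.client)
    # The debate takes a single evidence document; pass "" when nothing was retrieved.
    evidence = documents[0] if documents else ""
    # Each agent/adjudicator turn is forwarded to writer(message, explanation);
    # the return value is the final verdict, "support" or "contradict".
    return debate(claim=query, doc=evidence, writer=writer)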
app.py CHANGED
@@ -10,6 +10,7 @@ from prompts import templates, get_examples
10
  from typing import Any
11
  from string import Template
12
  from qa_dreamer import get_questions
 
13
 
14
  def safe_parse_json(model_answer):
15
  """.."""
@@ -67,7 +68,7 @@ def select_models():
67
  """Returns only when a valid option is selected from both dropdowns."""
68
 
69
  retriever_options = ["Choose one...", "BM25 Retriever", "Off-the-shelf Retriever", "Finetuned Retriever", "No Retriever"]
70
- reasoner_options = ["Choose one...", "Claude Sonnet", "GPT-4o", "o3-mini"]
71
  dreamer_options = ["None", "CoRE", "CoRE-Contrastive", "QA-Decomposition"]
72
 
73
  #selectboxes
@@ -275,26 +276,44 @@ def reasoner(query: str, documents: list[str], selected_dreamer: str, selected_r
275
  prompt_template = Template(templates["with_evidence"])
276
  prompt = prompt_template.substitute(claim=query, corpus_text=documents)
277
 
278
- if selected_reasoner == "Claude Sonnet":
279
- message += "Using Claude Sonnet to reason and verify the claim..."
280
- elif selected_reasoner == "GPT-4o":
281
- message += "Using GPT-4o to analyze and verify the claim in detail..."
282
- elif selected_reasoner == "o3-mini":
283
- message += "Using o3-mini to quickly analyze the claim..."
284
 
285
- print(prompt)
286
-
287
- llm_response = llm_client.run_inference(prompt)
288
 
289
- answer_dict = safe_parse_json(llm_response)
290
- try:
291
- decision = answer_dict.get("decision", "")
292
- reasoning = answer_dict.get("reasoning", "")
293
- except:
294
- print(f"Error with parsing the returned {answer_dict}")
295
- decision, reasoning = "", ""
 
 
296
 
297
- display_to_chat(placeholder, message)
298
  # You could return reasoning info here.
299
  return reasoning, decision
300
 
@@ -360,6 +379,11 @@ def main():
360
  options["model_family"] = "OpenAI"
361
  options["model_name"] = "o3-mini-2025-01-31"
362
 
 
 
 
 
 
363
  options["API_KEY"] = api_key
364
 
365
  llm_client = LLMReasoner(options)
@@ -474,4 +498,4 @@ def main():
474
  st.session_state.messages.append({"role": "assistant", "content": full_response})
475
 
476
  if __name__ == '__main__':
477
- main()
 
10
  from typing import Any
11
  from string import Template
12
  from qa_dreamer import get_questions
13
+ from multi_agent_debate import MultiAgentDebate
14
 
15
  def safe_parse_json(model_answer):
16
  """.."""
 
68
  """Returns only when a valid option is selected from both dropdowns."""
69
 
70
  retriever_options = ["Choose one...", "BM25 Retriever", "Off-the-shelf Retriever", "Finetuned Retriever", "No Retriever"]
71
+ reasoner_options = ["Choose one...", "Claude Sonnet", "GPT-4o", "o3-mini", "Multi Agent Debate"]
72
  dreamer_options = ["None", "CoRE", "CoRE-Contrastive", "QA-Decomposition"]
73
 
74
  #selectboxes
 
276
  prompt_template = Template(templates["with_evidence"])
277
  prompt = prompt_template.substitute(claim=query, corpus_text=documents)
278
 
279
+ if selected_reasoner == "Multi Agent Debate":
280
+ multi_agent_debate = MultiAgentDebate(client=llm_client.client)
281
+ initial_evidence = documents[0] if documents else ""
282
+ def mad_printer(msg, explanation):
283
+ print(msg)
284
+ ph = st.empty()
285
+ display_to_chat(ph, msg)
286
+ if explanation is not None and len(explanation) > 0:
287
+ with st.expander("Agent's Argument (see more)"):
288
+ st.write(str(explanation))
289
+
290
+ decision = multi_agent_debate(claim=query,
291
+ doc=initial_evidence,
292
+ writer=mad_printer)
293
+ reasoning = "of the debate and discussion."
294
+ else:
295
+ if selected_reasoner == "Claude Sonnet":
296
+ message += "Using Claude Sonnet to reason and verify the claim..."
297
+ elif selected_reasoner == "GPT-4o":
298
+ message += "Using GPT-4o to analyze and verify the claim in detail..."
299
+ elif selected_reasoner == "o3-mini":
300
+ message += "Using o3-mini to quickly analyze the claim..."
301
+ elif selected_reasoner == "Multi Agent Debate":
302
+ message += "Multiple Agents will discuss and reason about the claim..."
303
 
304
+ print(prompt)
 
 
305
 
306
+ llm_response = llm_client.run_inference(prompt)
307
+
308
+ answer_dict = safe_parse_json(llm_response)
309
+ try:
310
+ decision = answer_dict.get("decision", "")
311
+ reasoning = answer_dict.get("reasoning", "")
312
+ except:
313
+ print(f"Error with parsing the returned {answer_dict}")
314
+ decision, reasoning = "", ""
315
 
316
+ display_to_chat(placeholder, message)
317
  # You could return reasoning info here.
318
  return reasoning, decision
319
 
 
379
  options["model_family"] = "OpenAI"
380
  options["model_name"] = "o3-mini-2025-01-31"
381
 
382
+ elif selected_reasoner == "Multi Agent Debate":
383
+ api_key = os.getenv("openai_key")
384
+ options["model_family"] = "OpenAI"
385
+ options["model_name"] = "gpt-4o-2024-11-20"
386
+
387
  options["API_KEY"] = api_key
388
 
389
  llm_client = LLMReasoner(options)
 
498
  st.session_state.messages.append({"role": "assistant", "content": full_response})
499
 
500
  if __name__ == '__main__':
501
+ main()
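The new module below has the agents emit their verdicts in lightweight XML-style tags (<label>1</label> or <label>0</label>, with the rationale in <argument>...</argument> or <explanation>...</explanation>), which the class parses with BeautifulSoup. A small illustrative parse, using a made-up response string:

from bs4 import BeautifulSoup

sample = "<label>1</label><argument>The trial reported no effect on CKD progression.</argument>"
soup = BeautifulSoup(sample, "html.parser")
label = soup.find_all("label")[-1].string.strip()   # "1" -> supported, "0" -> refuted
argument = " ".join(a.string for a in soup.find_all("argument"))
print(label, "|", argument)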
multi_agent_debate.py ADDED
@@ -0,0 +1,828 @@
1
+ # -*- coding: utf-8 -*-
2
+ """Untitled37.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1FbaYZ7tAm87yWo_lf87t2ynNPMR5olB-
8
+
9
+ # Prep
10
+ """
11
+
12
+ ## load helper functions
13
+ import json
14
+ import openai
15
+ import copy
16
+ import random
17
+ from openai import OpenAI
18
+ ## parsing functions
19
+ from bs4 import BeautifulSoup
20
+
21
+ class MultiAgentDebate:
22
+ def __init__(self, client=None):
23
+ if client is not None:
24
+ self.client = client
25
+ else:
26
+ self.client = self.get_client()
27
+
28
+ def get_prompt_direct_eval(self, claim):
29
+
30
+ prompt = '''
31
+ You are given a claim in the <claim></claim> tags. Your job is to analyze a given claim and decide whether the claim is supported or not. You should also consider the provided guidelines.
32
+
33
+ <guidelines>
34
+ 1. Evaluate the claim's plausibility based on general medical knowledge.
35
+ 2. Consider the specificity and credibility of any numbers or percentages.
36
+ 3. Analyze the context and scope of the claim.
37
+ 4. Assess any potential biases or limitations.
38
+ </guidelines>
39
+
40
+ <claim>
41
+ # %s
42
+ </claim>
43
+
44
+ Determine if the claim is supported or not. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your explanations in <explanation></explanation> XML tags. Skip the preamble.
45
+ '''%(claim)
46
+ return prompt
47
+
48
+ def get_prompt_direct_eval_w_doc(self, doc, claim):
49
+
50
+ prompt = '''
51
+ You are given a claim in the <claim></claim> tags and a document as evidence in <doc></doc> tags. Your job is to analyze a given claim with respect to the given evidence and decide whether the claim is supported or not. You should also consider the provided guidelines.
52
+
53
+ <guidelines>
54
+ 1. Evaluate the claim's plausibility based on general medical knowledge.
55
+ 2. Consider the specificity and credibility of any numbers or percentages.
56
+ 3. Analyze the context and scope of the claim.
57
+ 4. Assess any potential biases or limitations.
58
+ </guidelines>
59
+
60
+ <doc>
61
+ # %s
62
+ </doc>
63
+
64
+ <claim>
65
+ # %s
66
+ </claim>
67
+
68
+ Determine if the claim is supported or not given the document as the evidence. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your explanations in <explanation></explanation> XML tags. Skip the preamble.
69
+ '''%(doc,claim)
70
+ return prompt
71
+
72
+ def get_prompt_debate(self, claim, chat_history, mediator_feedback):
73
+ prompt = '''
74
+ You are given a claim in the <claim></claim> tags. Your job is to analyze a given claim and decide whether the claim is supported or not. You should also consider the provided guidelines. There are also other evaluator agents assigned the same task as you and you can also see the discussion history in <chat_history></chat_history> tags below.
75
+
76
+ <guidelines>
77
+ 1. Evaluate the claim's plausibility based on general medical knowledge.
78
+ 2. Consider the specificity and credibility of any numbers or percentages.
79
+ 3. Analyze the context and scope of the claim.
80
+ 4. Assess any potential biases or limitations.
81
+ </guidelines>
82
+
83
+ <claim>
84
+ # %s
85
+ </claim>
86
+
87
+ <chat_history>
88
+ # %s
89
+ </chat_history>
90
+
91
+ The <chat_history></chat_history> tag might be empty if this is the first round of evaluation. You can see your previous responses as well as other agents responses. Continue the discussion with other evaluator agents, talk to them and state why you agree/disagree with each other bringing as many arguments as you can. %s Determine if the claim is supported or not. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your arguments in <argument ></argument> XML tags. Skip the preamble.
92
+ '''%(claim,chat_history,mediator_feedback)
93
+ return prompt
94
+
95
+ def get_adjudicator_prompt(self, claim, chat_history):
96
+ prompt = '''
97
+ You are given a claim in the <claim></claim> tags and multiple judgments from evaluator agents. You go over the discussion between the agents and their arguments shown in between <chat_history></chat_history> tags. Your job is to analyze a given claim and decide whether the claim is supported or not. You should also consider the provided guidelines.
98
+
99
+ <guidelines>
100
+ 1. Evaluate the claim's plausibility based on general medical knowledge.
101
+ 2. Consider the specificity and credibility of any numbers or percentages.
102
+ 3. Analyze the context and scope of the claim.
103
+ 4. Assess any potential biases or limitations.
104
+ </guidelines>
105
+
106
+ <claim>
107
+ # %s
108
+ </claim>
109
+
110
+ <chat_history>
111
+ # %s
112
+ </chat_history>
113
+
114
+ Go over the agents responses, summarize them by saying who agrees/disagrees. Then looking at the agents responses, how well they are associated with the guidelines and finally your own judgement, determine if the claim is supported or not. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your arguments in <argument ></argument> XML tags. Skip the preamble.
115
+ '''%(claim,chat_history)
116
+ #Go over the agents responses, summarize them by saying who agrees/disagrees and make sure the arguments correctly used the provided guidelines. Then based on the correctness of agents responses and your own judegment of the summary using the provided guidelines, determine if the sentence is factually consistent with the document. A summary is factually inconsistent if there is a correct argument describing an error or discrepancy in the summary. Provide your evaluation using a JSON format with keys as "label" with values 1 (consistent) or 0 (inconsistent) and "explanation" and put your response between <response></response> tags. Skip the preamble.
117
+ return prompt
118
+
119
+ def get_prompt_debate_w_doc(self, doc, claim, chat_history, mediator_feedback):
120
+ prompt = '''
121
+ You are given a claim in the <claim></claim> tags and a document as evidence in <doc></doc> tags. Your job is to analyze a given claim and decide whether the claim is supported or not with respect to the given evidence. You should also consider the provided guidelines. There are also other evaluator agents assigned the same task as you and you can also see the discussion history in <chat_history></chat_history> tags below.
122
+
123
+ <guidelines>
124
+ 1. Evaluate the claim's plausibility based on general medical knowledge.
125
+ 2. Consider the specificity and credibility of any numbers or percentages.
126
+ 3. Analyze the context and scope of the claim.
127
+ 4. Assess any potential biases or limitations.
128
+ </guidelines>
129
+
130
+ <doc>
131
+ # %s
132
+ </doc>
133
+
134
+ <claim>
135
+ # %s
136
+ </claim>
137
+
138
+ <chat_history>
139
+ # %s
140
+ </chat_history>
141
+
142
+ The <chat_history></chat_history> tag might be empty if this is the first round of evaluation. You can see your previous responses as well as other agents responses. Continue the discussion with other evaluator agents, talk to them and state why you agree/disagree with each other bringing as many arguments as you can. %s Determine if the claim is supported or not. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your arguments in <argument ></argument> XML tags. Skip the preamble.
143
+ '''%(doc, claim,chat_history,mediator_feedback)
144
+ return prompt
145
+
146
+ def get_adjudicator_prompt_w_doc(self, doc, claim, chat_history):
147
+ prompt = '''
148
+ You are given a claim in the <claim></claim> tags, a document as evidence in <doc></doc> tags and multiple judgments from evaluator agents. You go over the discussion between the agents and their arguments shown in between <chat_history></chat_history> tags. Your job is to analyze a given claim and decide whether the claim is supported or not with respect to the given evidence. You should also consider the provided guidelines.
149
+
150
+ <guidelines>
151
+ 1. Evaluate the claim's plausibility based on general medical knowledge.
152
+ 2. Consider the specificity and credibility of any numbers or percentages.
153
+ 3. Analyze the context and scope of the claim.
154
+ 4. Assess any potential biases or limitations.
155
+ </guidelines>
156
+
157
+ <doc>
158
+ # %s
159
+ </doc>
160
+
161
+ <claim>
162
+ # %s
163
+ </claim>
164
+
165
+ <chat_history>
166
+ # %s
167
+ </chat_history>
168
+
169
+ Go over the agents responses, summarize them by saying who agrees/disagrees. Then looking at the agents responses, how well they are associated with the guidelines and finally your own judgement, determine if the claim is supported or not. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your arguments in <argument ></argument> XML tags. Skip the preamble.
170
+ '''%(doc, claim,chat_history)
171
+ #Go over the agents responses, summarize them by saying who agrees/disagrees and make sure the arguments correctly used the provided guidelines. Then based on the correctness of agents responses and your own judegment of the summary using the provided guidelines, determine if the sentence is factually consistent with the document. A summary is factually inconsistent if there is a correct argument describing an error or discrepancy in the summary. Provide your evaluation using a JSON format with keys as "label" with values 1 (consistent) or 0 (inconsistent) and "explanation" and put your response between <response></response> tags. Skip the preamble.
172
+ return prompt
173
+
174
+ def get_prompt_direct_w_causal_sub_claims(self, claim):
175
+
176
+ prompt = '''
177
+ You are given a claim in the <claim></claim> tags. Your job is to first break the claim into causal sub-claims and then analyze each sub-claim and decide whether the whole claim is supported or not.
178
+ A causal sub-claim represents a causal relation between two entities from the claim. This is not simply a statement but rather an explicit causal relation in which one entity (or its change) can have an effect on the other entity.
179
+ If all sub-claims are supported, then the claim is also supported. You should also consider the provided guidelines.
180
+
181
+ <guidelines>
182
+ 1. Evaluate the claim's plausibility based on general medical knowledge.
183
+ 2. Consider the specificity and credibility of any numbers or percentages.
184
+ 3. Analyze the context and scope of the claim.
185
+ 4. Assess any potential biases or limitations.
186
+ </guidelines>
187
+
188
+ <claim>
189
+ 40mg/day dosage of folic acid and 2mg/day dosage of vitamin B12 does not affect chronic kidney disease (CKD) progression.
190
+ </claim>
191
+
192
+ Break the claim into any possible number of causal sub-claims with explicit causal relations and place them in <sub-claims></sub-claims> tags. Determine if the claim is supported or not given the document as the evidence by analyzing its causal sub-claims. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your explanations in <argument></argument> XML tags. Skip the preamble.
193
+
194
+ <sub-claims>
195
+ 40mg/day dosage of folic acid causes chronic kidney disease (CKD) progression changes.
196
+ 2mg/day dosage of vitamin B12 causes chronic kidney disease (CKD) progression changes.
197
+ </sub-claims>
198
+
199
+ <label>
200
+ 1
201
+ </label>
202
+
203
+ <argument>
204
+ Yes. There is a study that indicates that treatment with high doses of folic acid (40 mg/day) did not improve survival or reduce the incidence of vascular disease in patients with advanced chronic kidney disease or end-stage renal disease. The study found no significant effect on mortality or secondary outcomes related to CKD progression, such as myocardial infarction, stroke, amputation, time to dialysis, or thrombosis in hemodialysis patients. Therefore, the claim that these dosages do not affect CKD progression is supported by the study's findings. It is also mentioned that patients who received vitamin B12 (2 mg/day), also did not show any significant effects on outcomes.
205
+ </argument>
206
+
207
+ You are given a claim in the <claim></claim> tags. Your job is to first break the claim into causal sub-claims and then analyze each sub-claim and decide whether the whole claim is supported or not.
208
+ A causal sub-claim represents a causal relation between two entities from the claim. This is not simply a statement but rather an explicit causal relation in which one entity (or its change) can have an effect on the other entity.
209
+ If all sub-claims are supported, then the claim is also supported. You should also consider the provided guidelines.
210
+
211
+ <guidelines>
212
+ 1. Evaluate the claim's plausibility based on general medical knowledge.
213
+ 2. Consider the specificity and credibility of any numbers or percentages.
214
+ 3. Analyze the context and scope of the claim.
215
+ 4. Assess any potential biases or limitations.
216
+ </guidelines>
217
+
218
+ <claim>
219
+ # %s
220
+ </claim>
221
+
222
+ Break the claim into causal sub-claims and place them in <sub-claims></sub-claims> tags. Determine if the claim is supported or not given the document as the evidence by analyzing its causal sub-claims. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your explanations in <argument></argument> XML tags. Skip the preamble.
223
+ '''%(claim)
224
+ return prompt
225
+
226
+ def get_prompt_direct_w_doc_w_causal_sub_claims(self, doc, claim):
227
+
228
+ prompt = '''
229
+ You are given a claim in the <claim></claim> tags and a document as evidence in <doc></doc> tags. Your job is to first break the claim into causal sub-claims and then analyze each sub-claim with respect to the given evidence and decide whether the whole claim is supported or not.
230
+ A causal sub-claim represents a causal relation between two entities from the claim. This is not simply a statement but rather an explicit causal relation in which one entity (or its change) can have an effect on the other entity.
231
+ If all sub-claims are supported, then the claim is also supported. You should also consider the provided guidelines.
232
+
233
+ <guidelines>
234
+ 1. Evaluate the claim's plausibility based on general medical knowledge.
235
+ 2. Consider the specificity and credibility of any numbers or percentages.
236
+ 3. Analyze the context and scope of the claim.
237
+ 4. Assess any potential biases or limitations.
238
+ </guidelines>
239
+
240
+ <doc>
241
+ High plasma homocysteine levels are a risk factor for mortality and vascular disease in observational studies of patients with chronic kidney disease.", "Folic acid and B vitamins decrease homocysteine levels in this population but whether they lower mortality is unknown. \n", "OBJECTIVE To determine whether high doses of folic acid and B vitamins administered daily reduce mortality in patients with chronic kidney disease. \n", "DESIGN, SETTING, AND PARTICIPANTS Double-blind randomized controlled trial (2001-2006) in 36 US Department of Veterans Affairs medical centers.", "Median follow-up was 3.2 years for 2056 participants aged 21 years or older with advanced chronic kidney disease (estimated creatinine clearance < or =30 mL/min) (n = 1305) or end-stage renal disease (n = 751) and high homocysteine levels (> or = 15 micromol/L). \n", "INTERVENTION Participants received a daily capsule containing 40 mg of folic acid, 100 mg of pyridoxine hydrochloride (vitamin B6), and 2 mg of cyanocobalamin (vitamin B12) or a placebo. \n", "MAIN OUTCOME MEASURES The primary outcome was all-cause mortality.", "Secondary outcomes included myocardial infarction (MI), stroke, amputation of all or part of a lower extremity, a composite of these 3 plus all-cause mortality, time to initiation of dialysis, and time to thrombosis of arteriovenous access in hemodialysis patients. \n", "RESULTS Mean baseline homocysteine level was 24.0 micromol/L in the vitamin group and 24.2 micromol/L in the placebo group.", "It was lowered 6.3 micromol/L (25.8%%, P < .001) in the vitamin group and 0.4 micromol/L (1.7%%, P = .14) in the placebo group at 3 months, but there was no significant effect on mortality (448 vitamin group deaths vs 436 placebo group deaths) (hazard ratio [HR], 1.04, 95%% CI, 0.91-1.18).", "No significant effects were demonstrated for secondary outcomes or adverse events: there were 129 MIs in the vitamin group vs 150 for placebo (HR, 0.86, 95%% CI, 0.67-1.08), 37 strokes in the vitamin group vs 41 for placebo (HR, 0.90, 95%% CI, 0.58-1.40), and 60 amputations in the vitamin group vs 53 for placebo (HR, 1.14, 95%% CI, 0.79-1.64).", "In addition, the composite of MI, stroke, and amputations plus mortality (P = .85), time to dialysis (P = .38), and time to thrombosis in hemodialysis patients (P = .97) did not differ between the vitamin and placebo groups. \n", "CONCLUSION Treatment with high doses of folic acid and B vitamins did not improve survival or reduce the incidence of vascular disease in patients with advanced chronic kidney disease or end-stage renal disease. \n", "TRIAL REGISTRATION clinicaltrials.gov Identifier: NCT00032435."
242
+ </doc>
243
+
244
+ <claim>
245
+ 40mg/day dosage of folic acid and 2mg/day dosage of vitamin B12 does not affect chronic kidney disease (CKD) progression.
246
+ </claim>
247
+
248
+ Break the claim into any possible number of causal sub-claims with explicit causal relations and place them in <sub-claims></sub-claims> tags. Determine if the claim is supported or not given the document as the evidence by analyzing its causal sub-claims. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your explanations in <argument></argument> XML tags. Skip the preamble.
249
+
250
+ <sub-claims>
251
+ 40mg/day dosage of folic acid causes chronic kidney disease (CKD) progression changes.
252
+ 2mg/day dosage of vitamin B12 causes chronic kidney disease (CKD) progression changes.
253
+ </sub-claims>
254
+
255
+ <label>
256
+ 1
257
+ </label>
258
+
259
+ <argument>
260
+ Yes. The information provided indicates that treatment with high doses of folic acid (40 mg/day) did not improve survival or reduce the incidence of vascular disease in patients with advanced chronic kidney disease or end-stage renal disease. The study found no significant effect on mortality or secondary outcomes related to CKD progression, such as myocardial infarction, stroke, amputation, time to dialysis, or thrombosis in hemodialysis patients. Therefore, the claim that these dosages do not affect CKD progression is supported by the study's findings. It is also mentioned that patients who received vitamin B12 (2 mg/day), also did not show any significant effects on outcomes.
261
+ </argument>
262
+
263
+ You are given a claim in the <claim></claim> tags and a document as evidence in <doc></doc> tags. Your job is to first break the claim into causal sub-claims and then analyze each sub-claim with respect to the given evidence and decide whether the whole claim is supported or not.
264
+ A causal sub-claim represents a causal relation between two entities from the claim. This is not simply a statement but rather an explicit causal relation in which one entity (or its change) can have an effect on the other entity.
265
+ If all sub-claims are supported, then the claim is also supported. You should also consider the provided guidelines.
266
+
267
+ <guidelines>
268
+ 1. Evaluate the claim's plausibility based on general medical knowledge.
269
+ 2. Consider the specificity and credibility of any numbers or percentages.
270
+ 3. Analyze the context and scope of the claim.
271
+ 4. Assess any potential biases or limitations.
272
+ </guidelines>
273
+
274
+ <doc>
275
+ # %s
276
+ </doc>
277
+
278
+ <claim>
279
+ # %s
280
+ </claim>
281
+
282
+ Break the claim into causal sub-claims and place them in <sub-claims></sub-claims> tags. Determine if the claim is supported or not given the document as the evidence by analyzing its causal sub-claims. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your explanations in <argument></argument> XML tags. Skip the preamble.
283
+ '''%(doc,claim)
284
+ return prompt
285
+
286
+ def parse_output_response(self, response):
287
+ soup = BeautifulSoup(response, 'html.parser')
288
+ explanation_list = soup.find_all("explanation")
289
+ explanation_text = ""
290
+ for exp in explanation_list:
291
+ if exp.string != None:
292
+ explanation_text += exp.string + ' '
293
+ else:
294
+ explanation_text = response
295
+ explanation_text = ' '.join(explanation_text.split())
296
+ if len(soup.find_all("label")) > 0:
297
+ labels = soup.find_all("label")[-1].string.strip()
298
+ else:
299
+ labels = "Unknown"
300
+ return labels, explanation_text
301
+
302
+ def parse_output_response_w_category(self, response):
303
+ soup = BeautifulSoup(response, 'html.parser')
304
+ explanation_list = soup.find_all("explanation")
305
+ explanation_text = ""
306
+ for exp in explanation_list:
307
+ if exp.string != None:
308
+ explanation_text += exp.string + ' '
309
+ else:
310
+ explanation_text = response
311
+ explanation_text = ' '.join(explanation_text.split())
312
+
313
+ category_list = soup.find_all("category")
314
+ category_text = ""
315
+ for exp in category_list:
316
+ if exp.string != None:
317
+ category_text += exp.string + ' '
318
+ else:
319
+ category_text = ""
320
+ category_text = ' '.join(category_text.split())
321
+
322
+ if len(soup.find_all("label")) > 0:
323
+ labels = soup.find_all("label")[-1].string.strip()
324
+ else:
325
+ labels = "Unknown"
326
+
327
+ return labels, category_text, explanation_text
328
+
329
+ def parse_output_w_chat_label(self, response):
330
+ soup = BeautifulSoup(response, 'html.parser')
331
+ argument_list = soup.find_all("argument")
332
+ argument_text = ""
333
+ for argument in argument_list:
334
+ if argument.string != None:
335
+ argument_text += argument.string + ' '
336
+ else:
337
+ argument_text = response
338
+ argument_text = ' '.join(argument_text.split())
339
+ if len(soup.find_all("label")) > 0:
340
+ guidelines = soup.find_all("label")[0].string.strip()
341
+ else:
342
+ guidelines = "Unknown"
343
+
344
+ return argument_text, guidelines
345
+
346
+ def parse_output_response_w_causal_subclaims(self, response):
347
+ soup = BeautifulSoup(response, 'html.parser')
348
+ argument_list = soup.find_all("argument")
349
+ argument_text = ""
350
+ for argument in argument_list:
351
+ if argument.string != None:
352
+ argument_text += argument.string + ' '
353
+ else:
354
+ argument_text = response
355
+
356
+ argument_text = ' '.join(argument_text.split())
357
+ if len(soup.find_all("label")) > 0:
358
+ label = soup.find_all("label")[0].string.strip()
359
+ else:
360
+ label = "Unknown"
361
+
362
+ sub_claims_text = ""
363
+ if len(soup.find_all("sub-claims")) > 0:
364
+ sub_claims_list = soup.find_all("sub-claims")
365
+ for claim in sub_claims_list:
366
+ if claim.string != None:
367
+ sub_claims_text += claim.string + '\n'
368
+
369
+ return label, argument_text, sub_claims_text
370
+
371
+ """# OpenAI Prep"""
372
+
373
+ def get_client(self):
374
+ self.client = OpenAI(api_key="",
375
+ organization="")
376
+ return self.client
377
+
378
+ #client = get_client()
379
+ def parse_chatgpt_api_response(self, response):
380
+ choices = response.choices
381
+ # choices = response["choices"]
382
+ main_response_message_list = []
383
+ if len(choices) > 1:
384
+ for choice in choices:
385
+ main_response = choice.message
386
+ # main_response_message, main_response_role = main_response["content"], main_response["role"]
387
+ main_response_message, main_response_role = main_response.content, main_response.role
388
+ main_response_message_list.append(main_response_message)
389
+ return main_response_message_list, response
390
+
391
+ else:
392
+ main_response = choices[0].message
393
+ # main_response_message, main_response_role = main_response["content"], main_response["role"]
394
+ main_response_message, main_response_role = main_response.content, main_response.role
395
+ return main_response_message, response
396
+
397
+ def make_openai_api_call(self, prompt, model_name, temperature):
398
+ if 'gpt-3' in model_name or 'gpt-4' in model_name:
399
+ # openai.ChatCompletion.create
400
+ response = self.client.chat.completions.create(
401
+ model=model_name,
402
+ messages=[{'role': 'user', 'content': prompt}],
403
+ temperature=temperature,
404
+ max_tokens=4096,
405
+ top_p=1.0,
406
+ frequency_penalty=0.0,
407
+ presence_penalty=0.0,
408
+ n=1,
409
+ )
410
+ return self.parse_chatgpt_api_response(response)
411
+
412
+ def make_openai_api_call_o3_mini(self, prompt, model_name, temperature):
413
+ response = self.client.chat.completions.create(
414
+ model=model_name,
415
+ messages=[{'role': 'user', 'content': prompt}],
416
+ response_format={
417
+ "type": "text"
418
+ },
419
+ reasoning_effort="medium"
420
+ )
421
+ return self.parse_chatgpt_api_response(response)
422
+
423
+ def read_file(self, file_path):
424
+ all_data = []
425
+ with open(file_path, 'r') as input_file:
426
+ for line in input_file:
427
+ line = line.strip()
428
+ data = json.loads(line)
429
+ all_data.append(data)
430
+ return all_data
431
+
432
+ def safe_print(self, x, *args):
433
+ print(x)
434
+
435
+ def __call__(self, doc, claim, initialization=True, model_name='gpt-4o-mini',
436
+ writer=safe_print):
437
+ # number of simultaneous debates for evaluation
438
+ num_debates = 1
439
+ eval_repeat_max = 0
440
+
441
+ ## initialize a dictionary to save the outputs of each separate debate
442
+ debates_dict = dict.fromkeys([0],None)
443
+ overall_ambiguity = False
444
+ initialization = initialization
445
+
446
+ ## keep starting debates until you reach the max number of debates
447
+ while eval_repeat_max != num_debates:
448
+ ambiguous = False
449
+ results = {}
450
+ doc = doc
451
+ sent = claim
452
+
453
+ ## initial stance assignment. We use the following list of utterances as the first response of each agent and then use
454
+ ## this as the chat history to start the debate. The default value is 4. You can change the number of agents by adding
455
+ ## more utterances
456
+
457
+ if initialization:
458
+ agents_responses = ["The claim is not refuted by evidence.", "The claim is refuted by evidence.", "The claim is not refuted by evidence.", "The claim is refuted by evidence."]
459
+ else:
460
+ agents_responses = ["","","",""]
461
+
462
+ updated_responses = []
463
+
464
+ ## to keep track of previous responses of agents and provide them in each round
465
+ message_board = ['','','','']
466
+
467
+ ## initialize a label list to keep track of agents' judgements
468
+ label_list = [[1],[0],[1],[0]]
469
+ all_chats = []
470
+
471
+ ## number of rounds of debates
472
+ turns = 3
473
+
474
+ mediator_feedback = ""
475
+ ## first round of random assessment not included in the history.
476
+ round_counter = 0
477
+ if initialization:
478
+ print("ROUND %s: (This is the initialization round where agents are assigned an initial stance as their belief.)\n"%str(round_counter+1))
479
+ for n in range(len(agents_responses)):
480
+ print("Agent %s: "%str(n+1) + agents_responses[n] + "\n")
481
+ print("----------------------------------------------------")
482
+ round_counter += 1
483
+ print("ROUND %s:\n"%str(round_counter+1))
484
+ for n in range(len(agents_responses)):
485
+ chat_history = ""
486
+ chat_history_prompt = ''
487
+ chat_history_prompt += message_board[n] + "You (Agent %s): "%str(n+1) + agents_responses[n] + "\n"
488
+ chat_history += "You (Agent %s): "%str(n+1) + agents_responses[n] + "\n"
489
+ other_agents_response = ""
490
+ for nn in range(len(agents_responses)):
491
+ if nn != n:
492
+ other_agents_response += "Agent %s: "%str(nn+1) + agents_responses[nn] + "\n"
493
+ chat_history += "Agent %s: "%str(nn+1) + agents_responses[nn] + "\n"
494
+
495
+ message_board[n] += chat_history
496
+ chat_history_prompt += other_agents_response
497
+
498
+ ## For experiments without an initial stance, the following clears the chat history
499
+ if not initialization:
500
+ chat_history_prompt = ""
501
+
502
+ ## the parameters to the prompt builder include the document, the claim sentence, the previous chat history and mediator feedback
503
+ ## that you can use to modify the goals of agents
504
+ if doc != "":
505
+ prompt = self.get_prompt_debate_w_doc(doc, sent, chat_history_prompt, mediator_feedback)
506
+ else:
507
+ prompt = self.get_prompt_debate(sent, chat_history_prompt, mediator_feedback)
508
+ argument = ""
509
+ rep_ctr = 0
510
+ label = -1
511
+ label_val = -1
512
+
513
+ ## to make sure we have enough initial diversity in responses, we repeat the following such that if the immediate
514
+ ## response is different from the assigned stance, the agent is asked to repeat its generation. The rep_ctr is used
515
+ ## to repeat 2 times before moving on to the next stage
516
+ while label!="Unknown" and label_val != label_list[n][0] and rep_ctr != 1:
517
+ llm_response, _ = self.make_openai_api_call(prompt, model_name, 1)
518
+ argument, label = self.parse_output_w_chat_label(llm_response)
519
+ strlabel = "Support" if str(label).strip() == "1" else "Refute"
520
+ writer("Agent %s's Assessment:\n"%str(n+1) + '%s. \n'%strlabel, 'Explanation: %s'%argument + "\n")
521
+ print("***************")
522
+ rep_ctr += 1
523
+
524
+ ## the generated label might not be in the correct format, so the following normalizes it
525
+ if label != "Unknown":
526
+ if len(label.split()) != 0 and ',' not in label.split()[0]:
527
+ label_val = float(label.split()[0])
528
+ elif len(label.split()) == 0 or ',' in label.split()[0]:
529
+ if len(label.split(',')) != 0:
530
+ label_val = float(label.split(',')[0])
531
+ else:
532
+ label_val = float(label)
533
+
534
+ if label_val >= 0.5:
535
+ label_val = 1
536
+ else:
537
+ label_val = 0
538
+
539
+ if label != "Unknown":
540
+ if len(label.split()) != 0 and ',' not in label.split()[0]:
541
+ label_val = float(label.split()[0])
542
+ elif len(label.split()) == 0 or ',' in label.split()[0]:
543
+ if len(label.split(',')) != 0:
544
+ label_val = float(label.split(',')[0])
545
+ else:
546
+ label_val = float(label)
547
+
548
+ if label_val >= 0.5:
549
+ label_list[n].append(1)
550
+ else:
551
+ label_list[n].append(0)
552
+ else:
553
+ label_list[n].append(label_list[n][-1])
554
+ argument = argument.strip()
555
+
556
+ updated_responses.append(argument)
557
+ agents_responses = copy.deepcopy(updated_responses)
558
+
559
+ ## Once the first round is generated, we start the debate among agents
560
+ message_board = ['','','','']
561
+ for ag, ag_resp in enumerate(agents_responses):
562
+ all_chats.append("Agent %s:\n"%str(ag+1) + ag_resp)
563
+
564
+ mediator_feedback = ""
565
+
566
+ ## The debate is continued for "turns" time.
567
+ for cnt in range(turns):
568
+ if len(set([lbl_list[-1] for lbl_list in label_list])) == 1:
569
+ break
570
+ print("----------------------------------------------------")
571
+ round_counter += 1
572
+ print("ROUND %s:\n"%str(round_counter+1))
573
+ updated_responses = []
574
+ for n in range(len(agents_responses)):
575
+ chat_history = ""
576
+ chat_history_prompt = ''
577
+ chat_history_prompt += message_board[n] + "You (Agent %s): "%str(n+1) + agents_responses[n] + "\n"
578
+ chat_history += "You (Agent %s): "%str(n+1) + agents_responses[n] + "\n"
579
+ other_agents_response = ""
580
+ for nn in range(len(agents_responses)):
581
+ if nn != n:
582
+ other_agents_response += "Agent %s: "%str(nn+1) + agents_responses[nn] + "\n"
583
+ chat_history += "Agent %s: "%str(nn+1) + agents_responses[nn] + "\n"
584
+
585
+ message_board[n] += chat_history
586
+ chat_history_prompt += other_agents_response
587
+
588
+ ## to shuffle the order of chat history to remove any biases caused by order of chats
589
+ new_chat_history_list = []
590
+ chat_history_prompt_list = chat_history_prompt.split('\n')
591
+ chat_history_prompt_list = [chat_hist for chat_hist in chat_history_prompt_list if chat_hist != ""]
592
+ for pq in range(0,len(chat_history_prompt_list),len(agents_responses)):
593
+ shuffled_list = chat_history_prompt_list[pq:pq+len(agents_responses)]
594
+ random.shuffle(shuffled_list)
595
+ new_chat_history_list += shuffled_list
596
+ chat_history_prompt = '\n'.join(new_chat_history_list)
597
+
598
+ ## you can add any type of feedback here and add them to prompt to improve the debate consensus
599
+ ## we do it after the first round
600
+ # if cnt >= 1:
601
+ # mediator_feedback = " Look back at the guidelines and how you have used them. Make sure all guidelines (and not only a subset of them) are satisfied in your assessment. Change your stance if you have made an error or if the other agents are more convincing."
602
+ mediator_feedback = ""
603
+
604
+ if doc != "":
605
+ prompt = self.get_prompt_debate_w_doc(doc, sent, chat_history_prompt, mediator_feedback)
606
+ else:
607
+ prompt = self.get_prompt_debate(sent, chat_history_prompt, mediator_feedback)
608
+ llm_response, _ = self.make_openai_api_call(prompt, model_name, 1)
609
+ # print(llm_response)
610
+ # print("***************")
611
+ argument, label = self.parse_output_w_chat_label(llm_response)
612
+ strlabel = "Support" if str(label).strip() == "1" else "Refute"
613
+ writer("Agent %s's Assessment: \n"%str(n+1) + '%s. \n'%strlabel, 'Explanation: %s'%argument + "\n")
614
+ print("***************")
615
+ if label != "Unknown":
616
+ if len(label.split()) != 0 and ',' not in label.split()[0]:
617
+ label_val = float(label.split()[0])
618
+ elif len(label.split()) == 0 or ',' in label.split()[0]:
619
+ if len(label.split(',')) != 0:
620
+ label_val = float(label.split(',')[0])
621
+ else:
622
+ label_val = float(label)
623
+
624
+ if label_val >= 0.5:
625
+ label_list[n].append(1)
626
+ else:
627
+ label_list[n].append(0)
628
+ else:
629
+ label_list[n].append(label_list[n][-1])
630
+ argument = argument.strip()
631
+
632
+ updated_responses.append(argument)
633
+ all_chats.append('Agent %s:\n'%str(n+1) + argument)
634
+ agents_responses = copy.deepcopy(updated_responses)
635
+ if len(set([lbl_list[-1] for lbl_list in label_list])) == 1:
636
+ break
637
+
638
+ #print(label_list)
639
+ label_list_text = [["Supported" if item == 1 else "Refuted" for item in lbl] for lbl in label_list]
640
+ print('----------------------------------------------------')
641
+ for lbl in range(len(label_list_text)):
642
+ print("Agent %s trajectory:\n%s\n"%(str(lbl+1), label_list_text[lbl]))
643
+
644
+
645
+ pn_list = [lbl[-1] for lbl in label_list]
646
+ debate_arguments = copy.deepcopy(all_chats[-len(agents_responses):])
647
+
648
+ ## we record the outputs of the debate in a dictionary that was previously initialized.
649
+ ## the "change" key keeps track of the number of agents who change their stance during the debate.
650
+ ## this can be used to identify the ambiguous cases directly.
651
+ if pn_list.count(0) == pn_list.count(1):
652
+ debates_dict[eval_repeat_max] = {'change': 0, 'label': -1,'arguments': debate_arguments,'labels': label_list}
653
+
654
+ all_chats_dict = {}
655
+ for n_agents in range(len(debate_arguments)):
656
+ all_chats_dict['Agent %s:'%str(n_agents+1)] = ""
657
+
658
+ for cht_counter, cht in enumerate(debate_arguments):
659
+ all_chats_dict['Agent %s:'%str(cht_counter+1)] += ' '.join(cht.split('\n')[1:]) + ' '
660
+
661
+ ## if there is not a winner label, we use adjudicators to decide on the final label.
662
+ ## you can use multiple adjudicators if you want to do majority voting among them.
663
+ adjudicator_input = [str(item) + ' ' + all_chats_dict[item] for item in all_chats_dict]
664
+ if doc != "":
665
+ adjudicator_prompt = self.get_adjudicator_prompt_w_doc(doc, sent, '\n'.join(adjudicator_input))
666
+ else:
667
+ adjudicator_prompt = self.get_adjudicator_prompt(sent, '\n'.join(adjudicator_input))
668
+ rep_counter = 0
669
+ adjudicator_label_list = []
670
+ label = ""
671
+ explanation_list = []
672
+ for i in range(1):
673
+ while label == "" and rep_counter != 5:
674
+ adjudicator_response, _ = self.make_openai_api_call(adjudicator_prompt, model_name, 1.0)
675
+ label , explanation = self.parse_output_response(adjudicator_response)
676
+ explanation_list.append(explanation)
677
+ writer(label, explanation)
678
+ print('********')
679
+ if label != "Unknown":
680
+ if len(label.split()) != 0 and ',' not in label.split()[0]:
681
+ label_val = float(label.split()[0])
682
+ elif len(label.split()) == 0 or ',' in label.split()[0]:
683
+ if len(label.split(',')) != 0:
684
+ label_val = float(label.split(',')[0])
685
+ else:
686
+ label_val = float(label)
687
+ if label_val >= 0.5:
688
+ label = 1
689
+ else:
690
+ label = 0
691
+ else:
692
+ label = -1
693
+ rep_counter += 1
694
+ adjudicator_label_list.append(label)
695
+ label = ""
696
+
697
+ if adjudicator_label_list.count(1) >= adjudicator_label_list.count(0):
698
+ label = 1
699
+ else:
700
+ label = 0
701
+ debates_dict[eval_repeat_max]['label'] = label
702
+
703
+ ## if there is a winner label, we return the winner as the final label of the claim
704
+ elif pn_list.count(0) != pn_list.count(1):
705
+ if pn_list.count(1) >= pn_list.count(0):
706
+ label = 1
707
+ else:
708
+ label = 0
709
+
710
+ if len(set(pn_list)) == 1:
711
+ change = len(agents_responses)//2
712
+ else:
713
+ change = len(agents_responses)//2 - 1
714
+ debates_dict[eval_repeat_max] = {'change': change, 'label': label,'arguments': debate_arguments,'labels': label_list}
715
+ explanation_list = debate_arguments
716
+
717
+ eval_repeat_max += 1
718
+
719
+ all_label_lists = [debates_dict[item]['labels'] for item in debates_dict]
720
+
721
+ ## majority vote out of debate rounds. There is a winner for each debate and then the final winner is the one with the most votes
722
+ debates_majority_vote_list = [debates_dict[item]['label'] for item in debates_dict]
723
+ print(debates_majority_vote_list)
724
+ if debates_majority_vote_list.count(1) == num_debates or debates_majority_vote_list.count(0) == num_debates:
725
+ debate_ambiguity = False
726
+ else:
727
+ debate_ambiguity = True
728
+
729
+ if debates_majority_vote_list.count(1)> debates_majority_vote_list.count(0):
730
+ debates_majority_vote = 1
731
+ elif debates_majority_vote_list.count(1) < debates_majority_vote_list.count(0):
732
+ debates_majority_vote = 0
733
+ print(debates_majority_vote)
734
+
735
+ changes_in_debates_list = [debates_dict[item]['change'] for item in debates_dict]
736
+ if changes_in_debates_list.count(0) == num_debates:
737
+ ambiguous = "Full"
738
+ elif changes_in_debates_list.count(0) == 0:
739
+ ambiguous = "None"
740
+ else:
741
+ ambiguous = "Partial"
742
+
743
+ # if changes_in_debates_list.count(0) != num_debates:
744
+ overall_majority_list = []
745
+ for label_list in all_label_lists:
746
+ change = 0
747
+ pn_list = []
748
+ for lbl in label_list:
749
+ if lbl[0] != lbl[-1]:
750
+ change += 1
751
+ pn_list.append(lbl[-1])
752
+ overall_majority_list += pn_list
753
+
754
+ ## majority vote over all individual agents regardless of which debate they belong to
755
+ if overall_majority_list.count(1)> overall_majority_list.count(0):
756
+ overall_majority_vote = 1
757
+ elif overall_majority_list.count(1) < overall_majority_list.count(0):
758
+ overall_majority_vote = 0
759
+ else:
760
+ overall_ambiguity = True
761
+
762
+ ## if there is a winner among the agents responses, we report the majority vote
763
+ if changes_in_debates_list.count(0) != num_debates and overall_ambiguity == False:
764
+ label = overall_majority_vote
765
+ explanation_list = [debates_dict[item]['arguments'] for item in debates_dict]
766
+ adjudicator_list = []
767
+ all_arguments = [debates_dict[item]['arguments'] for item in debates_dict]
768
+
769
+ ## if there is NOT a winner among agents responses, we use adjudicators to make the final call
770
+ elif changes_in_debates_list.count(0) == num_debates or overall_ambiguity == True:
771
+ all_arguments = [debates_dict[item]['arguments'] for item in debates_dict]
772
+ all_arguments = [x for xs in all_arguments for x in xs]
773
+ all_chats_dict = {}
774
+ for n_agents in range(len(all_arguments)):
775
+ all_chats_dict['Agent %s:'%str(n_agents+1)] = ""
776
+
777
+ for cht_counter, cht in enumerate(all_arguments):
778
+ all_chats_dict['Agent %s:'%str(cht_counter+1)] += ' '.join(cht.split('\n')[1:]) + ' '
779
+
780
+ adjudicator_input = [str(item) + ' ' + all_chats_dict[item] for item in all_chats_dict]
781
+
782
+ label_list = []
783
+ label = ""
784
+ explanation_list = []
785
+ for rep in range(3):
786
+ random.shuffle(adjudicator_input)
787
+ if doc != "":
788
+ adjudicator_prompt = self.get_adjudicator_prompt_w_doc(doc, sent, '\n'.join(adjudicator_input))
789
+ else:
790
+ adjudicator_prompt = self.get_adjudicator_prompt(sent, '\n'.join(adjudicator_input))
791
+ rep_counter = 0
792
+ while label == "" and rep_counter != 5:
793
+ adjudicator_response, _ = self.make_openai_api_call(adjudicator_prompt, model_name, 1.0)
794
+ label , explanation = self.parse_output_response(adjudicator_response)
795
+ explanation_list.append(explanation)
796
+ writer(label, explanation)
797
+ print('********')
798
+ if label != "Unknown":
799
+ if len(label.split()) != 0 and ',' not in label.split()[0]:
800
+ label_val = float(label.split()[0])
801
+ elif len(label.split()) == 0 or ',' in label.split()[0]:
802
+ if len(label.split(',')) != 0:
803
+ label_val = float(label.split(',')[0])
804
+ else:
805
+ label_val = float(label)
806
+ if label_val >= 0.5:
807
+ label = 1
808
+ else:
809
+ label = 0
810
+ else:
811
+ label = -1
812
+ rep_counter += 1
813
+ label_list.append(label)
814
+ label = ""
815
+
816
+ print(label_list)
817
+ results['adjudicators'] = label_list
818
+ results['adjudicators_agree'] = len(set(label_list)) == 1
819
+ if label_list.count(1) >= label_list.count(0):
820
+ label = 1
821
+ else:
822
+ label = 0
823
+
824
+ overall_majority_vote = label
825
+ adjudicator_list = label_list
826
+
827
+ label_text = ["contradict" if debates_majority_vote == 0 else "support"]
828
+ return label_text[0]
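For reference, a minimal way to exercise the module on its own, outside the Streamlit app. The claim text and the print-based writer are illustrative, and a real OpenAI API key has to be supplied to the client:

from openai import OpenAI
from multi_agent_debate import MultiAgentDebate

def show_turn(message, explanation):
    # The debate calls the writer with each agent's verdict plus its argument.
    print(message)
    if explanation:
        print(explanation)

client = OpenAI(api_key="...")  # placeholder; supply a real key
debate = MultiAgentDebate(client=client)
verdict = debate(doc="",  # no evidence document: the claim-only prompts are used
                 claim="High-dose folic acid slows chronic kidney disease progression.",
                 writer=show_turn)
print("Final verdict:", verdict)  # "support" or "contradict"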