# -*- coding: utf-8 -*-
"""Untitled37.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1FbaYZ7tAm87yWo_lf87t2ynNPMR5olB-

# Prep
"""

## load helper functions
import json
import copy
import random
from openai import OpenAI
## parsing functions
from bs4 import BeautifulSoup

class MultiAgentDebate:
    def __init__(self, client=None):
        if client is not None:
            self.client = client
        else:
            self.client = self.get_client()

    def get_prompt_direct_eval(self, claim):

        prompt = '''
        You are given a claim in the <claim></claim> tags. Your job is to analyze a given claim and decide whether the claim is supported or not. You should also consider the provided guidelines.

        <guidelines>
        1. Evaluate the claim's plausibility based on general medical knowledge.
        2. Consider the specificity and credibility of any numbers or percentages.
        3. Analyze the context and scope of the claim.
        4. Assess any potential biases or limitations.
        </guidelines>

        <claim>
        %s
        </claim>

        Determine if the claim is supported or not. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your explanations in <explanation></explanation> XML tags. Skip the preamble.
        '''%(claim)
        return prompt

    def get_prompt_direct_eval_w_doc(self, doc, claim):

        prompt = '''
        You are given a claim in the <claim></claim> tags and a document as evidence in <doc></doc> tags. Your job is to analyze a given claim with respect to the given evidence and decide whether the claim is supported or not. You should also consider the provided guidelines.

        <guidelines>
        1. Evaluate the claim's plausibility based on general medical knowledge.
        2. Consider the specificity and credibility of any numbers or percentages.
        3. Analyze the context and scope of the claim.
        4. Assess any potential biases or limitations.
        </guidelines>

        <doc>
        %s
        </doc>

        <claim>
        %s
        </claim>

        Determine if the claim is supported or not given the document as the evidence. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your explanations in <explanation></explanation> XML tags. Skip the preamble.
        '''%(doc,claim)
        return prompt

    def get_prompt_debate(self, claim, chat_history, mediator_feedback):
        prompt = '''
        You are given a claim in the <claim></claim> tags. Your job is to analyze a given claim and decide whether the claim is supported or not. You should also consider the provided guidelines. There are also other evaluator agents assigned the same task as you and you can also see the discussion history in <chat_history></chat_history> tags below.

        <guidelines>
        1. Evaluate the claim's plausibility based on general medical knowledge.
        2. Consider the specificity and credibility of any numbers or percentages.
        3. Analyze the context and scope of the claim.
        4. Assess any potential biases or limitations.
        </guidelines>

        <claim>
        %s
        </claim>

        <chat_history>
        %s
        </chat_history>

        The <chat_history></chat_history> tag might be empty if this is the first round of evaluation. You can see your previous responses as well as the other agents' responses. Continue the discussion with the other evaluator agents: talk to them and state why you agree/disagree with each other, bringing as many arguments as you can. %s Determine if the claim is supported or not. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your arguments in <argument></argument> XML tags. Skip the preamble.
        '''%(claim,chat_history,mediator_feedback)
        return prompt

    def get_adjudicator_prompt(self, claim, chat_history):
        prompt = '''
        You are given a claim in the <claim></claim> tags and multiple judgments from evaluator agents. You go over the discussion between the agents and their arguments shown in between <chat_history></chat_history> tags. Your job is to analyze a given claim and decide whether the claim is supported or not. You should also consider the provided guidelines.

        <guidelines>
        1. Evaluate the claim's plausibility based on general medical knowledge.
        2. Consider the specificity and credibility of any numbers or percentages.
        3. Analyze the context and scope of the claim.
        4. Assess any potential biases or limitations.
        </guidelines>

        <claim>
        %s
        </claim>

        <chat_history>
        %s
        </chat_history>

        Go over the agents' responses and summarize them by saying who agrees/disagrees. Then, considering the agents' responses, how well they align with the guidelines, and finally your own judgement, determine if the claim is supported or not. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your arguments in <argument></argument> XML tags. Skip the preamble.
        '''%(claim,chat_history)
        #Go over the agents responses, summarize them by saying who agrees/disagrees and make sure the arguments correctly used the provided guidelines. Then based on the correctness of agents responses and your own judegment of the summary using the provided guidelines, determine if the sentence is factually consistent with the document. A summary is factually inconsistent if there is a correct argument describing an error or discrepancy in the summary. Provide your evaluation using a JSON format with keys as "label" with values 1 (consistent) or 0 (inconsistent) and "explanation" and put your response between <response></response> tags. Skip the preamble.
        return prompt

    def get_prompt_debate_w_doc(self, doc, claim, chat_history, mediator_feedback):
        prompt = '''
        You are given a claim in the <claim></claim> tags and a document as evidence in <doc></doc> tags. Your job is to analyze a given claim and decide whether the claim is supported or not with respect to the given evidence. You should also consider the provided guidelines. There are also other evaluator agents assigned the same task as you and you can also see the discussion history in <chat_history></chat_history> tags below.

        <guidelines>
        1. Evaluate the claim's plausibility based on general medical knowledge.
        2. Consider the specificity and credibility of any numbers or percentages.
        3. Analyze the context and scope of the claim.
        4. Assess any potential biases or limitations.
        </guidelines>

        <doc>
        %s
        </doc>

        <claim>
        %s
        </claim>

        <chat_history>
        %s
        </chat_history>

        The <chat_history></chat_history> tag might be empty if this is the first round of evaluation. You can see your previous responses as well as the other agents' responses. Continue the discussion with the other evaluator agents: talk to them and state why you agree/disagree with each other, bringing as many arguments as you can. %s Determine if the claim is supported or not. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your arguments in <argument></argument> XML tags. Skip the preamble.
        '''%(doc, claim,chat_history,mediator_feedback)
        return prompt

    def get_adjudicator_prompt_w_doc(self, doc, claim, chat_history):
        prompt = '''
        You are given a claim in the <claim></claim> tags, a document as evidence in <doc></doc> tags and multiple judgments from evaluator agents. You go over the discussion between the agents and their arguments shown in between <chat_history></chat_history> tags. Your job is to analyze a given claim and decide whether the claim is supported or not with respect to the given evidence. You should also consider the provided guidelines.

        <guidelines>
        1. Evaluate the claim's plausibility based on general medical knowledge.
        2. Consider the specificity and credibility of any numbers or percentages.
        3. Analyze the context and scope of the claim.
        4. Assess any potential biases or limitations.
        </guidelines>

        <doc>
        %s
        </doc>

        <claim>
        %s
        </claim>

        <chat_history>
        %s
        </chat_history>

        Go over the agents' responses and summarize them by saying who agrees/disagrees. Then, considering the agents' responses, how well they align with the guidelines, and finally your own judgement, determine if the claim is supported or not. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your arguments in <argument></argument> XML tags. Skip the preamble.
        '''%(doc, claim,chat_history)
        #Go over the agents responses, summarize them by saying who agrees/disagrees and make sure the arguments correctly used the provided guidelines. Then based on the correctness of agents responses and your own judegment of the summary using the provided guidelines, determine if the sentence is factually consistent with the document. A summary is factually inconsistent if there is a correct argument describing an error or discrepancy in the summary. Provide your evaluation using a JSON format with keys as "label" with values 1 (consistent) or 0 (inconsistent) and "explanation" and put your response between <response></response> tags. Skip the preamble.
        return prompt

    def get_prompt_direct_w_causal_sub_claims(self, claim):

        prompt = '''
        You are given a claim in the <claim></claim> tags. Your job is to first break the claim into causal sub-claims and then analyze each sub-claim and decide whether the whole claim is supported or not.
        A causal sub-claim represents a causal relation between two entities from the claim. This is not simply a statement but rather an explicit causal relation in which one entity (or its change) can have an effect on the other entity.
        If all sub-claims are supported, then the claim is also supported. You should also consider the provided guidelines.

        <guidelines>
        1. Evaluate the claim's plausibility based on general medical knowledge.
        2. Consider the specificity and credibility of any numbers or percentages.
        3. Analyze the context and scope of the claim.
        4. Assess any potential biases or limitations.
        </guidelines>

        <claim>
        40mg/day dosage of folic acid and 2mg/day dosage of vitamin B12 does not affect chronic kidney disease (CKD) progression.
        </claim>

        Break the claim into any possible number of causal sub-claims with explicit causal relations and place them in <sub-claims></sub-claims> tags. Determine if the claim is supported or not by analyzing its causal sub-claims. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your explanations in <argument></argument> XML tags. Skip the preamble.

        <sub-claims>
        40mg/day dosage of folic acid causes chronic kidney disease (CKD) progression changes.
        2mg/day dosage of vitamin B12 causes chronic kidney disease (CKD) progression changes.
        </sub-claims>

        <label>
        1
        </label>

        <argument>
        Yes. There is a study that indicates that treatment with high doses of folic acid (40 mg/day) did not improve survival or reduce the incidence of vascular disease in patients with advanced chronic kidney disease or end-stage renal disease. The study found no significant effect on mortality or secondary outcomes related to CKD progression, such as myocardial infarction, stroke, amputation, time to dialysis, or thrombosis in hemodialysis patients. Therefore, the claim that these dosages do not affect CKD progression is supported by the study's findings. It is also mentioned that patients who received vitamin B12 (2 mg/day), also did not show any significant effects on outcomes.
        </argument>

        You are given a claim in the <claim></claim> tags. Your job is to first break the claim into causal sub-claims and then analyze each sub-claim and decide whether the whole claim is supported or not.
        A causal sub-claim represents a causal relation between two entities from the claim. This is not simply a statement but rather an explicit causal relation in which one entity (or its change) can have an effect on the other entity.
        If all sub-claims are supported, then the claim is also supported. You should also consider the provided guidelines.

        <guidelines>
        1. Evaluate the claim's plausibility based on general medical knowledge.
        2. Consider the specificity and credibility of any numbers or percentages.
        3. Analyze the context and scope of the claim.
        4. Assess any potential biases or limitations.
        </guidelines>

        <claim>
        %s
        </claim>

        Break the claim into causal sub-claims and place them in <sub-claims></sub-claims> tags. Determine if the claim is supported or not by analyzing its causal sub-claims. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your explanations in <argument></argument> XML tags. Skip the preamble.
        '''%(claim)
        return prompt

    def get_prompt_direct_w_doc_w_causal_sub_claims(self, doc, claim):

        prompt = '''
        You are given a claim in the <claim></claim> tags and a document as evidence in <doc></doc> tags. Your job is to first break the claim into causal sub-claims and then analyze each sub-claim with respect to the given evidence and decide whether the whole claim is supported or not.
        A causal sub-claim represents a causal relation between two entities from the claim. This is not simply a statement but rather an explicit causal relation in which one entity (or its change) can have an effect on the other entity.
        If all sub-claims are supported, then the claim is also supported. You should also consider the provided guidelines.

        <guidelines>
        1. Evaluate the claim's plausibility based on general medical knowledge.
        2. Consider the specificity and credibility of any numbers or percentages.
        3. Analyze the context and scope of the claim.
        4. Assess any potential biases or limitations.
        </guidelines>

        <doc>
        High plasma homocysteine levels are a risk factor for mortality and vascular disease in observational studies of patients with chronic kidney disease.", "Folic acid and B vitamins decrease homocysteine levels in this population but whether they lower mortality is unknown. \n", "OBJECTIVE To determine whether high doses of folic acid and B vitamins administered daily reduce mortality in patients with chronic kidney disease. \n", "DESIGN, SETTING, AND PARTICIPANTS Double-blind randomized controlled trial (2001-2006) in 36 US Department of Veterans Affairs medical centers.", "Median follow-up was 3.2 years for 2056 participants aged 21 years or older with advanced chronic kidney disease (estimated creatinine clearance < or =30 mL/min) (n = 1305) or end-stage renal disease (n = 751) and high homocysteine levels (> or = 15 micromol/L). \n", "INTERVENTION Participants received a daily capsule containing 40 mg of folic acid, 100 mg of pyridoxine hydrochloride (vitamin B6), and 2 mg of cyanocobalamin (vitamin B12) or a placebo. \n", "MAIN OUTCOME MEASURES The primary outcome was all-cause mortality.", "Secondary outcomes included myocardial infarction (MI), stroke, amputation of all or part of a lower extremity, a composite of these 3 plus all-cause mortality, time to initiation of dialysis, and time to thrombosis of arteriovenous access in hemodialysis patients. \n", "RESULTS Mean baseline homocysteine level was 24.0 micromol/L in the vitamin group and 24.2 micromol/L in the placebo group.", "It was lowered 6.3 micromol/L (25.8%%, P < .001) in the vitamin group and 0.4 micromol/L (1.7%%, P = .14) in the placebo group at 3 months, but there was no significant effect on mortality (448 vitamin group deaths vs 436 placebo group deaths) (hazard ratio [HR], 1.04, 95%% CI, 0.91-1.18).", "No significant effects were demonstrated for secondary outcomes or adverse events: there were 129 MIs in the vitamin group vs 150 for placebo (HR, 0.86, 95%% CI, 0.67-1.08), 37 strokes in the vitamin group vs 41 for placebo (HR, 0.90, 95%% CI, 0.58-1.40), and 60 amputations in the vitamin group vs 53 for placebo (HR, 1.14, 95%% CI, 0.79-1.64).", "In addition, the composite of MI, stroke, and amputations plus mortality (P = .85), time to dialysis (P = .38), and time to thrombosis in hemodialysis patients (P = .97) did not differ between the vitamin and placebo groups. \n", "CONCLUSION Treatment with high doses of folic acid and B vitamins did not improve survival or reduce the incidence of vascular disease in patients with advanced chronic kidney disease or end-stage renal disease. \n", "TRIAL REGISTRATION clinicaltrials.gov Identifier: NCT00032435."
        </doc>

        <claim>
        40mg/day dosage of folic acid and 2mg/day dosage of vitamin B12 does not affect chronic kidney disease (CKD) progression.
        </claim>

        Break the claim into any possible number of causal sub-claims with explicit causal relations and place them in <sub-claims></sub-claims> tags. Determine if the claim is supported or not given the document as the evidence by analyzing its causal sub-claims. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your explanations in <argument></argument> XML tags. Skip the preamble.

        <sub-claims>
        40mg/day dosage of folic acid causes chronic kidney disease (CKD) progression changes.
        2mg/day dosage of vitamin B12 causes chronic kidney disease (CKD) progression changes.
        </sub-claims>

        <label>
        1
        </label>

        <argument>
        Yes. The information provided indicates that treatment with high doses of folic acid (40 mg/day) did not improve survival or reduce the incidence of vascular disease in patients with advanced chronic kidney disease or end-stage renal disease. The study found no significant effect on mortality or secondary outcomes related to CKD progression, such as myocardial infarction, stroke, amputation, time to dialysis, or thrombosis in hemodialysis patients. Therefore, the claim that these dosages do not affect CKD progression is supported by the study's findings. It is also mentioned that patients who received vitamin B12 (2 mg/day), also did not show any significant effects on outcomes.
        </argument>

        You are given a claim in the <claim></claim> tags and a document as evidence in <doc></doc> tags. Your job is to first break the claim into causal sub-claims and then analyze each sub-claim with respect to the given evidence and decide whether the whole claim is supported or not.
        A causal sub-claim represents a causal relation between two entities from the claim. This is not simply a statement but rather an explicit causal relation in which one entity (or its change) can have an effect on the other entity.
        If all sub-claims are supported, then the claim is also supported. You should also consider the provided guidelines.

        <guidelines>
        1. Evaluate the claim's plausibility based on general medical knowledge.
        2. Consider the specificity and credibility of any numbers or percentages.
        3. Analyze the context and scope of the claim.
        4. Assess any potential biases or limitations.
        </guidelines>

        <doc>
        %s
        </doc>

        <claim>
        %s
        </claim>

        Break the claim into causal sub-claims and place them in <sub-claims></sub-claims> tags. Determine if the claim is supported or not given the document as the evidence by analyzing its causal sub-claims. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your explanations in <argument></argument> XML tags. Skip the preamble.
        '''%(doc,claim)
        return prompt

    def parse_output_response(self, response):
        soup = BeautifulSoup(response, 'html.parser')
        explanation_list = soup.find_all("explanation")
        explanation_text = ""
        for exp in explanation_list:
            if exp.string is not None:
                explanation_text += exp.string + ' '
            else:
                explanation_text = response
        explanation_text = ' '.join(explanation_text.split())
        if len(soup.find_all("label")) > 0:
            labels = soup.find_all("label")[-1].string.strip()
        else:
            labels = "Unknown"
        return labels, explanation_text
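
    ## For illustration, a model response such as
    ##   "<label>1</label><explanation>Consistent with the trial.</explanation>"
    ## parses to ("1", "Consistent with the trial.").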

    def parse_output_response_w_category(self, response):
        soup = BeautifulSoup(response, 'html.parser')
        explanation_list = soup.find_all("explanation")
        explanation_text = ""
        for exp in explanation_list:
            if exp.string is not None:
                explanation_text += exp.string + ' '
            else:
                explanation_text = response
        explanation_text = ' '.join(explanation_text.split())

        category_list = soup.find_all("category")
        category_text = ""
        for exp in category_list:
            if exp.string is not None:
                category_text += exp.string + ' '
            else:
                category_text = ""
        category_text = ' '.join(category_text.split())

        if len(soup.find_all("label")) > 0:
            labels = soup.find_all("label")[-1].string.strip()
        else:
            labels = "Unknown"

        return labels, category_text, explanation_text

    def parse_output_w_chat_label(self, response):
        soup = BeautifulSoup(response, 'html.parser')
        argument_list = soup.find_all("argument")
        argument_text = ""
        for argument in argument_list:
            if argument.string is not None:
                argument_text += argument.string + ' '
            else:
                argument_text = response
        argument_text = ' '.join(argument_text.split())
        if len(soup.find_all("label")) > 0:
            guidelines = soup.find_all("label")[0].string.strip()
        else:
            guidelines = "Unknown"

        return argument_text, guidelines

    def parse_output_response_w_causal_subclaims(self, response):
        soup = BeautifulSoup(response, 'html.parser')
        argument_list = soup.find_all("argument")
        argument_text = ""
        for argument in argument_list:
            if argument.string is not None:
                argument_text += argument.string + ' '
            else:
                argument_text = response

        argument_text = ' '.join(argument_text.split())
        if len(soup.find_all("label")) > 0:
            label = soup.find_all("label")[0].string.strip()
        else:
            label = "Unknown"

        sub_claims_text = ""
        if len(soup.find_all("sub-claims")) > 0:
            sub_claims_list = soup.find_all("sub-claims")
            for claim in sub_claims_list:
                if claim.string is not None:
                    sub_claims_text += claim.string + '\n'

        return label, argument_text, sub_claims_text
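
    def normalize_label(self, label):
        ## Shared helper, factored out of the label-parsing logic that was
        ## repeated inline in __call__: converts the raw <label> string
        ## returned by the model ("1", "0", "1, ...", etc.) into a 0/1 value.
        ## Returns None when the label is "Unknown" or cannot be parsed.
        if label == "Unknown":
            return None
        try:
            if len(label.split()) != 0 and ',' not in label.split()[0]:
                label_val = float(label.split()[0])
            else:
                label_val = float(label.split(',')[0])
        except (ValueError, IndexError):
            return None
        return 1 if label_val >= 0.5 else 0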

    """# OpenAI Prep"""

    def get_client(self):
        ## expects credentials via the OPENAI_API_KEY environment variable;
        ## alternatively, pass api_key="..." and organization="..." explicitly
        self.client = OpenAI()
        return self.client

    def parse_chatgpt_api_response(self, response):
        choices = response.choices
        main_response_message_list = []
        if len(choices) > 1:
            for choice in choices:
                main_response_message_list.append(choice.message.content)
            return main_response_message_list, response
        else:
            return choices[0].message.content, response

    def make_openai_api_call(self, prompt, model_name, temperature):
        if 'gpt-3' in model_name or 'gpt-4' in model_name:
            response = self.client.chat.completions.create(
                model=model_name,
                messages=[{'role': 'user', 'content': prompt}],
                temperature=temperature,
                max_tokens=4096,
                top_p=1.0,
                frequency_penalty=0.0,
                presence_penalty=0.0,
                n=1,
            )
            return self.parse_chatgpt_api_response(response)
        raise ValueError("Unsupported model name: %s" % model_name)

    def make_openai_api_call_o3_mini(self, prompt, model_name, temperature):
        ## reasoning models do not accept a temperature parameter, so it is ignored here
        response = self.client.chat.completions.create(
            model=model_name,
            messages=[{'role': 'user', 'content': prompt}],
            response_format={"type": "text"},
            reasoning_effort="medium",
        )
        return self.parse_chatgpt_api_response(response)

    def read_file(self, file_path):
        ## reads a JSONL file: one JSON object per line; blank lines are skipped
        all_data = []
        with open(file_path, 'r') as input_file:
            for line in input_file:
                line = line.strip()
                if not line:
                    continue
                data = json.loads(line)
                all_data.append(data)
        return all_data

    @staticmethod
    def safe_print(*args):
        ## default writer: print each message on its own line
        for arg in args:
            print(arg)

    def __call__(self, doc, claim, initialization=True, model_name='gpt-4o-mini',
                 initial_agent_responses=None,
                 writer=None):
        ## the default writer simply prints every message it receives
        if writer is None:
            writer = self.safe_print
        # number of simultaneous debates for evaluation
        num_debates = 1
        eval_repeat_max = 0

        ## initialize a dictionary to save the outputs of each separate debate
        debates_dict = dict.fromkeys(range(num_debates), None)
        overall_ambiguity = False

        ## keep starting debates until we reach the max number of debates
        while eval_repeat_max != num_debates:
            ambiguous = False
            results = {}
            sent = claim

            ## initial stance assignment: we use the following list of utterances as the first response of each agent and then
            ## use this as the chat history to start the debate. The default number of agents is 4; you can change it by adding
            ## more utterances.

            if initialization:
                if initial_agent_responses is None:
                    agents_responses = ["The claim is not refuted by evidence.", "The claim is refuted by evidence.", "The claim is not refuted by evidence.", "The claim is refuted by evidence."]
                else:
                    agents_responses = []
                    for n in range(4):
                        if n < len(initial_agent_responses):
                            agents_responses.append(initial_agent_responses[n])
                        else:
                            if n % 2 == 0:
                                agents_responses.append("The claim is not refuted by evidence.")
                            else:
                                agents_responses.append("The claim is refuted by evidence.")

            else:
                agents_responses = ["","","",""]

            updated_responses = []

            ## to keep track of previous responses of agents and provide them in each round
            message_board = ['','','','']

            ## initialize a label list to keep track of the agents' judgements
            label_list = [[1],[0],[1],[0]]
            all_chats = []

            ## number of rounds of debates
            turns = 3

            mediator_feedback = ""
            ## the first round of assigned stances is not included in the history.
            round_counter = 0
            if initialization:
                print("ROUND %s: (This is the initialization round where agents are assigned initial stance as their beliefs.)\n"%str(round_counter+1))
                for n in range(len(agents_responses)):
                    writer("Agent %s: "%str(n+1) + agents_responses[n] + "\n",
                           "This is my initial belief.")
                print("----------------------------------------------------")
                round_counter += 1
            print("ROUND %s:\n"%str(round_counter+1))
            for n in range(len(agents_responses)):
                chat_history = ""
                chat_history_prompt = ''
                chat_history_prompt +=  message_board[n] + "You (Agent %s): "%str(n+1) + agents_responses[n] + "\n"
                chat_history += "You (Agent %s): "%str(n+1) + agents_responses[n] + "\n"
                other_agents_response = ""
                for nn in range(len(agents_responses)):
                    if nn != n:
                        other_agents_response += "Agent %s: "%str(nn+1) + agents_responses[nn] + "\n"
                        chat_history += "Agent %s: "%str(nn+1) + agents_responses[nn] + "\n"

                message_board[n] += chat_history
                chat_history_prompt += other_agents_response

                ## for experiments without an initial stance, the chat history is cleared
                if not initialization:
                    chat_history_prompt = ""

                ## the prompt builder takes the document, the claim sentence, the previous chat history and the mediator
                ## feedback, which can be used to modify the goals of the agents
                if doc != "":
                    prompt = self.get_prompt_debate_w_doc(doc, sent, chat_history_prompt, mediator_feedback)
                else:
                    prompt = self.get_prompt_debate(sent, chat_history_prompt, mediator_feedback)
                argument = ""
                rep_ctr = 0
                label = -1
                label_val = -1

                ## to make sure we have enough initial diversity in responses, we repeat the following such that if the immediate
                ## response is different from the assigned stance, the agent is asked to repeat its generation. The rep_ctr caps
                ## this at 2 attempts before moving on to the next stage
                while label != "Unknown" and label_val != label_list[n][0] and rep_ctr < 2:
                    llm_response, _ = self.make_openai_api_call(prompt, model_name, 1)
                    argument, label = self.parse_output_w_chat_label(llm_response)
                    print(f">>>>>>>\n\t{label}\n")
                    strlabel = "Support" if label == "1" else "Refute"
                    writer("Agent %s's Assessment:\n"%str(n+1) + '%s. \n'%strlabel, 'Explanation: %s'%argument + "\n")
                    print("***************")
                    rep_ctr += 1

                    ## the generated label might not be in the correct format, so normalize it to a 0/1 value
                    parsed = self.normalize_label(label)
                    if parsed is not None:
                        label_val = parsed

                if label != "Unknown":
                    if len(label.split()) != 0 and ',' not in label.split()[0]:
                        label_val = float(label.split()[0])
                    elif len(label.split()) == 0 or ',' in label.split()[0]:
                        if len(label.split(',')) != 0:
                            label_val = float(label.split(',')[0])
                        else:
                            label_val = float(label)

                    if label_val >= 0.5:
                        label_list[n].append(1)
                    else:
                        label_list[n].append(0)
                else:
                    label_list[n].append(label_list[n][-1])
                argument = argument.strip()

                updated_responses.append(argument)
            agents_responses = copy.deepcopy(updated_responses)

            ## Once the first round is generated, we start the debate among agents
            message_board = ['','','','']
            for ag, ag_resp in enumerate(agents_responses):
                all_chats.append("Agent %s:\n"%str(ag+1) + ag_resp)

            mediator_feedback = ""

            ## the debate continues for at most "turns" rounds.
            for cnt in range(turns):
                if len(set([lbl_list[-1] for lbl_list in label_list])) == 1:
                    break
                print("----------------------------------------------------")
                round_counter += 1
                print("ROUND %s:\n"%str(round_counter+1))
                updated_responses = []
                for n in range(len(agents_responses)):
                    chat_history = ""
                    chat_history_prompt = ''
                    chat_history_prompt +=  message_board[n] + "You (Agent %s): "%str(n+1) + agents_responses[n] + "\n"
                    chat_history += "You (Agent %s): "%str(n+1) + agents_responses[n] + "\n"
                    other_agents_response = ""
                    for nn in range(len(agents_responses)):
                        if nn != n:
                            other_agents_response += "Agent %s: "%str(nn+1) + agents_responses[nn] + "\n"
                            chat_history += "Agent %s: "%str(nn+1) + agents_responses[nn] + "\n"

                    message_board[n] += chat_history
                    chat_history_prompt += other_agents_response

                    ## shuffle the order of the chat history to remove any bias caused by the order of the chats
                    new_chat_history_list = []
                    chat_history_prompt_list = chat_history_prompt.split('\n')
                    chat_history_prompt_list = [chat_hist for chat_hist in chat_history_prompt_list if chat_hist != ""]
                    for pq in range(0,len(chat_history_prompt_list),len(agents_responses)):
                        shuffled_list = chat_history_prompt_list[pq:pq+len(agents_responses)]
                        random.shuffle(shuffled_list)
                        new_chat_history_list += shuffled_list
                    chat_history_prompt = '\n'.join(new_chat_history_list)

                    ## you can add any type of mediator feedback here and append it to the prompt to improve debate consensus;
                    ## for example, after the first round:
                    # if cnt >= 1:
                    #     mediator_feedback = " Look back at the guidelines and how you have used them. Make sure all guidelines (and not only a subset of them) are satisfied in your assessment. Change your stance if you have made an error or if the other agents are more convincing."
                    mediator_feedback = ""

                    if doc != "":
                        prompt = self.get_prompt_debate_w_doc(doc, sent, chat_history_prompt, mediator_feedback)
                    else:
                        prompt = self.get_prompt_debate(sent, chat_history_prompt, mediator_feedback)
                    llm_response, _ = self.make_openai_api_call(prompt, model_name, 1)
                    # print(llm_response)
                    # print("***************")
                    argument, label = self.parse_output_w_chat_label(llm_response)
                    strlabel = "Support" if label == "1" else "Refute"
                    writer("Agent %s's Assessment: \n"%str(n+1) + '%s. \n'%strlabel, 'Explanation: %s'%argument + "\n")
                    print("***************")
                    if label != "Unknown":
                        if len(label.split()) != 0 and ',' not in label.split()[0]:
                            label_val = float(label.split()[0])
                        elif len(label.split()) == 0 or ',' in label.split()[0]:
                            if len(label.split(',')) != 0:
                                label_val = float(label.split(',')[0])
                            else:
                                label_val = float(label)

                        if label_val >= 0.5:
                            label_list[n].append(1)
                        else:
                            label_list[n].append(0)
                    else:
                        label_list[n].append(label_list[n][-1])
                    argument = argument.strip()

                    updated_responses.append(argument)
                    all_chats.append('Agent %s:\n'%str(n+1) + argument)
                agents_responses = copy.deepcopy(updated_responses)
                if len(set([lbl_list[-1] for lbl_list in label_list])) == 1:
                    break

            #print(label_list)
            label_list_text = [["Supported" if item == 1 else "Refuted" for item in lbl] for lbl in label_list]
            print('----------------------------------------------------')
            for lbl in range(len(label_list_text)):
                print("Agent %s trajectory:\n%s\n"%(str(lbl+1), label_list_text[lbl]))


            pn_list = [lbl[-1] for lbl in label_list]
            debate_arguments = copy.deepcopy(all_chats[-len(agents_responses):])

            ## we record the outputs of the debate in a dictionary that was previously initialized.
            ## the "change" key keeps track of the number of agents who changes their stance during debate.
            ## this can be used to identify the ambiguous cases directly.
            if pn_list.count(0) == pn_list.count(1):
                debates_dict[eval_repeat_max] = {'change': 0, 'label': -1,'arguments': debate_arguments,'labels': label_list}

                all_chats_dict = {}
                for n_agents in range(len(debate_arguments)):
                    all_chats_dict['Agent %s:'%str(n_agents+1)] = ""

                for cht_counter, cht in enumerate(debate_arguments):
                    all_chats_dict['Agent %s:'%str(cht_counter+1)] += ' '.join(cht.split('\n')[1:]) + ' '

                ## if there is not a winner label, we use adjudicators to decide on the final label.
                ## you can use multiple adjudicators if you want to do majority voting among them.
                adjudicator_input = [str(item) + ' ' + all_chats_dict[item] for item in all_chats_dict]
                if doc != "":
                    adjudicator_prompt = self.get_adjudicator_prompt_w_doc(doc, sent, '\n'.join(adjudicator_input))
                else:
                    adjudicator_prompt = self.get_adjudicator_prompt(sent, '\n'.join(adjudicator_input))
                rep_counter = 0
                adjudicator_label_list = []
                label = ""
                explanation_list = []
                for i in range(1):
                    while label == "" and rep_counter != 5:
                        adjudicator_response, _ = self.make_openai_api_call(adjudicator_prompt, model_name, 1.0)
                        label, explanation = self.parse_output_response(adjudicator_response)
                        explanation_list.append(explanation)
                        writer(label, explanation)
                        print('********')
                        if label != "Unknown":
                            if len(label.split()) != 0 and ',' not in label.split()[0]:
                                label_val = float(label.split()[0])
                            elif len(label.split()) == 0 or ',' in label.split()[0]:
                                if len(label.split(',')) != 0:
                                    label_val = float(label.split(',')[0])
                                else:
                                    label_val = float(label)
                            if label_val >= 0.5:
                                label = 1
                            else:
                                label = 0
                        else:
                            label = -1
                        rep_counter += 1
                    adjudicator_label_list.append(label)
                    label = ""

                if adjudicator_label_list.count(1) >= adjudicator_label_list.count(0):
                    label = 1
                else:
                    label = 0
                debates_dict[eval_repeat_max]['label'] = label

            ## if there is a winner label, we return the winner as the final label of the claim
            else:
                if pn_list.count(1) >= pn_list.count(0):
                    label = 1
                else:
                    label = 0

                if len(set(pn_list)) == 1:
                    change = len(agents_responses)//2
                else:
                    change = len(agents_responses)//2 - 1
                debates_dict[eval_repeat_max] = {'change': change, 'label': label,'arguments': debate_arguments,'labels': label_list}
                explanation_list = debate_arguments

            eval_repeat_max += 1

        all_label_lists = [debates_dict[item]['labels'] for item in debates_dict]

        ## majority vote out of debate rounds. There is a winner for each debate and then the final winner is the one with the most votes
        debates_majority_vote_list = [debates_dict[item]['label'] for item in debates_dict]
        print(debates_majority_vote_list)
        if debates_majority_vote_list.count(1) == num_debates or debates_majority_vote_list.count(0) == num_debates:
            debate_ambiguity = False
        else:
            debate_ambiguity = True

        if debates_majority_vote_list.count(1) > debates_majority_vote_list.count(0):
            debates_majority_vote = 1
        elif debates_majority_vote_list.count(1) < debates_majority_vote_list.count(0):
            debates_majority_vote = 0
        else:
            ## guard against a tie, which can happen only when a debate fails to produce a label
            debates_majority_vote = -1
        print(debates_majority_vote)

        changes_in_debates_list = [debates_dict[item]['change'] for item in debates_dict]
        if changes_in_debates_list.count(0) == num_debates:
            ambiguous = "Full"
        elif changes_in_debates_list.count(0) == 0:
            ambiguous = "None"
        else:
            ambiguous = "Partial"

        # if changes_in_debates_list.count(0) != num_debates:
        overall_majority_list = []
        for label_list in all_label_lists:
            change = 0
            pn_list = []
            for lbl in label_list:
                if lbl[0] != lbl[-1]:
                    change += 1
                pn_list.append(lbl[-1])
            overall_majority_list += pn_list

        ## majority vote over all individual agents regardless of which debate they belong to
        if overall_majority_list.count(1) > overall_majority_list.count(0):
            overall_majority_vote = 1
        elif overall_majority_list.count(1) < overall_majority_list.count(0):
            overall_majority_vote = 0
        else:
            overall_ambiguity = True

        ## if there is a winner among the agents' responses, we report the majority vote
        if changes_in_debates_list.count(0) != num_debates and overall_ambiguity == False:
            label = overall_majority_vote
            explanation_list = [debates_dict[item]['arguments'] for item in debates_dict]
            adjudicator_list = []
            all_arguments = [debates_dict[item]['arguments'] for item in debates_dict]

        ## if there is NOT a winner among the agents' responses, we use adjudicators to make the final call
        elif changes_in_debates_list.count(0) == num_debates or overall_ambiguity == True:
            all_arguments = [debates_dict[item]['arguments'] for item in debates_dict]
            all_arguments = [x for xs in all_arguments for x in xs]
            all_chats_dict = {}
            for n_agents in range(len(all_arguments)):
                all_chats_dict['Agent %s:'%str(n_agents+1)] = ""

            for cht_counter, cht in enumerate(all_arguments):
                all_chats_dict['Agent %s:'%str(cht_counter+1)] += ' '.join(cht.split('\n')[1:]) + ' '

            adjudicator_input = [str(item) + ' ' + all_chats_dict[item] for item in all_chats_dict]

            label_list = []
            label = ""
            explanation_list = []
            for rep in range(3):
                random.shuffle(adjudicator_input)
                if doc != "":
                    adjudicator_prompt = self.get_adjudicator_prompt_w_doc(doc, sent, '\n'.join(adjudicator_input))
                else:
                    adjudicator_prompt = self.get_adjudicator_prompt(sent, '\n'.join(adjudicator_input))
                rep_counter = 0
                while label == "" and rep_counter != 5:
                    adjudicator_response, _ = self.make_openai_api_call(adjudicator_prompt, model_name, 1.0)
                    label, explanation = self.parse_output_response(adjudicator_response)
                    explanation_list.append(explanation)
                    writer(label, explanation)
                    print('********')
                    if label != "Unknown":
                        if len(label.split()) != 0 and ',' not in label.split()[0]:
                            label_val = float(label.split()[0])
                        elif len(label.split()) == 0 or ',' in label.split()[0]:
                            if len(label.split(',')) != 0:
                                label_val = float(label.split(',')[0])
                            else:
                                label_val = float(label)
                        if label_val >= 0.5:
                            label = 1
                        else:
                            label = 0
                    else:
                        label = -1
                    rep_counter += 1
                label_list.append(label)
                label = ""

            print(label_list)
            results['adjudicators'] = label_list
            results['adjudicators_agree'] = len(set(label_list)) == 1
            if label_list.count(1) >= label_list.count(0):
                label = 1
            else:
                label = 0

            overall_majority_vote = label
            adjudicator_list = label_list

        return "contradict" if debates_majority_vote == 0 else "support"
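
"""# Example usage"""

## A minimal, hypothetical driver (not part of the original experiments)
## showing how the class is meant to be invoked. It assumes an OpenAI API key
## is available to get_client(), e.g. via the OPENAI_API_KEY environment
## variable; the claim is the folic acid example already used in the few-shot
## prompts above.
if __name__ == "__main__":
    debate = MultiAgentDebate()
    claim = ("40mg/day dosage of folic acid and 2mg/day dosage of vitamin B12 "
             "does not affect chronic kidney disease (CKD) progression.")
    ## pass an empty doc to run the evidence-free variant of the debate
    verdict = debate("", claim, initialization=True, model_name='gpt-4o-mini')
    print("Final verdict:", verdict)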