# -*- coding: utf-8 -*-
"""Untitled37.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1FbaYZ7tAm87yWo_lf87t2ynNPMR5olB-

# Prep
"""

## load helper functions
import json
import copy
import random

import openai
from openai import OpenAI

## parsing functions
from bs4 import BeautifulSoup


class MultiAgentDebate:

    def __init__(self, client=None):
        if client is not None:
            self.client = client
        else:
            self.client = self.get_client()

    def get_prompt_direct_eval(self, claim):
        prompt = '''
You are given a claim in the <claim></claim> tags. Your job is to analyze the given claim and decide whether the claim is supported or not. You should also consider the provided guidelines.

1. Evaluate the claim's plausibility based on general medical knowledge.
2. Consider the specificity and credibility of any numbers or percentages.
3. Analyze the context and scope of the claim.
4. Assess any potential biases or limitations.

# <claim> %s </claim>

Determine if the claim is supported or not. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your explanations in <explanation></explanation> XML tags. Skip the preamble.
''' % (claim)
        return prompt

    def get_prompt_direct_eval_w_doc(self, doc, claim):
        prompt = '''
You are given a claim in the <claim></claim> tags and a document as evidence in the <document></document> tags. Your job is to analyze the given claim with respect to the given evidence and decide whether the claim is supported or not. You should also consider the provided guidelines.

1. Evaluate the claim's plausibility based on general medical knowledge.
2. Consider the specificity and credibility of any numbers or percentages.
3. Analyze the context and scope of the claim.
4. Assess any potential biases or limitations.

# <document> %s </document>

# <claim> %s </claim>

Determine if the claim is supported or not given the document as the evidence. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your explanations in <explanation></explanation> XML tags. Skip the preamble.
''' % (doc, claim)
        return prompt

    def get_prompt_debate(self, claim, chat_history, mediator_feedback):
        prompt = '''
You are given a claim in the <claim></claim> tags. Your job is to analyze the given claim and decide whether the claim is supported or not. You should also consider the provided guidelines. There are also other evaluator agents assigned the same task as you, and you can see the discussion history in the <history></history> tags below.

1. Evaluate the claim's plausibility based on general medical knowledge.
2. Consider the specificity and credibility of any numbers or percentages.
3. Analyze the context and scope of the claim.
4. Assess any potential biases or limitations.

# <claim> %s </claim>

# <history> %s </history>

The <history></history> tag might be empty if this is the first round of evaluation. You can see your previous responses as well as the other agents' responses. Continue the discussion with the other evaluator agents; talk to them and state why you agree/disagree with each other, bringing as many arguments as you can. %s

Determine if the claim is supported or not. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your arguments in <argument></argument> XML tags. Skip the preamble.
''' % (claim, chat_history, mediator_feedback)
        return prompt

    def get_adjudicator_prompt(self, claim, chat_history):
        prompt = '''
You are given a claim in the <claim></claim> tags and multiple judgments from evaluator agents. You go over the discussion between the agents and their arguments shown between the <history></history> tags. Your job is to analyze the given claim and decide whether the claim is supported or not. You should also consider the provided guidelines.

1. Evaluate the claim's plausibility based on general medical knowledge.
2. Consider the specificity and credibility of any numbers or percentages.
3. Analyze the context and scope of the claim.
4. Assess any potential biases or limitations.

# <claim> %s </claim>

# <history> %s </history>

Go over the agents' responses and summarize them by saying who agrees/disagrees. Then, looking at the agents' responses, how well they are associated with the guidelines, and finally your own judgement, determine if the claim is supported or not. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your arguments in <explanation></explanation> XML tags. Skip the preamble.
''' % (claim, chat_history)
        # Alternative (unused) instruction: Go over the agents' responses, summarize them by saying who
        # agrees/disagrees and make sure the arguments correctly used the provided guidelines. Then, based on
        # the correctness of the agents' responses and your own judgement of the summary using the provided
        # guidelines, determine if the sentence is factually consistent with the document. A summary is
        # factually inconsistent if there is a correct argument describing an error or discrepancy in the
        # summary. Provide your evaluation using a JSON format with keys "label" (1 = consistent, 0 =
        # inconsistent) and "explanation", and put your response between tags. Skip the preamble.
        return prompt
    def get_prompt_debate_w_doc(self, doc, claim, chat_history, mediator_feedback):
        prompt = '''
You are given a claim in the <claim></claim> tags and a document as evidence in the <document></document> tags. Your job is to analyze the given claim and decide whether the claim is supported or not with respect to the given evidence. You should also consider the provided guidelines. There are also other evaluator agents assigned the same task as you, and you can see the discussion history in the <history></history> tags below.

1. Evaluate the claim's plausibility based on general medical knowledge.
2. Consider the specificity and credibility of any numbers or percentages.
3. Analyze the context and scope of the claim.
4. Assess any potential biases or limitations.

# <document> %s </document>

# <claim> %s </claim>

# <history> %s </history>

The <history></history> tag might be empty if this is the first round of evaluation. You can see your previous responses as well as the other agents' responses. Continue the discussion with the other evaluator agents; talk to them and state why you agree/disagree with each other, bringing as many arguments as you can. %s

Determine if the claim is supported or not. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your arguments in <argument></argument> XML tags. Skip the preamble.
''' % (doc, claim, chat_history, mediator_feedback)
        return prompt

    def get_adjudicator_prompt_w_doc(self, doc, claim, chat_history):
        prompt = '''
You are given a claim in the <claim></claim> tags, a document as evidence in the <document></document> tags, and multiple judgments from evaluator agents. You go over the discussion between the agents and their arguments shown between the <history></history> tags. Your job is to analyze the given claim and decide whether the claim is supported or not with respect to the given evidence. You should also consider the provided guidelines.

1. Evaluate the claim's plausibility based on general medical knowledge.
2. Consider the specificity and credibility of any numbers or percentages.
3. Analyze the context and scope of the claim.
4. Assess any potential biases or limitations.

# <document> %s </document>

# <claim> %s </claim>

# <history> %s </history>

Go over the agents' responses and summarize them by saying who agrees/disagrees. Then, looking at the agents' responses, how well they are associated with the guidelines, and finally your own judgement, determine if the claim is supported or not. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your arguments in <explanation></explanation> XML tags. Skip the preamble.
''' % (doc, claim, chat_history)
        # Alternative (unused) instruction: Go over the agents' responses, summarize them by saying who
        # agrees/disagrees and make sure the arguments correctly used the provided guidelines. Then, based on
        # the correctness of the agents' responses and your own judgement of the summary using the provided
        # guidelines, determine if the sentence is factually consistent with the document. A summary is
        # factually inconsistent if there is a correct argument describing an error or discrepancy in the
        # summary. Provide your evaluation using a JSON format with keys "label" (1 = consistent, 0 =
        # inconsistent) and "explanation", and put your response between tags. Skip the preamble.
        return prompt
    def get_prompt_direct_w_causal_sub_claims(self, claim):
        prompt = '''
You are given a claim in the <claim></claim> tags. Your job is to first break the claim into causal sub-claims and then analyze each sub-claim and decide whether the whole claim is supported or not. A causal sub-claim represents a causal relation between two entities from the claim. This is not simply a statement but rather an explicit causal relation in which one entity (or its change) can have an effect on the other entity. If all sub-claims are supported, then the claim is also supported. You should also consider the provided guidelines.

1. Evaluate the claim's plausibility based on general medical knowledge.
2. Consider the specificity and credibility of any numbers or percentages.
3. Analyze the context and scope of the claim.
4. Assess any potential biases or limitations.

<claim> 40mg/day dosage of folic acid and 2mg/day dosage of vitamin B12 does not affect chronic kidney disease (CKD) progression. </claim>

Break the claim into any possible number of causal sub-claims with explicit causal relations and place them in <sub-claims></sub-claims> tags. Determine if the claim is supported or not by analyzing its causal sub-claims. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your explanations in <explanation></explanation> XML tags. Skip the preamble.

<sub-claims>
40mg/day dosage of folic acid causes chronic kidney disease (CKD) progression changes.
2mg/day dosage of vitamin B12 causes chronic kidney disease (CKD) progression changes.
</sub-claims>
<explanation>
Yes. There is a study that indicates that treatment with high doses of folic acid (40 mg/day) did not improve survival or reduce the incidence of vascular disease in patients with advanced chronic kidney disease or end-stage renal disease. The study found no significant effect on mortality or secondary outcomes related to CKD progression, such as myocardial infarction, stroke, amputation, time to dialysis, or thrombosis in hemodialysis patients. Therefore, the claim that these dosages do not affect CKD progression is supported by the study's findings. It is also mentioned that patients who received vitamin B12 (2 mg/day) also did not show any significant effects on outcomes.
</explanation>

You are given a claim in the <claim></claim> tags. Your job is to first break the claim into causal sub-claims and then analyze each sub-claim and decide whether the whole claim is supported or not. A causal sub-claim represents a causal relation between two entities from the claim. This is not simply a statement but rather an explicit causal relation in which one entity (or its change) can have an effect on the other entity. If all sub-claims are supported, then the claim is also supported. You should also consider the provided guidelines.

1. Evaluate the claim's plausibility based on general medical knowledge.
2. Consider the specificity and credibility of any numbers or percentages.
3. Analyze the context and scope of the claim.
4. Assess any potential biases or limitations.

# <claim> %s </claim>

Break the claim into causal sub-claims and place them in <sub-claims></sub-claims> tags. Determine if the claim is supported or not by analyzing its causal sub-claims. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your explanations in <explanation></explanation> XML tags. Skip the preamble.
''' % (claim)
        return prompt
    def get_prompt_direct_w_doc_w_causal_sub_claims(self, doc, claim):
        prompt = '''
You are given a claim in the <claim></claim> tags and a document as evidence in the <document></document> tags. Your job is to first break the claim into causal sub-claims and then analyze each sub-claim with respect to the given evidence and decide whether the whole claim is supported or not. A causal sub-claim represents a causal relation between two entities from the claim. This is not simply a statement but rather an explicit causal relation in which one entity (or its change) can have an effect on the other entity. If all sub-claims are supported, then the claim is also supported. You should also consider the provided guidelines.

1. Evaluate the claim's plausibility based on general medical knowledge.
2. Consider the specificity and credibility of any numbers or percentages.
3. Analyze the context and scope of the claim.
4. Assess any potential biases or limitations.

<document>
High plasma homocysteine levels are a risk factor for mortality and vascular disease in observational studies of patients with chronic kidney disease. Folic acid and B vitamins decrease homocysteine levels in this population but whether they lower mortality is unknown.
OBJECTIVE To determine whether high doses of folic acid and B vitamins administered daily reduce mortality in patients with chronic kidney disease.
DESIGN, SETTING, AND PARTICIPANTS Double-blind randomized controlled trial (2001-2006) in 36 US Department of Veterans Affairs medical centers. Median follow-up was 3.2 years for 2056 participants aged 21 years or older with advanced chronic kidney disease (estimated creatinine clearance < or = 30 mL/min) (n = 1305) or end-stage renal disease (n = 751) and high homocysteine levels (> or = 15 micromol/L).
INTERVENTION Participants received a daily capsule containing 40 mg of folic acid, 100 mg of pyridoxine hydrochloride (vitamin B6), and 2 mg of cyanocobalamin (vitamin B12) or a placebo.
MAIN OUTCOME MEASURES The primary outcome was all-cause mortality. Secondary outcomes included myocardial infarction (MI), stroke, amputation of all or part of a lower extremity, a composite of these 3 plus all-cause mortality, time to initiation of dialysis, and time to thrombosis of arteriovenous access in hemodialysis patients.
RESULTS Mean baseline homocysteine level was 24.0 micromol/L in the vitamin group and 24.2 micromol/L in the placebo group. It was lowered 6.3 micromol/L (25.8%%, P < .001) in the vitamin group and 0.4 micromol/L (1.7%%, P = .14) in the placebo group at 3 months, but there was no significant effect on mortality (448 vitamin group deaths vs 436 placebo group deaths) (hazard ratio [HR], 1.04, 95%% CI, 0.91-1.18). No significant effects were demonstrated for secondary outcomes or adverse events: there were 129 MIs in the vitamin group vs 150 for placebo (HR, 0.86, 95%% CI, 0.67-1.08), 37 strokes in the vitamin group vs 41 for placebo (HR, 0.90, 95%% CI, 0.58-1.40), and 60 amputations in the vitamin group vs 53 for placebo (HR, 1.14, 95%% CI, 0.79-1.64). In addition, the composite of MI, stroke, and amputations plus mortality (P = .85), time to dialysis (P = .38), and time to thrombosis in hemodialysis patients (P = .97) did not differ between the vitamin and placebo groups.
CONCLUSION Treatment with high doses of folic acid and B vitamins did not improve survival or reduce the incidence of vascular disease in patients with advanced chronic kidney disease or end-stage renal disease.
TRIAL REGISTRATION clinicaltrials.gov Identifier: NCT00032435.
</document>

<claim> 40mg/day dosage of folic acid and 2mg/day dosage of vitamin B12 does not affect chronic kidney disease (CKD) progression. </claim>

Break the claim into any possible number of causal sub-claims with explicit causal relations and place them in <sub-claims></sub-claims> tags. Determine if the claim is supported or not given the document as the evidence by analyzing its causal sub-claims. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your explanations in <explanation></explanation> XML tags. Skip the preamble.

<sub-claims>
40mg/day dosage of folic acid causes chronic kidney disease (CKD) progression changes.
2mg/day dosage of vitamin B12 causes chronic kidney disease (CKD) progression changes.
</sub-claims>
<explanation>
Yes. The information provided indicates that treatment with high doses of folic acid (40 mg/day) did not improve survival or reduce the incidence of vascular disease in patients with advanced chronic kidney disease or end-stage renal disease. The study found no significant effect on mortality or secondary outcomes related to CKD progression, such as myocardial infarction, stroke, amputation, time to dialysis, or thrombosis in hemodialysis patients. Therefore, the claim that these dosages do not affect CKD progression is supported by the study's findings. It is also mentioned that patients who received vitamin B12 (2 mg/day) also did not show any significant effects on outcomes.
</explanation>

You are given a claim in the <claim></claim> tags and a document as evidence in the <document></document> tags. Your job is to first break the claim into causal sub-claims and then analyze each sub-claim with respect to the given evidence and decide whether the whole claim is supported or not. A causal sub-claim represents a causal relation between two entities from the claim. This is not simply a statement but rather an explicit causal relation in which one entity (or its change) can have an effect on the other entity. If all sub-claims are supported, then the claim is also supported. You should also consider the provided guidelines.

1. Evaluate the claim's plausibility based on general medical knowledge.
2. Consider the specificity and credibility of any numbers or percentages.
3. Analyze the context and scope of the claim.
4. Assess any potential biases or limitations.

# <document> %s </document>

# <claim> %s </claim>

Break the claim into causal sub-claims and place them in <sub-claims></sub-claims> tags. Determine if the claim is supported or not given the document as the evidence by analyzing its causal sub-claims. Provide your evaluation between <label></label> tags with values 1 (supported) or 0 (refuted) and add your explanations in <explanation></explanation> XML tags. Skip the preamble.
''' % (doc, claim)
        return prompt
    def parse_output_response(self, response):
        soup = BeautifulSoup(response, 'html.parser')
        explanation_list = soup.find_all("explanation")
        explanation_text = ""
        for exp in explanation_list:
            if exp.string is not None:
                explanation_text += exp.string + ' '
            else:
                explanation_text = response
        explanation_text = ' '.join(explanation_text.split())
        if len(soup.find_all("label")) > 0:
            labels = soup.find_all("label")[-1].string.strip()
        else:
            labels = "Unknown"
        return labels, explanation_text

    def parse_output_response_w_category(self, response):
        soup = BeautifulSoup(response, 'html.parser')
        explanation_list = soup.find_all("explanation")
        explanation_text = ""
        for exp in explanation_list:
            if exp.string is not None:
                explanation_text += exp.string + ' '
            else:
                explanation_text = response
        explanation_text = ' '.join(explanation_text.split())
        category_list = soup.find_all("category")
        category_text = ""
        for exp in category_list:
            if exp.string is not None:
                category_text += exp.string + ' '
            else:
                category_text = ""
        category_text = ' '.join(category_text.split())
        if len(soup.find_all("label")) > 0:
            labels = soup.find_all("label")[-1].string.strip()
        else:
            labels = "Unknown"
        return labels, category_text, explanation_text

    def parse_output_w_chat_label(self, response):
        soup = BeautifulSoup(response, 'html.parser')
        argument_list = soup.find_all("argument")
        argument_text = ""
        for argument in argument_list:
            if argument.string is not None:
                argument_text += argument.string + ' '
            else:
                argument_text = response
        argument_text = ' '.join(argument_text.split())
        if len(soup.find_all("label")) > 0:
            guidelines = soup.find_all("label")[0].string.strip()
        else:
            guidelines = "Unknown"
        return argument_text, guidelines

    def parse_output_response_w_causal_subclaims(self, response):
        soup = BeautifulSoup(response, 'html.parser')
        argument_list = soup.find_all("argument")
        argument_text = ""
        for argument in argument_list:
            if argument.string is not None:
                argument_text += argument.string + ' '
            else:
                argument_text = response
        argument_text = ' '.join(argument_text.split())
        if len(soup.find_all("label")) > 0:
            label = soup.find_all("label")[0].string.strip()
        else:
            label = "Unknown"
        sub_claims_text = ""
        if len(soup.find_all("sub-claims")) > 0:
            sub_claims_list = soup.find_all("sub-claims")
            for claim in sub_claims_list:
                if claim.string is not None:
                    sub_claims_text += claim.string + '\n'
        return label, argument_text, sub_claims_text

    """# OpenAI Prep"""

    def get_client(self):
        self.client = OpenAI(api_key="", organization="")
        return self.client

    # client = get_client()

    def parse_chatgpt_api_response(self, response):
        choices = response.choices
        # choices = response["choices"]
        main_response_message_list = []
        if len(choices) > 1:
            for choice in choices:
                main_response = choice.message
                # main_response_message, main_response_role = main_response["content"], main_response["role"]
                main_response_message, main_response_role = main_response.content, main_response.role
                main_response_message_list.append(main_response_message)
            return main_response_message_list, response
        else:
            main_response = choices[0].message
            # main_response_message, main_response_role = main_response["content"], main_response["role"]
            main_response_message, main_response_role = main_response.content, main_response.role
            return main_response_message, response
    def make_openai_api_call(self, prompt, model_name, temperature):
        if 'gpt-3' in model_name or 'gpt-4' in model_name:
            # openai.ChatCompletion.create in legacy versions of the openai package
            response = self.client.chat.completions.create(
                model=model_name,
                messages=[{'role': 'user', 'content': prompt}],
                temperature=temperature,
                max_tokens=4096,
                top_p=1.0,
                frequency_penalty=0.0,
                presence_penalty=0.0,
                n=1,
            )
            return self.parse_chatgpt_api_response(response)

    def make_openai_api_call_o3_mini(self, prompt, model_name, temperature):
        response = self.client.chat.completions.create(
            model=model_name,
            messages=[{'role': 'user', 'content': prompt}],
            response_format={"type": "text"},
            reasoning_effort="medium",
        )
        return self.parse_chatgpt_api_response(response)

    def read_file(self, file_path):
        all_data = []
        with open(file_path, 'r') as input_file:
            for line in input_file:
                line = line.strip()
                data = json.loads(line)
                all_data.append(data)
        return all_data

    def safe_print(self, x, *args):
        print(x, *args)

    def __call__(self, doc, claim, initialization=True, model_name='gpt-4o-mini', initial_agent_responses=None, writer=None):
        if writer is None:
            writer = self.safe_print
        # number of simultaneous debates for evaluation
        num_debates = 1
        eval_repeat_max = 0
        ## initialize a dictionary to save the outputs of each separate debate
        debates_dict = dict.fromkeys([0], None)
        overall_ambiguity = False
        ## keep starting debates until you reach the max number of debates
        while eval_repeat_max != num_debates:
            ambiguous = False
            results = {}
            sent = claim
            ## initial stance assignment. We use the following list of utterances as the first response of each
            ## agent and then use this as the chat history to start the debate. The default number of agents is 4.
            ## You can change the number of agents by adding more utterances.
            if initialization:
                if initial_agent_responses is None:
                    agents_responses = ["The claim is not refuted by evidence.",
                                        "The claim is refuted by evidence.",
                                        "The claim is not refuted by evidence.",
                                        "The claim is refuted by evidence."]
                else:
                    agents_responses = []
                    for n in range(4):
                        if n < len(initial_agent_responses):
                            agents_responses.append(initial_agent_responses[n])
                        else:
                            if n % 2 == 0:
                                agents_responses.append("The claim is not refuted by evidence.")
                            else:
                                agents_responses.append("The claim is refuted by evidence.")
            else:
                agents_responses = ["", "", "", ""]
            updated_responses = []
            ## to keep track of previous responses of agents and provide them in each round
            message_board = ['', '', '', '']
            ## initialize a label list to keep track of agents' judgements
            label_list = [[1], [0], [1], [0]]
            all_chats = []
            ## number of rounds of debates
            turns = 3
            mediator_feedback = ""
            ## first round of random assessment is not included in the history.
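            ## Example of the chat history each agent sees in a round (built below from message_board plus the
            ## latest responses of the other agents); with the default initial stances it looks like:
            ##   You (Agent 1): The claim is not refuted by evidence.
            ##   Agent 2: The claim is refuted by evidence.
            ##   Agent 3: The claim is not refuted by evidence.
            ##   Agent 4: The claim is refuted by evidence.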
            round_counter = 0
            if initialization:
                print("ROUND %s: (This is the initialization round where agents are assigned an initial stance as their beliefs.)\n" % str(round_counter + 1))
                for n in range(len(agents_responses)):
                    writer("Agent %s: " % str(n + 1) + agents_responses[n] + "\n", "This is my initial belief.")
                print("----------------------------------------------------")
                round_counter += 1
            print("ROUND %s:\n" % str(round_counter + 1))
            for n in range(len(agents_responses)):
                chat_history = ""
                chat_history_prompt = ''
                chat_history_prompt += message_board[n] + "You (Agent %s): " % str(n + 1) + agents_responses[n] + "\n"
                chat_history += "You (Agent %s): " % str(n + 1) + agents_responses[n] + "\n"
                other_agents_response = ""
                for nn in range(len(agents_responses)):
                    if nn != n:
                        other_agents_response += "Agent %s: " % str(nn + 1) + agents_responses[nn] + "\n"
                        chat_history += "Agent %s: " % str(nn + 1) + agents_responses[nn] + "\n"
                message_board[n] += chat_history
                chat_history_prompt += other_agents_response
                ## For experiments w/o initial stance, uncomment the following line to clear the chat history
                if not initialization:
                    chat_history_prompt = ""
                ## the parameters to the prompt module include the document, the claim sentence, the previous chat
                ## history and the mediator feedback that you can use to modify the goals of agents
                if doc != "":
                    prompt = self.get_prompt_debate_w_doc(doc, sent, chat_history_prompt, mediator_feedback)
                else:
                    prompt = self.get_prompt_debate(sent, chat_history_prompt, mediator_feedback)
                argument = ""
                rep_ctr = 0
                label = -1
                label_val = -1
                ## to make sure we have enough initial diversity in responses, we repeat the following such that if
                ## the immediate response is different from the assigned stance, the agent is asked to repeat its
                ## generation. The rep_ctr bounds how many times the generation is repeated before moving on to the
                ## next stage.
                while label != "Unknown" and label_val != label_list[n][0] and rep_ctr != 1:
                    llm_response, _ = self.make_openai_api_call(prompt, model_name, 1)
                    argument, label = self.parse_output_w_chat_label(llm_response)
                    print(f">>>>>>>\n\t{label}\n")
                    strlabel = "Support" if label == "1" else "Refute"
                    writer("Agent %s's Assessment:\n" % str(n + 1) + '%s. \n' % strlabel, 'Explanation: %s' % argument + "\n")
                    print("***************")
                    rep_ctr += 1
                    ## the generated label might not be in the correct format, so we use the following to make sure
                    ## the label format is correct
                    if label != "Unknown":
                        if len(label.split()) != 0 and ',' not in label.split()[0]:
                            label_val = float(label.split()[0])
                        elif len(label.split()) == 0 or ',' in label.split()[0]:
                            if len(label.split(',')) != 0:
                                label_val = float(label.split(',')[0])
                            else:
                                label_val = float(label)
                        if label_val >= 0.5:
                            label_val = 1
                        else:
                            label_val = 0
                if label != "Unknown":
                    if len(label.split()) != 0 and ',' not in label.split()[0]:
                        label_val = float(label.split()[0])
                    elif len(label.split()) == 0 or ',' in label.split()[0]:
                        if len(label.split(',')) != 0:
                            label_val = float(label.split(',')[0])
                        else:
                            label_val = float(label)
                    if label_val >= 0.5:
                        label_list[n].append(1)
                    else:
                        label_list[n].append(0)
                else:
                    label_list[n].append(label_list[n][-1])
                argument = argument.strip()
                updated_responses.append(argument)
            agents_responses = copy.deepcopy(updated_responses)
            ## Once the first round is generated, we start the debate among agents
            message_board = ['', '', '', '']
            for ag, ag_resp in enumerate(agents_responses):
                all_chats.append("Agent %s:\n" % str(ag + 1) + ag_resp)
            mediator_feedback = ""
            ## The debate is continued for "turns" rounds.
            for cnt in range(turns):
                if len(set([lbl_list[-1] for lbl_list in label_list])) == 1:
                    break
                print("----------------------------------------------------")
                round_counter += 1
                print("ROUND %s:\n" % str(round_counter + 1))
                updated_responses = []
                for n in range(len(agents_responses)):
                    chat_history = ""
                    chat_history_prompt = ''
                    chat_history_prompt += message_board[n] + "You (Agent %s): " % str(n + 1) + agents_responses[n] + "\n"
                    chat_history += "You (Agent %s): " % str(n + 1) + agents_responses[n] + "\n"
                    other_agents_response = ""
                    for nn in range(len(agents_responses)):
                        if nn != n:
                            other_agents_response += "Agent %s: " % str(nn + 1) + agents_responses[nn] + "\n"
                            chat_history += "Agent %s: " % str(nn + 1) + agents_responses[nn] + "\n"
                    message_board[n] += chat_history
                    chat_history_prompt += other_agents_response
                    ## shuffle the order of the chat history to remove any biases caused by the order of the chats
                    new_chat_history_list = []
                    chat_history_prompt_list = chat_history_prompt.split('\n')
                    chat_history_prompt_list = [chat_hist for chat_hist in chat_history_prompt_list if chat_hist != ""]
                    for pq in range(0, len(chat_history_prompt_list), len(agents_responses)):
                        shuffled_list = chat_history_prompt_list[pq:pq + len(agents_responses)]
                        random.shuffle(shuffled_list)
                        new_chat_history_list += shuffled_list
                    chat_history_prompt = '\n'.join(new_chat_history_list)
                    ## you can add any type of feedback here and add it to the prompt to improve the debate
                    ## consensus; we do it after the first round
                    # if cnt >= 1:
                    #     mediator_feedback = " Look back at the guidelines and how you have used them. Make sure all guidelines (and not only a subset of them) are satisfied in your assessment. Change your stance if you have made an error or if the other agents are more convincing."
                    mediator_feedback = ""
                    if doc != "":
                        prompt = self.get_prompt_debate_w_doc(doc, sent, chat_history_prompt, mediator_feedback)
                    else:
                        prompt = self.get_prompt_debate(sent, chat_history_prompt, mediator_feedback)
                    llm_response, _ = self.make_openai_api_call(prompt, model_name, 1)
                    # print(llm_response)
                    # print("***************")
                    argument, label = self.parse_output_w_chat_label(llm_response)
                    strlabel = "Support" if label == "1" else "Refute"
                    writer("Agent %s's Assessment: \n" % str(n + 1) + '%s. \n' % strlabel, 'Explanation: %s' % argument + "\n")
                    print("***************")
                    if label != "Unknown":
                        if len(label.split()) != 0 and ',' not in label.split()[0]:
                            label_val = float(label.split()[0])
                        elif len(label.split()) == 0 or ',' in label.split()[0]:
                            if len(label.split(',')) != 0:
                                label_val = float(label.split(',')[0])
                            else:
                                label_val = float(label)
                        if label_val >= 0.5:
                            label_list[n].append(1)
                        else:
                            label_list[n].append(0)
                    else:
                        label_list[n].append(label_list[n][-1])
                    argument = argument.strip()
                    updated_responses.append(argument)
                    all_chats.append('Agent %s:\n' % str(n + 1) + argument)
                agents_responses = copy.deepcopy(updated_responses)
                if len(set([lbl_list[-1] for lbl_list in label_list])) == 1:
                    break
            # print(label_list)
            label_list_text = [["Supported" if item == 1 else "Refuted" for item in lbl] for lbl in label_list]
            print('----------------------------------------------------')
            for lbl in range(len(label_list_text)):
                print("Agent %s trajectory:\n%s\n" % (str(lbl + 1), label_list_text[lbl]))
            pn_list = [lbl[-1] for lbl in label_list]
            debate_arguments = copy.deepcopy(all_chats[-len(agents_responses):])
            ## we record the outputs of the debate in a dictionary that was previously initialized.
            ## the "change" key keeps track of the number of agents who changed their stance during the debate.
            ## this can be used to identify the ambiguous cases directly.
            if pn_list.count(0) == pn_list.count(1):
                debates_dict[eval_repeat_max] = {'change': 0, 'label': -1, 'arguments': debate_arguments, 'labels': label_list}
                all_chats_dict = {}
                for n_agents in range(len(debate_arguments)):
                    all_chats_dict['Agent %s:' % str(n_agents + 1)] = ""
                for cht_counter, cht in enumerate(debate_arguments):
                    all_chats_dict['Agent %s:' % str(cht_counter + 1)] += ' '.join(cht.split('\n')[1:]) + ' '
                ## if there is not a winner label, we use adjudicators to decide on the final label.
                ## you can use multiple adjudicators if you want to do majority voting among them.
                adjudicator_input = [str(item) + ' ' + all_chats_dict[item] for item in all_chats_dict]
                if doc != "":
                    adjudicator_prompt = self.get_adjudicator_prompt_w_doc(doc, sent, '\n'.join(adjudicator_input))
                else:
                    adjudicator_prompt = self.get_adjudicator_prompt(sent, '\n'.join(adjudicator_input))
                rep_counter = 0
                adjudicator_label_list = []
                label = ""
                explanation_list = []
                for i in range(1):
                    while label == "" and rep_counter != 5:
                        adjudicator_response, _ = self.make_openai_api_call(adjudicator_prompt, model_name, 1.0)
                        label, explanation = self.parse_output_response(adjudicator_response)
                        explanation_list.append(explanation)
                        writer(label, explanation)
                        print('********')
                        if label != "Unknown":
                            if len(label.split()) != 0 and ',' not in label.split()[0]:
                                label_val = float(label.split()[0])
                            elif len(label.split()) == 0 or ',' in label.split()[0]:
                                if len(label.split(',')) != 0:
                                    label_val = float(label.split(',')[0])
                                else:
                                    label_val = float(label)
                            if label_val >= 0.5:
                                label = 1
                            else:
                                label = 0
                        else:
                            label = -1
                        rep_counter += 1
                    adjudicator_label_list.append(label)
                    label = ""
                if adjudicator_label_list.count(1) >= adjudicator_label_list.count(0):
                    label = 1
                else:
                    label = 0
                debates_dict[eval_repeat_max]['label'] = label
            ## if there is a winner label, we return the winner as the final label of the claim
            elif pn_list.count(0) != pn_list.count(1):
                if pn_list.count(1) >= pn_list.count(0):
                    label = 1
                else:
                    label = 0
                if len(set(pn_list)) == 1:
                    change = len(agents_responses) // 2
                else:
                    change = len(agents_responses) // 2 - 1
                debates_dict[eval_repeat_max] = {'change': change, 'label': label, 'arguments': debate_arguments, 'labels': label_list}
                explanation_list = debate_arguments
            eval_repeat_max += 1
        all_label_lists = [debates_dict[item]['labels'] for item in debates_dict]
        ## majority vote out of the debate rounds: there is a winner for each debate and the final winner
        ## is the one with the most votes
        debates_majority_vote_list = [debates_dict[item]['label'] for item in debates_dict]
        print(debates_majority_vote_list)
        if debates_majority_vote_list.count(1) == num_debates or debates_majority_vote_list.count(0) == num_debates:
            debate_ambiguity = False
        else:
            debate_ambiguity = True
        if debates_majority_vote_list.count(1) > debates_majority_vote_list.count(0):
            debates_majority_vote = 1
        elif debates_majority_vote_list.count(1) < debates_majority_vote_list.count(0):
            debates_majority_vote = 0
        print(debates_majority_vote)
        changes_in_debates_list = [debates_dict[item]['change'] for item in debates_dict]
        if changes_in_debates_list.count(0) == num_debates:
            ambiguous = "Full"
        elif changes_in_debates_list.count(0) == 0:
            ambiguous = "None"
        else:
            ambiguous = "Partial"
        # if changes_in_debates_list.count(0) != num_debates:
        overall_majority_list = []
        for label_list in all_label_lists:
            change = 0
            pn_list = []
            for lbl in label_list:
                if lbl[0] != lbl[-1]:
                    change += 1
                pn_list.append(lbl[-1])
            overall_majority_list += pn_list
        ## majority vote over all individual agents regardless of which debate they belong to
        if overall_majority_list.count(1) > overall_majority_list.count(0):
            overall_majority_vote = 1
        elif overall_majority_list.count(1) < overall_majority_list.count(0):
            overall_majority_vote = 0
        else:
            overall_ambiguity = True
        ## if there is a winner among the agents' responses, we report the majority vote
        if changes_in_debates_list.count(0) != num_debates and overall_ambiguity == False:
            label = overall_majority_vote
            explanation_list = [debates_dict[item]['arguments'] for item in debates_dict]
            adjudicator_list = []
            all_arguments = [debates_dict[item]['arguments'] for item in debates_dict]
        ## if there is NOT a winner among the agents' responses, we use adjudicators to make the final call
        elif changes_in_debates_list.count(0) == num_debates or overall_ambiguity == True:
            all_arguments = [debates_dict[item]['arguments'] for item in debates_dict]
            all_arguments = [x for xs in all_arguments for x in xs]
            all_chats_dict = {}
            for n_agents in range(len(all_arguments)):
                all_chats_dict['Agent %s:' % str(n_agents + 1)] = ""
            for cht_counter, cht in enumerate(all_arguments):
                all_chats_dict['Agent %s:' % str(cht_counter + 1)] += ' '.join(cht.split('\n')[1:]) + ' '
            adjudicator_input = [str(item) + ' ' + all_chats_dict[item] for item in all_chats_dict]
            label_list = []
            label = ""
            explanation_list = []
            for rep in range(3):
                random.shuffle(adjudicator_input)
                if doc != "":
                    adjudicator_prompt = self.get_adjudicator_prompt_w_doc(doc, sent, '\n'.join(adjudicator_input))
                else:
                    adjudicator_prompt = self.get_adjudicator_prompt(sent, '\n'.join(adjudicator_input))
                rep_counter = 0
                while label == "" and rep_counter != 5:
                    adjudicator_response, _ = self.make_openai_api_call(adjudicator_prompt, model_name, 1.0)
                    label, explanation = self.parse_output_response(adjudicator_response)
                    explanation_list.append(explanation)
                    writer(label, explanation)
                    print('********')
                    if label != "Unknown":
                        if len(label.split()) != 0 and ',' not in label.split()[0]:
                            label_val = float(label.split()[0])
                        elif len(label.split()) == 0 or ',' in label.split()[0]:
                            if len(label.split(',')) != 0:
                                label_val = float(label.split(',')[0])
                            else:
                                label_val = float(label)
                        if label_val >= 0.5:
                            label = 1
                        else:
                            label = 0
                    else:
                        label = -1
                    rep_counter += 1
                label_list.append(label)
                label = ""
            print(label_list)
            results['adjudicators'] = label_list
            results['adjudicators_agree'] = len(set(label_list)) == 1
            if label_list.count(1) >= label_list.count(0):
                label = 1
            else:
                label = 0
            overall_majority_vote = label
            adjudicator_list = label_list
        label_text = ["contradict" if debates_majority_vote == 0 else "support"]
        return label_text[0]
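"""# Example usage"""

## Minimal usage sketch. It assumes an OpenAI API key has been filled in (either in get_client above or via a
## client passed to the constructor); the claim below is the placeholder from the few-shot example in the prompts,
## and the document is left empty to debate the claim without evidence.
if __name__ == "__main__":
    debate = MultiAgentDebate()
    example_doc = ""  # optional evidence document; pass the abstract text here to debate against evidence
    example_claim = ("40mg/day dosage of folic acid and 2mg/day dosage of vitamin B12 "
                     "does not affect chronic kidney disease (CKD) progression.")
    # Runs the multi-agent debate (and adjudication if needed) and returns "support" or "contradict".
    verdict = debate(example_doc, example_claim, initialization=True, model_name='gpt-4o-mini')
    print("Final verdict:", verdict)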