import requests

from hamilton.function_modifiers import config


def enforcement_prompt(user_input: str, violation_context: dict) -> str:
    """
    Generates the prompt sent to the LLM to determine the appropriate enforcement action.
    """
    print(f"Received enforcement user_input: {user_input}")
    print(f"Received enforcement violation_context: {violation_context}")

    # If the context arrives wrapped under a single "detect_glorification" key
    # (i.e. keyed by the upstream node name), unwrap it first.
    if len(violation_context) == 1 and "detect_glorification" in violation_context:
        violation_context = violation_context["detect_glorification"]

    dio_name = violation_context.get("entity_name", "None")
    dio_details = violation_context.get("entity_info", {}).get("Summary", "Unknown")  # extracted but not yet used in the prompt
    dio_category = violation_context.get("entity_info", {}).get("Policy Category", "Unknown")
    radicalization_flag = violation_context.get("extremism_detected", False)
    dio_sentiment = violation_context.get("aspect_sentiment", "None")

    prompt = (
        f"Based on the following content and context, determine the appropriate enforcement action:\n\n"
        f"Content: '{user_input}'\n"
        f"Radicalization detected: {radicalization_flag}\n"
        f"DIO Mentioned: {dio_name} (Category: {dio_category})\n"
        f"System's Classification of Content Sentiment Towards DIO: {dio_sentiment}\n\n"
        "Policy Summary: Any entity mentioned in the 'DIO Mentioned' field is from the Dangerous Individuals & Organizations (DIO) blacklist. "
        "If this entity is mentioned in the user content in a supportive, glorifying, or representative manner, it constitutes a violation of our platform's community standards. "
        "Our platform does not allow content that glorifies, supports, or represents individuals or organizations involved in violence, hate, or terrorism. "
        "These entities include terrorist groups, hate groups, violent non-state actors, and others who advocate or engage in violent activities. "
        "Any content that endorses or supports these entities or their actions is a policy violation. "
        "Users may discuss these topics neutrally or in a condemnatory manner, but any positive portrayal is prohibited. "
        "Based on this policy, choose one of the following enforcement actions: Warning, Limited Visibility, Temporary Suspension, Permanent Ban. "
        "Provide a brief explanation for your choice."
    )
    return prompt
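
# Illustrative example (not part of the dataflow) of the `violation_context` shape the
# function above expects from an upstream `detect_glorification` node; the field values
# below are hypothetical placeholders:
#
#     {
#         "detect_glorification": {
#             "entity_name": "Example Militant Group",
#             "entity_info": {"Summary": "...", "Policy Category": "Terrorism"},
#             "extremism_detected": True,
#             "aspect_sentiment": "Supportive",
#         }
#     }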


def get_enforcement_decision(enforcement_prompt: str, mistral_public_url: str) -> dict:
    """
    Sends the enforcement prompt to the Mistral model server and retrieves the enforcement decision.
    """
    input_text = {
        "context": enforcement_prompt,
        "question": "What is the appropriate enforcement action?"
    }

    # Call the model server; the raw response body is treated as the decision text.
    response = requests.post(f"{mistral_public_url}/mistral-inference", json=input_text, stream=False)
    response.raise_for_status()  # surface HTTP errors instead of returning an error page as the decision

    return {
        "enforcement_action": response.text.strip(),
        "prompt": enforcement_prompt,
    }
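
# For reference, the HTTP contract assumed by the call above (the endpoint path and JSON
# fields come from this module, not from any official Mistral serving API) is roughly:
#
#     POST {mistral_public_url}/mistral-inference
#     {"context": "<enforcement prompt>", "question": "What is the appropriate enforcement action?"}
#
# The raw response body is used verbatim as the enforcement decision text.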
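

# Illustrative usage sketch, assuming this file is importable as a module: it wires the two
# nodes above into a Hamilton driver and executes the enforcement decision node. The inputs
# and the Mistral URL below are placeholders, not values defined by this dataflow.
if __name__ == "__main__":
    import sys

    from hamilton import driver

    this_module = sys.modules[__name__]
    dr = driver.Builder().with_modules(this_module).build()

    result = dr.execute(
        ["get_enforcement_decision"],
        inputs={
            "user_input": "Example post text under review",
            "violation_context": {
                "detect_glorification": {
                    "entity_name": "Example Militant Group",  # hypothetical value
                    "entity_info": {"Summary": "...", "Policy Category": "Terrorism"},
                    "extremism_detected": True,
                    "aspect_sentiment": "Supportive",
                }
            },
            "mistral_public_url": "http://localhost:8000",  # placeholder endpoint
        },
    )
    print(result)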