import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("BenjaminOcampo/peace_hatebert")
model = AutoModelForSequenceClassification.from_pretrained("BenjaminOcampo/peace_hatebert")

# Define more nuanced labels for the model output
nuanced_labels = {
    0: "Non-Hate Speech",
    1: "Explicit Hate",
    2: "Implicit Hate",
    3: "White Grievance"
}

# Microaggression detection rules: known phrase -> explanation of its impact
microaggressions = {
    "You're so articulate": "This phrase can imply surprise that the individual can speak well, often used in a way that suggests it is unexpected for someone of their background.",
    "Where are you really from": "This question implies that the individual does not belong or is not truly part of the community.",
    "I don't see color": "This statement can negate the experiences and identities of people of different races.",
    "You're a credit to your race": "This phrase implies that most people of the individual's race are not successful or commendable."
}

# A sample set of explanations and suggestions per label
bias_suggestions = {
    "Explicit Hate": {
        "suggestion": "Consider using language that promotes inclusivity and respect.",
        "explanation": "The text contains explicit hate speech, which is overtly harmful and discriminatory. It is important to foster communication that is inclusive and respectful of all individuals."
    },
    "Implicit Hate": {
        "suggestion": "Try rephrasing to avoid subtle bias and ensure clarity.",
        "explanation": "The text contains implicit hate speech, which can perpetuate stereotypes and bias in a less overt manner. Aim for language that is clear and free from insinuations."
    },
    "White Grievance": {
        "suggestion": "Reconsider any generalized claims about racial groups.",
        "explanation": "The text appears to express grievances linked to racial identity, which can contribute to divisive narratives. Strive for dialogue that acknowledges diversity and avoids stereotyping."
    },
    "Non-Hate Speech": {
        "suggestion": "No problematic content detected.",
        "explanation": "The text does not appear to contain hate speech or bias. It seems respectful and neutral."
    },
    "Microaggression": {
        "suggestion": "Be mindful of how certain phrases can be interpreted by others.",
        "explanation": "The text includes phrases that may be considered microaggressions, which can subtly perpetuate stereotypes or biases."
    }
}

def analyze_text(text):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs)

    # Convert logits to prediction probabilities
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predicted_class = torch.argmax(probs, dim=-1).item()

    # Map the predicted class to the nuanced label
    label = nuanced_labels.get(predicted_class, "Unknown")

    # Check for microaggressions using the predefined rules; a rule match
    # overrides the model's prediction
    for phrase, phrase_explanation in microaggressions.items():
        if phrase.lower() in text.lower():
            label = "Microaggression"
            suggestion = bias_suggestions[label]["suggestion"]
            # Combine the generic label explanation with the phrase-specific one
            explanation = f"{bias_suggestions[label]['explanation']} {phrase_explanation}"
            return label, suggestion, explanation

    # Fetch the suggestion and explanation for the predicted label,
    # falling back gracefully if the label has no entry in bias_suggestions
    entry = bias_suggestions.get(label, {
        "suggestion": "No suggestion available.",
        "explanation": "The model returned an unrecognized label."
    })
    return label, entry["suggestion"], entry["explanation"]

# Create the Gradio interface
interface = gr.Interface(
    fn=analyze_text,
    inputs=gr.Textbox(lines=5, placeholder="Enter text to analyze..."),
    outputs=[
        gr.Textbox(label="Classification"),
        gr.Textbox(label="Suggestion"),
        gr.Textbox(label="Explanation")
    ],
    title="Proofreading for Implicit Bias, Microaggressions - initial model test",
    description=(
        "Analyze text for nuanced bias categories such as implicit hate, explicit hate, "
        "or white grievance, and detect microaggressions to provide suggestions for "
        "improvement - step 1: prompt testing. Credit to https://huggingface.co/BenjaminOcampo"
    )
)

# Launch the interface
if __name__ == "__main__":
    interface.launch()
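
# A minimal usage sketch: analyze_text can be called directly, bypassing the
# Gradio UI. The sample sentences below are illustrative assumptions; the label
# for the second call depends on the fine-tuned model's actual prediction.
#
#     label, suggestion, explanation = analyze_text("Where are you really from?")
#     # -> "Microaggression" (matched by the rule-based phrase check)
#
#     label, suggestion, explanation = analyze_text("Everyone deserves respect.")
#     # -> whichever nuanced label the classifier predicts, e.g. "Non-Hate Speech"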