import os
import requests
import streamlit as st
from huggingface_hub import InferenceClient
from prompt import default_prompt, prompt_enhanced
# Function to load the README.md directly from the Hugging Face API
def load_readme(dataset_name: str):
    """Fetch the raw README.md of a Hugging Face dataset.

    Args:
        dataset_name: Repository path on the Hub, e.g. "amirveyseh/acronym_identification".

    Returns:
        The README.md text, or None if the request failed (an error is shown in the UI).
    """
    api_url = f"https://huggingface.co/datasets/{dataset_name}/raw/main/README.md"
    try:
        # Timeout added: without one, a stalled connection hangs the Streamlit app forever.
        response = requests.get(api_url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        st.error(f"Error loading the README.md: {e}")
        return None
    return response.text
# Function to check if the README content exceeds the token limit
def check_token_limit(content: str, max_tokens: int = 7500):
    """Truncate *content* when its estimated token count exceeds *max_tokens*.

    Tokens are estimated with the rough heuristic of ~4 characters per token.

    Args:
        content: The README.md text to check.
        max_tokens: Maximum estimated tokens allowed before truncation.

    Returns:
        The original content, or a truncated copy (a warning is shown in the UI).
    """
    # Estimated token count: ~4 characters per token for typical English text.
    if len(content) // 4 > max_tokens:
        # Bug fix: truncate to max_tokens * 4 CHARACTERS (≈ max_tokens tokens).
        # The previous slice content[:max_tokens] kept only max_tokens characters
        # (≈ max_tokens / 4 tokens), discarding ~75% of the allowed budget.
        truncated_content = content[:max_tokens * 4]
        # NOTE(review): the message cites 8192 tokens while the default limit is
        # 7500 — presumably a safety margin below the model context; confirm.
        st.warning("Warning: The README.md exceeds 8192 tokens. It has been truncated for evaluation. This may affect the quality of the evaluation results.")
        return truncated_content
    return content
# Function to evaluate the quality of the dataset card
def evaluate_readme(readme_content: str, user_prompt: str):
    """Run an LLM-based legal evaluation of a dataset card.

    Args:
        readme_content: The (possibly truncated) README.md text to evaluate.
        user_prompt: Prompt template containing a ``{readme_content}`` placeholder.

    Returns:
        The model's evaluation text, or None if no inference token is configured
        (an error is shown in the UI).
    """
    # Retrieve the inference token from environment variables
    hf_token = os.getenv('HF_TOKEN_INFERENCE')
    # Ensure the token is available
    if not hf_token:
        st.error("The Hugging Face inference token is not configured. Please ensure HF_TOKEN_INFERENCE is set.")
        return None
    # Initialize the inference client with the specified model
    client = InferenceClient(model="meta-llama/Llama-3.3-70B-Instruct", token=hf_token)
    # User-customizable prompt
    prompt = user_prompt.format(readme_content=readme_content)
    messages = [
        {'role': 'system', 'content': "You are an expert in legal field especially in Artificial Intelligence and data privacy."},
        {'role': 'user', 'content': prompt}
    ]
    # Call the model to get an evaluation.
    # Bug fixes: the previous call overrode the client's model with the older
    # "meta-llama/Meta-Llama-3-70B-Instruct" (contradicting the client setup),
    # and passed tool_choice="auto" without any tools declared.
    response = client.chat_completion(
        messages=messages,
        max_tokens=500,
    )
    return response['choices'][0]['message']['content']
# Streamlit Interface
def main():
    """Render the dataset-card legal-audit UI and drive the evaluation flow."""
    st.title("Legal Audit of Dataset Cards")
    st.write("This Space provides an automated tool for auditing dataset cards from a legal perspective.It evaluates dataset documentation based on key legal criteria, such as compliance with data privacy regulations, ethical considerations, and transparency of information.")

    dataset_path = st.text_input("Path to HF Dataset (e.g., amirveyseh/acronym_identification)")
    if not dataset_path:
        return

    # Load and display the dataset's README.md; bail out if the fetch failed.
    readme_text = load_readme(dataset_path)
    if not readme_text:
        return

    # Enforce the token budget before showing/evaluating the content.
    readme_text = check_token_limit(readme_text)
    st.subheader("README.md content:")
    st.text_area("README.md", readme_text, height=200)

    # Button to evaluate the documentation
    if st.button("Evaluate dataset documentation"):
        with st.spinner("Audit in progress..."):
            result = evaluate_readme(readme_text, prompt_enhanced)
        if result:
            st.subheader("Evaluation Result:")
            st.write(result)


if __name__ == "__main__":
    main()