File size: 3,420 Bytes
5c5cb78
 
 
 
 
640ae52
 
5c5cb78
 
 
 
 
 
 
 
 
 
 
 
 
 
b7383a5
640ae52
 
5c5cb78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c6e5853
5c5cb78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5739ac6
337f2d5
 
5c5cb78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import os
import requests
import streamlit as st
from huggingface_hub import InferenceClient
from prompt import default_prompt, prompt_enhanced


# Function to load the README.md directly from the Hugging Face API
def load_readme(dataset_name: str):
    """Fetch the raw README.md of a Hugging Face dataset.

    Args:
        dataset_name: Repo path on the Hub, e.g. "amirveyseh/acronym_identification".

    Returns:
        The README.md text, or None if the request failed (the error is
        surfaced in the Streamlit UI instead of being raised).
    """
    api_url = f"https://huggingface.co/datasets/{dataset_name}/raw/main/README.md"

    try:
        # Explicit timeout so a stalled connection cannot hang the Streamlit
        # app indefinitely (requests has no default timeout).
        response = requests.get(api_url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        st.error(f"Error loading the README.md: {e}")
        return None

    return response.text

# Function to check if the README content exceeds the token limit
def check_token_limit(content: str, max_tokens: int = 7500):
    """Truncate *content* if its estimated token count exceeds ``max_tokens``.

    Token count is estimated with the rough ~4-characters-per-token heuristic.

    Bug fix: the original compared the *estimated token count* against
    ``max_tokens`` but then truncated to ``max_tokens`` **characters**
    (i.e. only ~max_tokens/4 tokens), discarding far more text than
    necessary. Truncation now keeps ~``max_tokens`` tokens of content.

    Args:
        content: README.md text to check.
        max_tokens: Approximate token budget for the downstream model call.

    Returns:
        The original content, or a truncated copy if it exceeded the budget
        (a warning is shown in the Streamlit UI in that case).
    """
    max_chars = max_tokens * 4  # ~4 characters per token heuristic
    if len(content) > max_chars:
        st.warning("Warning: The README.md exceeds 8192 tokens. It has been truncated for evaluation. This may affect the quality of the evaluation results.")
        return content[:max_chars]
    return content

# Function to evaluate the quality of the dataset card
def evaluate_readme(readme_content: str, user_prompt: str):
    """Run an LLM-based legal audit of a dataset card.

    Args:
        readme_content: The (possibly truncated) README.md text.
        user_prompt: Prompt template containing a ``{readme_content}`` placeholder.

    Returns:
        The model's evaluation text, or None if the inference token is not set.
    """
    # Retrieve the inference token from environment variables
    hf_token = os.getenv('HF_TOKEN_INFERENCE')

    # Ensure the token is available
    if not hf_token:
        st.error("The Hugging Face inference token is not configured. Please ensure HF_TOKEN_INFERENCE is set.")
        return None

    # Initialize the inference client with the model to audit with.
    # NOTE(review): the original also passed a *different* model name
    # ("meta-llama/Meta-Llama-3-70B-Instruct") to chat_completion, which made
    # the effective model ambiguous; the model is now configured in one place.
    client = InferenceClient(model="meta-llama/Llama-3.3-70B-Instruct", token=hf_token)

    # Fill the user-customizable prompt template with the README content.
    prompt = user_prompt.format(readme_content=readme_content)

    messages = [
        {'role': 'system', 'content': "You are an expert in legal field especially in Artificial Intelligence and data privacy."},
        {'role': 'user', 'content': prompt}
    ]

    # Call the model to get an evaluation. The original request included
    # tool_choice="auto" although no tools were declared; it was dropped
    # because tool selection is meaningless without a tools list.
    response = client.chat_completion(
        messages=messages,
        max_tokens=500,
    )

    # chat_completion returns a ChatCompletionOutput dataclass; use the
    # documented attribute access rather than dict-style subscripting.
    return response.choices[0].message.content

# Streamlit Interface
def main():
    """Streamlit entry point: collect a dataset path, show its README, audit it."""
    st.title("Legal Audit of Dataset Cards")
    # Fixed missing space after "perspective." in the user-facing description.
    st.write(
        "This Space provides an automated tool for auditing dataset cards "
        "from a legal perspective. It evaluates dataset documentation based "
        "on key legal criteria, such as compliance with data privacy "
        "regulations, ethical considerations, and transparency of information."
    )
    dataset_name = st.text_input("Path to HF Dataset (e.g., amirveyseh/acronym_identification)")

    if dataset_name:
        # Load and display the dataset's README.md
        readme = load_readme(dataset_name)

        if readme:
            # Check for token limit and truncate if necessary
            readme = check_token_limit(readme)

            st.subheader("README.md content:")
            st.text_area("README.md", readme, height=200)

            # Button to evaluate the documentation
            if st.button("Evaluate dataset documentation"):
                with st.spinner("Audit in progress..."):
                    evaluation_result = evaluate_readme(readme, prompt_enhanced)
                    if evaluation_result:
                        st.subheader("Evaluation Result:")
                        st.write(evaluation_result)

if __name__ == "__main__":
    main()