Spaces:

WordLift
/

synthID

Sleeping

File size: 9,076 Bytes

5ea9e86
9c1cc06
bf8477e
5ea9e86
a9e6964
 
5ed2c98
9c1cc06
f03955d
a9e6964
 
9c1cc06
a3c284e
9c1cc06
 
 
 
 
 
bf8477e
a3c284e
9c1cc06
a3c284e
9c1cc06
a3c284e
9c1cc06
bf8477e
f03955d
 
9c1cc06
 
 
a9e6964
 
9c1cc06
1ce31e1
4d833d7
4827b54
9c1cc06
5ed2c98
9c1cc06
4d833d7
 
 
9c1cc06
bf8477e
1ce31e1
9c1cc06
 
a3c284e
a9e6964
9c1cc06
 
 
 
bf8477e
a3c284e
9c1cc06
a9e6964
1ce31e1
 
 
 
d2f0972
 
1ce31e1
 
 
97627fd
9c1cc06
 
d2f0972
 
 
4d833d7
5ed2c98
4d833d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4827b54
4d833d7
 
5ed2c98
97627fd
 
1ce31e1
4827b54
 
 
1ce31e1
 
9c1cc06
d2f0972
9c1cc06
f03955d
a9e6964
 
 
 
 
 
 
 
f03955d
a3c284e
a9e6964
f03955d
eb0691b
f03955d
eb0691b
a9e6964
 
 
 
 
5ea9e86
 
a9e6964
 
5ea9e86
 
5ed2c98
a9e6964
 
 
a3c284e
 
 
 
 
a9e6964
 
 
5ea9e86
 
 
f03955d
 
 
 
97627fd
120c013
f03955d
 
 
 
 
 
 
 
120c013
f03955d
 
 
 
 
 
 
 
 
 
180ea05
f03955d
 
 
 
 
5ea9e86
eb0691b
5ea9e86
a3c284e
 
 
 
 
eb0691b
 
a9e6964
5ea9e86
 
a9e6964
 
a3c284e
f03955d
a9e6964
180ea05
1ce31e1
 
 
a3c284e
180ea05
 
eb0691b
180ea05
a3c284e
5ea9e86

import gradio as gr
import requests
import json

class SynthIDApp:
    """Backend for the SynthID text watermarking demo.

    Holds the Hugging Face Inference API endpoint, the authorization headers
    set by ``login`` and the watermark keys sent with each request.  Every
    public method reports failures as a message string instead of raising,
    so the return values can be shown to the user as-is.
    """

    # Seconds to wait for the Inference API before giving up on a request.
    # Without a timeout ``requests`` may block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self.api_url = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
        # ``None`` means "not logged in"; set to a header dict by ``login``.
        self.headers = None
        # Keys sent in the ``watermarking_config`` of every watermark request.
        self.WATERMARK_KEYS = [654, 400, 836, 123, 340, 443, 597, 160, 57, 789]

    def login(self, hf_token):
        """Store the auth header for ``hf_token`` and verify it with a test call.

        Args:
            hf_token: Hugging Face access token; surrounding whitespace is
                ignored.

        Returns:
            A status message.  On any failure ``self.headers`` is reset to
            ``None`` so later calls are rejected as "not initialized".
        """
        token = (hf_token or "").strip()
        if not token:
            # Fail fast instead of sending an empty "Bearer " header.
            self.headers = None
            return "Error initializing API: no token provided"

        try:
            self.headers = {"Authorization": f"Bearer {token}"}

            # Test the connection with a minimal one-token generation.
            response = requests.post(
                self.api_url,
                headers=self.headers,
                json={"inputs": "Test", "parameters": {"max_new_tokens": 1}},
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()

            return "API connection initialized successfully!"
        except Exception as e:
            self.headers = None
            return f"Error initializing API: {str(e)}"

    @staticmethod
    def _extract_response(generated_text, text):
        """Return the model's reply with the echoed prompt stripped off."""
        # Preferred: everything after the last instruction terminator.
        _, marker, reply = generated_text.rpartition("[/INST]")
        if marker:
            return reply.strip()

        # No terminator: fall back to whatever follows the original text.
        idx = generated_text.find(text)
        if idx != -1:
            return generated_text[idx + len(text):].strip()

        # Nothing recognisable to cut away; use the whole output.
        return generated_text

    def apply_watermark(self, text, ngram_len):
        """Ask the inference API to reproduce ``text`` with a watermark config.

        Args:
            text: The text to watermark.
            ngram_len: N-gram length forwarded in the watermarking config
                (converted with ``int``).

        Returns:
            A ``(output_text, status_message)`` tuple.  On any error the
            original ``text`` is returned unchanged together with an error
            message.
        """
        if not self.headers:
            return text, "Error: API not initialized. Please login first."

        try:
            if not text or not text.strip():
                # Nothing to watermark; avoid a pointless API round trip.
                return text, "Error: No input text provided"

            # Prepare the API request parameters for watermarking.
            # NOTE(review): whether the hosted endpoint honours
            # ``watermarking_config`` cannot be verified from this file --
            # confirm against the Inference API documentation.
            prompt = f"<s>[INST] Return the exact same text, with watermark applied: {text} [/INST]"
            params = {
                "inputs": prompt,
                "parameters": {
                    "return_full_text": True,
                    "do_sample": False,     # Deterministic generation
                    "temperature": 0.01,    # Almost deterministic
                    "watermarking_config": {
                        "keys": self.WATERMARK_KEYS,
                        "ngram_len": int(ngram_len)
                    }
                }
            }

            # Make the API call exactly once, bounded by a timeout.
            response = requests.post(
                self.api_url,
                headers=self.headers,
                json=params,
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()

            result = response.json()
            if not (isinstance(result, list) and len(result) > 0):
                return text, f"Error: Unexpected API response format: {str(result)}"
            if 'error' in result[0]:
                return text, f"API Error: {result[0]['error']}"

            generated_text = result[0].get('generated_text', '').strip()

            # Extract only the response part after the instruction.
            try:
                watermarked_text = self._extract_response(generated_text, text)
            except Exception as e:
                return text, f"Error processing response: {str(e)}"

            # Clean up the text
            watermarked_text = watermarked_text.strip(' .')
            if not watermarked_text:
                return text, "Error: No watermarked text generated"

            # Add back the period if the original had one
            if text.strip().endswith('.'):
                watermarked_text += '.'

            return watermarked_text, f"Watermark applied successfully! (ngram_len: {ngram_len})"
        except Exception as e:
            return text, f"Error applying watermark: {str(e)}"

    def analyze_text(self, text):
        """Return a short report with character count, word count and mean word length.

        Words are whitespace-separated tokens.  Any failure (for example a
        non-string ``text``) is reported as an error message string.
        """
        try:
            words = text.split()
            total_words = len(words)
            avg_word_length = sum(map(len, words)) / total_words if total_words > 0 else 0
            char_count = len(text)

            analysis = f"""Text Analysis:
- Total characters: {char_count}
- Total words: {total_words}
- Average word length: {avg_word_length:.2f}

Note: This is a basic analysis. The official SynthID detector is not yet available in the public transformers package."""

            return analysis
        except Exception as e:
            return f"Error analyzing text: {str(e)}"

# Create Gradio interface.
# A single SynthIDApp instance is shared by all event handlers below, so the
# headers stored by ``login`` are what ``apply_watermark`` later reads.
app_instance = SynthIDApp()

with gr.Blocks(title="SynthID Text Watermarking Tool") as app:
    gr.Markdown("# SynthID Text Watermarking Tool")
    gr.Markdown("Using Mistral-7B-Instruct-v0.2 with Hugging Face Inference API")

    # Login section: token in, status message out.
    with gr.Row():
        hf_token = gr.Textbox(
            label="Enter Hugging Face Token",
            type="password",
            placeholder="hf_..."
        )
        login_status = gr.Textbox(label="Login Status")
    login_btn = gr.Button("Login")
    login_btn.click(app_instance.login, inputs=[hf_token], outputs=[login_status])

    # Tab 1: send text through apply_watermark and show the result + status.
    with gr.Tab("Apply Watermark"):
        with gr.Row():
            with gr.Column(scale=3):
                input_text = gr.Textbox(
                    label="Input Text",
                    lines=5,
                    placeholder="Enter text to watermark...",
                    value="Test Sentence: WordLift is a cutting-edge platform designed to enhance your digital content by leveraging the power of semantic technology. It transforms your website into a structured repository of knowledge, making your content more discoverable, engaging, and aligned with modern search engine algorithms. By utilizing AI-driven entity extraction and knowledge graph generation, WordLift helps you bridge the gap between your content and search intent, ensuring optimal visibility and performance."
                )
                output_text = gr.Textbox(label="Watermarked Text", lines=5)
            with gr.Column(scale=1):
                # The slider's initial value is the default the help text
                # below refers to; keep the two in sync.
                ngram_len = gr.Slider(
                    label="N-gram Length",
                    minimum=2,
                    maximum=5,
                    step=1,
                    value=2,
                    info="Controls watermark detectability (2-5)"
                )
                status = gr.Textbox(label="Status")

        gr.Markdown("""
        ### N-gram Length Parameter:
        - Higher values (4-5): More detectable watermark, but more brittle to changes
        - Lower values (2-3): More robust to changes, but harder to detect
        - Default (2): Maximum robustness to changes""")

        apply_btn = gr.Button("Apply Watermark")
        apply_btn.click(
            app_instance.apply_watermark,
            inputs=[input_text, ngram_len],
            outputs=[output_text, status]
        )

    # Tab 2: local text statistics via analyze_text.
    with gr.Tab("Analyze Text"):
        with gr.Row():
            analyze_input = gr.Textbox(
                label="Text to Analyze",
                lines=5,
                placeholder="Enter text to analyze..."
            )
            analyze_result = gr.Textbox(label="Analysis Result", lines=5)
        analyze_btn = gr.Button("Analyze Text")
        analyze_btn.click(app_instance.analyze_text, inputs=[analyze_input], outputs=[analyze_result])

    gr.Markdown("""
    ### Instructions:
    1. Enter your Hugging Face token and click Login
    2. Once connected, you can use the tabs to apply watermarks or analyze text
    3. Adjust the N-gram Length slider to control watermark characteristics
    
    ### Notes:
    - The watermarking process attempts to maintain the original meaning while adding the watermark
    - If you get unexpected results, try adjusting the n-gram length or slightly rephrasing your text
    - This is an experimental feature using the Inference API
    - No model download required - everything runs in the cloud
    - The watermark is designed to be imperceptible to humans
    - This demo only implements watermark application
    - The official detector will be available in future releases
    - For production use, use your own secure watermark keys
    - Your token is never stored and is only used for API access
    """)

# Launch the app: start the Gradio server only when this file is executed as
# a script, so importing the module does not start it.
if __name__ == "__main__":
    app.launch()