Spaces:

dwb2023
/

hf_extractor

Running on Zero

File size: 4,663 Bytes

98d1d12
 
4006c1a
0f701bd
5e040c2
 
c700267
def12f5
c700267
 
 
def12f5
c700267
 
 
 
 
4006c1a
e33200c
 
 
 
 
c700267
98d1d12
 
c700267
 
e361a15
98d1d12
 
 
 
1ca0012
98d1d12
4006c1a
98d1d12
 
 
4006c1a
 
98d1d12
6be117b
98d1d12
 
0f701bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c700267
e33200c
 
 
98d1d12
 
 
4006c1a
c700267
98d1d12
e33200c
4006c1a
0f701bd
4006c1a
0f701bd
1ca0012
0f701bd
 
8f97bb1
0f701bd
 
 
 
 
 
 
 
4006c1a
e33200c
 
 
4006c1a
 
fce2161
 
4006c1a
bbd42f8
 
 
 
 
 
 
 
4006c1a
 
 
c700267
fce2161
4006c1a
 
 
 
 
 
 
0e324a0
4006c1a

import os
import subprocess
import gradio as gr
from magika import Magika
from huggingface_hub import login

# Read the Hub access token and the account name from environment variables.
hf_token = os.environ.get("HF_TOKEN")
hf_user = os.environ.get("SPACE_AUTHOR_NAME")

# Fail fast at import time when either value is missing or empty.
if not hf_token:
    raise ValueError("HF_TOKEN environment variable is not set")
if not hf_user:
    raise ValueError("SPACE_AUTHOR_NAME environment variable is not set")

# Authenticate against the Hub; the flag asks huggingface_hub to also store
# the token as a git credential.
login(add_to_git_credential=True, token=hf_token)

# Content-type labels whose file contents are included in the extracted output.
SUPPORTED_FILE_TYPES = [
    "txt",
    "python",
    "markdown",
    "yaml",
    "json",
    "csv",
    "tsv",
    "xml",
    "html",
]

def validate_url(url, allowed_hosts=("huggingface.co",)):
    """Return True only for HTTPS URLs that point at an allowed git host.

    The URL accepted here is later combined with the Hub access token when
    cloning, so accepting an arbitrary HTTPS host would hand that token to
    whoever controls the host. The check therefore pins the hostname.

    Args:
        url: The repository URL supplied by the user.
        allowed_hosts: Hostnames (lower case) that may receive credentials.

    Returns:
        True if *url* is a string that starts with ``https://``, contains no
        whitespace, carries no embedded user/password, and whose hostname is
        one of *allowed_hosts*; otherwise False.
    """
    from urllib.parse import urlsplit

    if not isinstance(url, str) or not url.startswith('https://'):
        return False
    # urlsplit silently drops tabs/newlines; reject them so the string that is
    # validated is the same string that is later passed to git.
    if any(ch.isspace() for ch in url):
        return False
    try:
        parts = urlsplit(url)
        host = parts.hostname
        has_credentials = parts.username is not None or parts.password is not None
    except ValueError:
        # Malformed authority section (e.g. an unterminated IPv6 literal).
        return False
    if has_credentials or host is None:
        # "https://huggingface.co@evil.example/" parses with host evil.example;
        # refuse any URL that already carries a userinfo component.
        return False
    return host.lower() in allowed_hosts

def clone_repo(url, repo_dir, hf_token, hf_user):
    """Clone *url* into *repo_dir* with git, authenticating through the URL.

    Args:
        url: HTTPS URL of the repository to clone.
        repo_dir: Destination directory for the clone.
        hf_token: Access token used as the password part of the URL.
        hf_user: Account name used as the user part of the URL.

    Returns:
        ``(True, None)`` when git exits with status 0, otherwise
        ``(False, message)`` where *message* is git's stderr (with the token
        masked) or a short description of why git could not be started.
    """
    from urllib.parse import quote

    env = os.environ.copy()
    # git-lfs setting: check out LFS pointer files instead of downloading the
    # large objects they refer to.
    env['GIT_LFS_SKIP_SMUDGE'] = '1'

    # Percent-encode the credentials so characters such as '@' or ':' cannot
    # change how the URL is parsed, and only rewrite the leading scheme.
    user_part = quote(hf_user, safe='')
    secret_part = quote(hf_token, safe='')
    token_url = url.replace('https://', f'https://{user_part}:{secret_part}@', 1)

    try:
        # "--" stops option parsing so a URL or path can never be read as a flag.
        result = subprocess.run(
            ["git", "clone", "--", token_url, repo_dir],
            env=env,
            capture_output=True,
            text=True,
        )
    except OSError as exc:
        # git missing or not executable: report it like any other clone failure.
        return False, f"Unable to run git: {exc}"

    if result.returncode != 0:
        # git may echo the URL it was given; never pass the token on to callers.
        message = result.stderr
        for secret in {hf_token, secret_part}:
            if secret:
                message = message.replace(secret, "***")
        return False, message
    return True, None

def get_file_summary(file_path, file_type):
    """Build the header record for one file.

    Args:
        file_path: Path of the file on disk.
        file_type: Content-type label to store alongside the file.

    Returns:
        A dict with ``name`` (the path relative to the current working
        directory), ``type`` (the given label) and ``size`` (bytes on disk).
    """
    byte_count = os.path.getsize(file_path)
    summary = {}
    summary["name"] = os.path.relpath(file_path)
    summary["type"] = file_type
    summary["size"] = byte_count
    return summary

def read_file_content(file_path):
    """Return the whole file as text.

    The file is decoded as UTF-8; bytes that cannot be decoded are dropped
    rather than raising.
    """
    with open(file_path, mode="r", encoding="utf-8", errors="ignore") as handle:
        text = handle.read()
    return text

def validate_file_types(directory):
    """Identify the content type of every regular file under *directory*.

    Directories named exactly ``.git`` are pruned from the walk. Symbolic
    links are skipped: the tree may come from an untrusted repository, and a
    link could point at a file outside it or at an endless device.

    Args:
        directory: Root of the tree to scan.

    Returns:
        A dict mapping each file path to the label Magika reports for its
        bytes, or to ``"Error: <message>"`` when the file could not be read
        or identified.
    """
    m = Magika()
    file_types = {}
    for root, dirs, files in os.walk(directory):
        # Prune in place so os.walk never descends into git metadata. Matching
        # the exact name keeps directories such as ".github" in the scan.
        dirs[:] = [name for name in dirs if name != '.git']
        for file_name in files:
            file_path = os.path.join(root, file_name)
            if os.path.islink(file_path):
                continue
            try:
                with open(file_path, 'rb') as file:
                    file_bytes = file.read()
                result = m.identify_bytes(file_bytes)
                file_types[file_path] = result.output.ct_label
            except Exception as e:
                # Best effort: record the failure for this file and keep going.
                file_types[file_path] = f"Error: {str(e)}"
    return file_types

def extract_repo_content(url, hf_token, hf_user):
    """Clone a repository and collect a header plus content for each file.

    Each call works in its own temporary directory, so simultaneous requests
    cannot delete or overwrite each other's clone, and the directory is
    removed on every exit path (including exceptions).

    Args:
        url: Repository URL; must pass ``validate_url``.
        hf_token: Access token forwarded to ``clone_repo``.
        hf_user: Account name forwarded to ``clone_repo``.

    Returns:
        A list of ``{"header": {...}, "content": str}`` dicts. On an invalid
        URL or a failed clone the list holds a single entry whose header type
        is ``"error"``. File names in headers are relative to the repository
        root. Content is captured only for files whose type is in
        ``SUPPORTED_FILE_TYPES`` and whose size is at most 256 KiB.
    """
    import tempfile

    if not validate_url(url):
        return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": "Invalid URL"}]

    with tempfile.TemporaryDirectory(prefix="temp_repo_") as work_dir:
        # Clone into a not-yet-existing child so git creates the target itself.
        repo_dir = os.path.join(work_dir, "repo")

        success, error = clone_repo(url, repo_dir, hf_token, hf_user)
        if not success:
            return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": f"Failed to clone repository: {error}"}]

        file_types = validate_file_types(repo_dir)
        extracted_content = []
        for file_path, file_type in file_types.items():
            # The repository is untrusted: a symlink could point at any file
            # readable by this process, so never follow one.
            if os.path.islink(file_path):
                continue

            file_summary = get_file_summary(file_path, file_type)
            # Report the path inside the repository, not the scratch location.
            file_summary["name"] = os.path.relpath(file_path, repo_dir)
            content = {"header": file_summary}

            if file_type in SUPPORTED_FILE_TYPES and file_summary["size"] <= 256 * 1024:
                try:
                    content["content"] = read_file_content(file_path)
                except Exception as e:
                    content["content"] = f"Failed to read file content: {str(e)}"
            else:
                content["content"] = "File too large or binary, content not captured."

            extracted_content.append(content)

    return extracted_content

def format_output(extracted_content, repo_url):
    """Render extracted file records as one Markdown-style string.

    Args:
        extracted_content: Iterable of records; each is expected to be a dict
            with a ``header`` (``name``, ``type``, ``size``) and ``content``.
        repo_url: URL shown in the first heading.

    Returns:
        The heading followed by one section per record. A record that is not
        a dict, or has no ``header`` key, yields a one-line error notice.
    """
    pieces = [f"# Repository URL: {repo_url}\n\n"]
    for entry in extracted_content:
        well_formed = isinstance(entry, dict) and 'header' in entry
        if not well_formed:
            pieces.append("Error in file data format.\n")
            continue
        pieces.append(f"### File: {entry['header']['name']}\n")
        pieces.append(f"**Type:** {entry['header']['type']}\n")
        pieces.append(f"**Size:** {entry['header']['size']} bytes\n")
        pieces.append("#### Content:\n")
        pieces.append(f"```\n{entry['content']}\n```\n\n")
    return "".join(pieces)

def extract_and_display(url):
    """Extract the repository at *url* and return the formatted report.

    Uses the module-level ``hf_token`` and ``hf_user`` for authentication.
    """
    return format_output(extract_repo_content(url, hf_token, hf_user), url)

# Single-page UI: a URL box, a read-out box and a button that connects them.
with gr.Blocks() as app:
    gr.Markdown("# Gradio Space/Model Content Extractor")
    url_input = gr.Textbox(label="Hugging Face Space/Model URL")
    output_display = gr.Textbox(
        show_copy_button=True,
        lines=20,
        placeholder="Output will be displayed here...",
    )
    extract_button = gr.Button("Extract Content")

    # On click, pass the URL text to extract_and_display and show its result.
    extract_button.click(
        fn=extract_and_display,
        inputs=url_input,
        outputs=output_display,
    )

app.launch()