# Hugging Face Spaces: dwb2023 / hf_extractor
# Status: Running on Zero
# File size: 3,949 Bytes

import os
import subprocess
import tempfile

import gradio as gr
from magika import Magika

# Detected content-type labels whose file contents are captured as text;
# files of any other detected type are reported by header only.
SUPPORTED_FILE_TYPES = [
    "txt",
    "python",
    "markdown",
    "yaml",
    "json",
    "csv",
    "tsv",
    "xml",
    "html",
]

def validate_url(url):
    """Return True when *url* is a string that starts with ``https://``.

    Non-string input (for example ``None``) is rejected with ``False``
    instead of raising ``AttributeError``.
    """
    return isinstance(url, str) and url.startswith('https://')

def clone_repo(url, repo_dir):
    """Clone the Git repository at *url* into *repo_dir*.

    Args:
        url: Repository location handed to ``git clone``.
        repo_dir: Destination directory for the checkout.

    Returns:
        ``(True, None)`` on success, otherwise ``(False, message)`` where
        *message* is git's stderr output, or the OS error text when the
        ``git`` executable could not be started.
    """
    env = os.environ.copy()
    # Keep Git LFS pointer files instead of downloading the large objects.
    env['GIT_LFS_SKIP_SMUDGE'] = '1'
    # Fail fast rather than waiting for credentials on a terminal that
    # nobody is attached to.
    env['GIT_TERMINAL_PROMPT'] = '0'
    try:
        result = subprocess.run(
            # "--" ends option parsing, so a url beginning with "-" can never
            # be interpreted as a git command-line option.
            ["git", "clone", "--", url, repo_dir],
            env=env,
            capture_output=True,
            text=True,
        )
    except OSError as exc:
        # git is missing or not executable: report it through the normal
        # (success, error) contract instead of crashing the caller.
        return False, str(exc)
    if result.returncode != 0:
        return False, result.stderr
    return True, None

def get_file_summary(file_path, file_type):
    """Build the header record describing a single file.

    Returns a dict with three keys: ``name`` (the path relative to the
    current working directory), ``type`` (the supplied *file_type* label)
    and ``size`` (the file size in bytes as reported by the filesystem).
    """
    byte_count = os.path.getsize(file_path)
    relative_name = os.path.relpath(file_path)
    return dict(name=relative_name, type=file_type, size=byte_count)

def read_file_content(file_path):
    """Return the full text of *file_path*, decoded as UTF-8.

    Byte sequences that are not valid UTF-8 are silently dropped
    (``errors="ignore"``) rather than raising a decode error.
    """
    with open(file_path, mode="r", encoding="utf-8", errors="ignore") as source:
        text = source.read()
    return text

def validate_file_types(directory):
    """Detect the content type of every file under *directory* with Magika.

    The ``.git`` metadata directory is never descended into; every other
    directory, including ones whose name merely contains ".git" (such as
    ``.github``), is scanned.

    Args:
        directory: Root of the tree to scan.

    Returns:
        A dict mapping each file path (walk root joined with the file name)
        to the content-type label reported by Magika, or to
        ``"Error: <message>"`` when the file could not be read or identified.
    """
    m = Magika()
    file_types = {}
    for root, dirs, files in os.walk(directory):
        # Prune by exact directory name, in place, so os.walk skips the whole
        # .git subtree. A substring test on the path would also drop
        # unrelated directories like ".github".
        dirs[:] = [d for d in dirs if d != '.git']
        for file_name in files:
            file_path = os.path.join(root, file_name)
            try:
                with open(file_path, 'rb') as file:
                    file_bytes = file.read()
                result = m.identify_bytes(file_bytes)
                file_types[file_path] = result.output.ct_label
            except Exception as e:
                # Best effort: record the failure as the file's "type" so one
                # unreadable file does not abort the scan.
                file_types[file_path] = f"Error: {str(e)}"
    return file_types

def extract_repo_content(url):
    """Clone the repository at *url* and collect a header and content per file.

    Args:
        url: Repository URL; it must pass ``validate_url``.

    Returns:
        A list of dicts, each with a ``"header"`` (``name``, ``type``,
        ``size``) and a ``"content"`` string. Content is the file text for
        supported types up to 1 MiB, otherwise a placeholder message.
        An invalid URL or a failed clone yields a single-element list whose
        header type is ``"error"``.
    """
    if not validate_url(url):
        return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": "Invalid URL"}]

    # A private scratch directory per call keeps concurrent requests from
    # deleting or overwriting each other's checkout, and the context manager
    # removes it even if processing below raises.
    with tempfile.TemporaryDirectory(prefix="hf_extractor_") as work_dir:
        repo_dir = os.path.join(work_dir, "temp_repo")

        success, error = clone_repo(url, repo_dir)
        if not success:
            return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": f"Failed to clone repository: {error}"}]

        file_types = validate_file_types(repo_dir)
        extracted_content = []
        for file_path, file_type in file_types.items():
            # Report names as "temp_repo/<path in repo>" no matter where the
            # scratch directory lives on disk.
            display_name = os.path.relpath(file_path, work_dir)
            try:
                file_summary = get_file_summary(file_path, file_type)
            except OSError as e:
                # e.g. a dangling symlink: report the entry instead of
                # aborting the whole extraction.
                extracted_content.append({
                    "header": {"name": display_name, "type": file_type, "size": 0},
                    "content": f"Failed to read file content: {str(e)}",
                })
                continue
            file_summary["name"] = display_name
            content = {"header": file_summary}

            if file_type in SUPPORTED_FILE_TYPES and file_summary["size"] <= 1024 * 1024:
                try:
                    content["content"] = read_file_content(file_path)
                except Exception as e:
                    content["content"] = f"Failed to read file content: {str(e)}"
            else:
                content["content"] = "File too large or binary, content not captured."

            extracted_content.append(content)

    return extracted_content

def format_output(extracted_content):
    """Render extracted file records as a single Markdown-style string.

    Each record that is a dict containing a ``'header'`` key becomes a
    section listing the file name, type, size and fenced content. Any other
    item contributes the line ``"Error in file data format."``.
    """
    sections = []
    for record in extracted_content:
        if not (isinstance(record, dict) and 'header' in record):
            sections.append("Error in file data format.\n")
            continue
        header = record['header']
        sections.append(
            f"### File: {header['name']}\n"
            f"**Type:** {header['type']}\n"
            f"**Size:** {header['size']} bytes\n"
            "#### Content:\n"
            f"```\n{record['content']}\n```\n\n"
        )
    return "".join(sections)

def extract_and_display(url):
    """Extract the repository at *url* and return its formatted text report."""
    return format_output(extract_repo_content(url))

# Build the single-page UI: a URL field, an output box, and a button that
# runs the extraction and writes the formatted result into the output box.
with gr.Blocks() as app:
    gr.Markdown("# Gradio Space/Model Content Extractor")
    url_input = gr.Textbox(label="Hugging Face Space/Model URL")
    output_display = gr.Textbox(
        placeholder="Output will be displayed here...",
        lines=20,
        show_copy_button=True,
    )
    extract_button = gr.Button("Extract Content")

    extract_button.click(
        fn=extract_and_display,
        inputs=url_input,
        outputs=output_display,
    )

# Runs whenever this module is executed or imported (there is no __main__ guard).
app.launch()