Spaces:

dwb2023
/

hf_extractor

Running on Zero

File size: 3,997 Bytes

98d1d12
 
4006c1a
0f701bd
4006c1a
e33200c
 
 
 
 
98d1d12
 
 
 
 
 
 
 
1ca0012
98d1d12
4006c1a
98d1d12
 
 
4006c1a
 
98d1d12
6be117b
98d1d12
 
0f701bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4006c1a
e33200c
 
 
98d1d12
 
 
4006c1a
98d1d12
 
e33200c
4006c1a
0f701bd
4006c1a
0f701bd
1ca0012
0f701bd
 
e33200c
0f701bd
 
 
 
 
 
 
 
4006c1a
e33200c
 
 
4006c1a
 
fce2161
 
4006c1a
bbd42f8
 
 
 
 
 
 
 
4006c1a
 
 
 
fce2161
4006c1a
 
 
 
 
 
 
0e324a0
4006c1a

import os
import subprocess
import gradio as gr
from magika import Magika

# Detected content-type labels whose file contents are captured as text;
# files with any other label only get a header entry in the output.
SUPPORTED_FILE_TYPES = [
    "txt",
    "python",
    "markdown",
    "yaml",
    "json",
    "csv",
    "tsv",
    "xml",
    "html",
]

def validate_url(url):
    """Return True when *url* is an ``https://`` URL that names a host.

    Args:
        url: Candidate repository URL. Any non-string value (for example
            ``None``) is rejected instead of raising.

    Returns:
        bool: True only if ``url`` is a string that starts with
        ``https://`` and has a non-empty network location.
    """
    # Local import keeps this block self-contained; the module does not
    # import urllib at file level.
    from urllib.parse import urlparse

    if not isinstance(url, str) or not url.startswith('https://'):
        return False
    try:
        # "https://" alone (or "https:///path") has no host to clone from.
        return bool(urlparse(url).netloc)
    except ValueError:
        # urlparse rejects some malformed authorities (e.g. a bad IPv6 literal).
        return False

def clone_repo(url, repo_dir):
    """Clone the repository at *url* into *repo_dir* using the ``git`` CLI.

    Args:
        url: Repository location handed to ``git clone``.
        repo_dir: Destination directory for the checkout.

    Returns:
        tuple: ``(True, None)`` on success, otherwise ``(False, message)``
        where ``message`` is git's stderr output, or a description of why
        git could not be started.
    """
    env = os.environ.copy()
    # Skip downloading Git LFS payloads; only pointer files are checked out.
    env['GIT_LFS_SKIP_SMUDGE'] = '1'
    # Never block waiting for credentials on a terminal; fail instead.
    env['GIT_TERMINAL_PROMPT'] = '0'
    try:
        result = subprocess.run(
            # "--" ends option parsing so neither argument can be read as a flag.
            ["git", "clone", "--", url, repo_dir],
            env=env,
            capture_output=True,
            text=True,
        )
    except OSError as exc:
        # git is missing or not executable: report it through the normal
        # (success, error) contract rather than raising to the caller.
        return False, f"Unable to run git: {exc}"
    if result.returncode != 0:
        return False, result.stderr
    return True, None

def get_file_summary(file_path, file_type):
    """Build the header record describing one file.

    Args:
        file_path: Path of the file on disk.
        file_type: Label to store under ``"type"``; passed through unchanged.

    Returns:
        dict: ``name`` (path relative to the current working directory),
        ``type`` and ``size`` (bytes, from ``os.path.getsize``).
    """
    byte_count = os.path.getsize(file_path)
    display_name = os.path.relpath(file_path)
    return dict(name=display_name, type=file_type, size=byte_count)

def read_file_content(file_path):
    """Return the whole file at *file_path* decoded as UTF-8 text.

    Bytes that are not valid UTF-8 are silently dropped
    (``errors="ignore"``) rather than raising a decode error.
    """
    with open(file_path, mode="r", encoding="utf-8", errors="ignore") as handle:
        text = handle.read()
    return text

def validate_file_types(directory):
    """Identify the content type of every regular file under *directory*.

    Walks the tree with ``os.walk``, reads each file's bytes and asks Magika
    to identify them.

    Args:
        directory: Root directory to scan.

    Returns:
        dict: Maps each file path (``root`` joined with the file name) to
        the label Magika reports, or to ``"Error: <message>"`` when the file
        could not be read or identified.
    """
    m = Magika()
    file_types = {}
    for root, dirs, files in os.walk(directory):
        # Prune git metadata by exact directory name. A substring test on the
        # path would also drop unrelated directories such as ".github".
        # Assigning to dirs[:] stops os.walk from descending into it.
        dirs[:] = [d for d in dirs if d != '.git']
        for file_name in files:
            file_path = os.path.join(root, file_name)
            # The tree comes from an untrusted repository: a symlink could
            # point outside the checkout, so never open through one.
            if os.path.islink(file_path):
                continue
            try:
                with open(file_path, 'rb') as file:
                    file_bytes = file.read()
                result = m.identify_bytes(file_bytes)
                file_types[file_path] = result.output.ct_label
            except Exception as e:
                # Best effort: record the failure for this file and keep
                # scanning the rest of the tree.
                file_types[file_path] = f"Error: {str(e)}"
    return file_types

def extract_repo_content(url):
    """Clone the repository at *url* and collect a record for each file.

    Args:
        url: Repository URL; must pass ``validate_url``.

    Returns:
        list: One dict per file with a ``"header"`` (name, type, size) and a
        ``"content"`` string. On an invalid URL or a failed clone, the list
        holds a single record whose header type is ``"error"``.
    """
    # Local import: shutil is not among the module-level imports.
    import shutil

    if not validate_url(url):
        return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": "Invalid URL"}]

    # NOTE(review): every call shares this one path, so overlapping requests
    # would interfere with each other - confirm whether the app can run
    # extractions concurrently.
    repo_dir = "./temp_repo"
    # Remove leftovers from an earlier run; a missing directory is fine.
    shutil.rmtree(repo_dir, ignore_errors=True)

    try:
        success, error = clone_repo(url, repo_dir)
        if not success:
            return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": f"Failed to clone repository: {error}"}]

        file_types = validate_file_types(repo_dir)
        extracted_content = []
        for file_path, file_type in file_types.items():
            try:
                file_summary = get_file_summary(file_path, file_type)
            except OSError as e:
                # e.g. a dangling symlink: report this file and carry on
                # instead of aborting the whole extraction.
                extracted_content.append({
                    "header": {"name": os.path.relpath(file_path), "type": file_type, "size": 0},
                    "content": f"Failed to read file metadata: {str(e)}",
                })
                continue
            content = {"header": file_summary}

            if os.path.islink(file_path):
                # The repository is untrusted; a link may point outside the
                # checkout, so its target is never read.
                content["content"] = "Symbolic link, content not captured."
            elif file_type in SUPPORTED_FILE_TYPES and file_summary["size"] <= 1024 * 1024:
                try:
                    content["content"] = read_file_content(file_path)
                except Exception as e:
                    content["content"] = f"Failed to read file content: {str(e)}"
            else:
                content["content"] = "File too large or binary, content not captured."

            extracted_content.append(content)

        return extracted_content
    finally:
        # Always remove the checkout, including when an error escapes above.
        shutil.rmtree(repo_dir, ignore_errors=True)

def format_output(extracted_content, repo_url):
    """Render extracted file records as one Markdown-style string.

    Args:
        extracted_content: Iterable of records; each is expected to be a
            dict with a ``"header"`` (name, type, size) and a ``"content"``.
        repo_url: URL shown in the heading line.

    Returns:
        str: The heading followed by one section per record. Entries that
        are not dicts or lack a ``"header"`` key yield an error line.
    """
    pieces = [f"# Repository URL: {repo_url}\n\n"]
    for entry in extracted_content:
        if not (isinstance(entry, dict) and 'header' in entry):
            pieces.append("Error in file data format.\n")
            continue
        header = entry['header']
        pieces.append(
            f"### File: {header['name']}\n"
            f"**Type:** {header['type']}\n"
            f"**Size:** {header['size']} bytes\n"
            "#### Content:\n"
            f"```\n{entry['content']}\n```\n\n"
        )
    return "".join(pieces)

def extract_and_display(url):
    """Extract the repository at *url* and return it as formatted text.

    Passes the result of ``extract_repo_content(url)`` to ``format_output``
    together with the same URL.
    """
    return format_output(extract_repo_content(url), url)

# Assemble the UI: a URL box, an output box and a button that connects them.
with gr.Blocks() as app:
    gr.Markdown("# Gradio Space/Model Content Extractor")
    url_input = gr.Textbox(label="Hugging Face Space/Model URL")
    output_display = gr.Textbox(
        show_copy_button=True,
        lines=20,
        placeholder="Output will be displayed here...",
    )
    extract_button = gr.Button("Extract Content")

    # Clicking the button runs extract_and_display on the URL text and
    # writes the returned string into the output box.
    extract_button.click(
        fn=extract_and_display,
        inputs=url_input,
        outputs=output_display,
    )

app.launch()