Spaces:

dwb2023
/

hf_extractor

Running on Zero

File size: 5,413 Bytes

98d1d12
 
4006c1a
0f701bd
5e040c2
 
c700267
def12f5
c700267
 
 
def12f5
c700267
 
 
 
6fc5944
4006c1a
e33200c
 
 
 
 
c700267
98d1d12
 
c700267
 
e361a15
98d1d12
 
 
 
1ca0012
98d1d12
4006c1a
98d1d12
 
 
4006c1a
 
98d1d12
6be117b
98d1d12
 
0f701bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c700267
e33200c
 
 
98d1d12
 
 
4006c1a
c700267
98d1d12
e33200c
4006c1a
0f701bd
4006c1a
0f701bd
1ca0012
0f701bd
 
05bd01a
0f701bd
 
 
 
 
 
 
 
4006c1a
e33200c
 
 
4006c1a
 
fce2161
 
4006c1a
bbd42f8
 
 
 
 
 
 
 
4006c1a
 
 
c700267
fce2161
4006c1a
 
3bbe81b
4006c1a
 
994ffd2
8537faa
 
 
 
84ac83a
8537faa
 
 
 
 
 
4006c1a

import os
import subprocess
import gradio as gr
from magika import Magika
from huggingface_hub import login

# Credentials are supplied through the environment. Both the access token and
# the Space author's name are required, so fail fast when either one is missing
# or empty.
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN environment variable is not set")

hf_user = os.getenv("SPACE_AUTHOR_NAME")
if not hf_user:
    raise ValueError("SPACE_AUTHOR_NAME environment variable is not set")

# Explicit hub login is currently disabled.
# login(token=hf_token, add_to_git_credential=True)

# Detected content-type labels that are eligible to have their text captured.
SUPPORTED_FILE_TYPES = [
    "txt",
    "python",
    "markdown",
    "yaml",
    "json",
    "csv",
    "tsv",
    "xml",
    "html",
]

def validate_url(url):
    """Return True when *url* begins with the ``https://`` scheme prefix."""
    required_prefix = 'https://'
    return url.startswith(required_prefix)

def clone_repo(url, repo_dir, hf_token, hf_user, trusted_hosts=("huggingface.co",)):
    """Clone the Git repository at *url* into *repo_dir*.

    Git LFS smudging is disabled, so LFS-tracked files are checked out as
    pointer files rather than downloaded.

    Args:
        url: ``https://`` URL of the repository to clone.
        repo_dir: Destination directory for the clone.
        hf_token: Access token used as the HTTP password.
        hf_user: User name used as the HTTP user.
        trusted_hosts: Host names that may receive the credentials. For any
            other host the repository is cloned anonymously, so a
            user-supplied URL cannot be used to obtain the token.

    Returns:
        ``(True, None)`` on success, otherwise ``(False, error_message)``.
        The token is masked in the error message.
    """
    from urllib.parse import quote, urlsplit

    env = os.environ.copy()
    env['GIT_LFS_SKIP_SMUDGE'] = '1'

    scheme = 'https://'
    try:
        parts = urlsplit(url)
        host = parts.hostname
        has_userinfo = '@' in parts.netloc
    except ValueError:
        # Malformed URL: never attach credentials, let git report the error.
        host, has_userinfo = None, False

    clone_url = url
    attach_credentials = (
        bool(hf_token)
        and bool(hf_user)
        and url.startswith(scheme)
        and host in trusted_hosts
        and not has_userinfo
    )
    if attach_credentials:
        # Percent-encode both parts so special characters cannot alter the
        # URL structure, and rewrite only the leading scheme prefix.
        credentials = f"{quote(hf_user, safe='')}:{quote(hf_token, safe='')}"
        clone_url = f"{scheme}{credentials}@{url[len(scheme):]}"

    try:
        result = subprocess.run(
            # "--" stops git from treating the URL or directory as an option.
            ["git", "clone", "--", clone_url, repo_dir],
            env=env,
            capture_output=True,
            text=True,
        )
    except OSError as exc:
        # e.g. the git executable is not installed or not on PATH.
        return False, str(exc)

    if result.returncode != 0:
        error = result.stderr
        if hf_token:
            # git may echo the remote URL; never hand the token back to the UI.
            for secret in (hf_token, quote(hf_token, safe='')):
                error = error.replace(secret, '***')
        return False, error
    return True, None

def get_file_summary(file_path, file_type):
    """Build the metadata header for a single file.

    Returns a dict with the path relative to the current working directory
    (``name``), the supplied type label (``type``) and the size in bytes
    (``size``).
    """
    byte_count = os.path.getsize(file_path)
    summary = dict(
        name=os.path.relpath(file_path),
        type=file_type,
        size=byte_count,
    )
    return summary

def read_file_content(file_path):
    """Return the text of *file_path* decoded as UTF-8.

    Bytes that are not valid UTF-8 are dropped rather than raising.
    """
    with open(file_path, mode="r", encoding="utf-8", errors="ignore") as source:
        text = source.read()
    return text

def validate_file_types(directory):
    """Detect the content type of every regular file below *directory*.

    Git metadata (directories named exactly ``.git``) is not descended into,
    and symbolic links are skipped.

    Args:
        directory: Root of the tree to scan.

    Returns:
        A dict mapping each file path to the content-type label reported by
        Magika, or to an ``"Error: ..."`` string when the file could not be
        read or identified.
    """
    m = Magika()
    file_types = {}
    for root, dirs, files in os.walk(directory):
        # Prune by exact directory name. A substring test on the path would
        # also drop unrelated directories such as ".github", and would still
        # walk the whole .git tree.
        dirs[:] = [d for d in dirs if d != '.git']
        for file_name in files:
            file_path = os.path.join(root, file_name)
            # The tree comes from an untrusted clone: a symlink could point at
            # any file on the host (or at nothing), so never follow one.
            if os.path.islink(file_path):
                continue
            try:
                with open(file_path, 'rb') as file:
                    file_bytes = file.read()
                result = m.identify_bytes(file_bytes)
                file_types[file_path] = result.output.ct_label
            except Exception as e:
                # Best effort: record the failure for this file and carry on
                # with the rest of the tree.
                file_types[file_path] = f"Error: {str(e)}"
    return file_types

def extract_repo_content(url, hf_token, hf_user):
    """Clone the repository at *url* and collect metadata and text per file.

    Args:
        url: ``https://`` URL of the repository.
        hf_token: Access token forwarded to ``clone_repo``.
        hf_user: User name forwarded to ``clone_repo``.

    Returns:
        A list of ``{"header": {...}, "content": str}`` dicts, one per file.
        Text is captured only for files whose detected type is in
        ``SUPPORTED_FILE_TYPES`` and whose size is at most 32 KiB. On an
        invalid URL or a failed clone the list holds a single error entry.
    """
    import shutil

    if not validate_url(url):
        return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": "Invalid URL"}]

    repo_dir = "./temp_repo"
    max_content_bytes = 32 * 1024

    # Clear leftovers from an earlier run; git refuses to clone into a
    # non-empty directory.
    shutil.rmtree(repo_dir, ignore_errors=True)

    try:
        success, error = clone_repo(url, repo_dir, hf_token, hf_user)
        if not success:
            return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": f"Failed to clone repository: {error}"}]

        extracted_content = []
        for file_path, file_type in validate_file_types(repo_dir).items():
            try:
                file_summary = get_file_summary(file_path, file_type)
            except OSError as e:
                # e.g. a dangling symlink: report it for this one file rather
                # than aborting the whole extraction.
                extracted_content.append({
                    "header": {"name": os.path.relpath(file_path), "type": file_type, "size": 0},
                    "content": f"Failed to read file metadata: {str(e)}",
                })
                continue

            content = {"header": file_summary}
            if file_type in SUPPORTED_FILE_TYPES and file_summary["size"] <= max_content_bytes:
                try:
                    content["content"] = read_file_content(file_path)
                except Exception as e:
                    content["content"] = f"Failed to read file content: {str(e)}"
            else:
                content["content"] = "File too large or binary, content not captured."

            extracted_content.append(content)

        return extracted_content
    finally:
        # Always remove the temporary clone, including when processing fails.
        shutil.rmtree(repo_dir, ignore_errors=True)

def format_output(extracted_content, repo_url):
    """Render the extracted entries as a single Markdown-style string.

    The text starts with a line naming *repo_url*. Each entry that is a dict
    with a ``'header'`` key contributes a section with the file name, type,
    size and fenced content; any other entry contributes an error line.
    """
    pieces = [f"# Repository URL: {repo_url}\n\n"]
    for entry in extracted_content:
        if not (isinstance(entry, dict) and 'header' in entry):
            pieces.append("Error in file data format.\n")
            continue
        header = entry['header']
        pieces.append(f"### File: {header['name']}\n")
        pieces.append(f"**Type:** {header['type']}\n")
        pieces.append(f"**Size:** {header['size']} bytes\n")
        pieces.append("#### Content:\n")
        pieces.append(f"```\n{entry['content']}\n```\n\n")
    return "".join(pieces)

def extract_and_display(url):
    """Extract the repository at *url* and return the formatted report text.

    Uses the module-level ``hf_token`` and ``hf_user`` credentials.
    """
    return format_output(extract_repo_content(url, hf_token, hf_user), url)

# Build the Gradio interface: a URL box with example repositories, a button,
# and a read-only text area that receives the formatted extraction result.
with gr.Blocks(theme="sudeepshouche/minimalist") as app:
    gr.Markdown("# Hugging Face Space / Model Repository Content Extractor")
    url_input = gr.Textbox(
        label="https:// URL of Repository",
        placeholder="Enter the repository URL here OR select an example below...",
    )
    url_examples = gr.Examples(
        examples=[
            ["https://huggingface.co/spaces/big-vision/paligemma-hf"],
            ["https://huggingface.co/google/paligemma-3b-mix-224"],
            ["https://huggingface.co/microsoft/Phi-3-vision-128k-instruct"],
            ["https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf"],
        ],
        inputs=url_input,
    )
    output_display = gr.Textbox(
        label="Extracted Repository Content",
        show_copy_button=True,
        lines=20,
        placeholder=(
            "Repository content will be extracted here...\n\n"
            "Metadata is captured for all files, but text content provided only for files less than 32 kb\n\n\n\n"
            "Review and search through the content here OR simply copy it for offline analysis!!. 🤖"
        ),
    )
    extract_button = gr.Button("Extract Content")

    # Clicking the button runs the extraction and shows the report.
    extract_button.click(fn=extract_and_display, inputs=url_input, outputs=output_display)

app.launch()