Spaces:

dwb2023
/

hf_extractor

Running on Zero

App Files Files Community

dwb2023 committed on Jun 15, 2024

Commit

c881cad

verified ·

1 Parent(s): 56dbc6b

Update app.py

Browse files

Files changed (1) hide show

app.py +3 -108

app.py CHANGED Viewed

@@ -1,113 +1,8 @@
-import os
-import subprocess
 import gradio as gr
-from magika import Magika
-from huggingface_hub import login
-# Get the HF token and space author name from environment variables
-hf_token = os.getenv("HF_TOKEN")
-hf_user = os.getenv("SPACE_AUTHOR_NAME")
-if not hf_token:
-    raise ValueError("HF_TOKEN environment variable is not set")
-if not hf_user:
-    raise ValueError("SPACE_AUTHOR_NAME environment variable is not set")
-SUPPORTED_FILE_TYPES = ["txt", "shell", "python", "markdown", "yaml", "json", "csv", "tsv", "xml", "html", "ini", "jsonl", "ipynb"]
-def validate_url(url):
-    return url.startswith('https://')
-def clone_repo(url, repo_dir, hf_token, hf_user):
-    env = os.environ.copy()
-    env['GIT_LFS_SKIP_SMUDGE'] = '1'
-    token_url = url.replace('https://', f'https://{hf_user}:{hf_token}@')
-    result = subprocess.run(["git", "clone", token_url, repo_dir], env=env, capture_output=True, text=True)
-    if result.returncode != 0:
-        return False, result.stderr
-    return True, None
-def get_file_summary(file_path, file_type):
-    size = os.path.getsize(file_path)
-    return {
-        "name": os.path.relpath(file_path),
-        "type": file_type,
-        "size": size,
-        "creation_date": os.path.getctime(file_path),
-        "modification_date": os.path.getmtime(file_path)
-    }
-def read_file_content(file_path, max_size=32*1024):
-    with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
-        if os.path.getsize(file_path) > max_size:
-            return file.read(max_size) + "\n... [Content Truncated] ..."
-        else:
-            return file.read()
-def validate_file_types(directory):
-    m = Magika()
-    file_types = {}
-    for root, _, files in os.walk(directory):
-        if '.git' in root:
-            continue
-        for file_name in files:
-            file_path = os.path.join(root, file_name)
-            try:
-                with open(file_path, 'rb') as file:
-                    file_bytes = file.read()
-                result = m.identify_bytes(file_bytes)
-                file_types[file_path] = result.output.ct_label
-            except Exception as e:
-                file_types[file_path] = f"Error: {str(e)}"
-    return file_types
-def extract_repo_content(url, hf_token, hf_user):
-    if not validate_url(url):
-        return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": "Invalid URL"}]
-    repo_dir = "./temp_repo"
-    if os.path.exists(repo_dir):
-        subprocess.run(["rm", "-rf", repo_dir])
-    success, error = clone_repo(url, repo_dir, hf_token, hf_user)
-    if not success:
-        return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": f"Failed to clone repository: {error}"}]
-    file_types = validate_file_types(repo_dir)
-    extracted_content = []
-    for file_path, file_type in file_types.items():
-        file_summary = get_file_summary(file_path, file_type)
-        content = {"header": file_summary}
-        if file_type in SUPPORTED_FILE_TYPES and file_summary["size"] <= 32 * 1024:
-            try:
-                content["content"] = read_file_content(file_path)
-            except Exception as e:
-                content["content"] = f"Failed to read file content: {str(e)}"
-        else:
-            content["content"] = "File too large or binary, content not captured."
-        extracted_content.append(content)
-    subprocess.run(["rm", "-rf", repo_dir])
-    return extracted_content
-def format_output(extracted_content, repo_url):
-    formatted_output = f"# Repository URL: {repo_url}\n\n"
-    for file_data in extracted_content:
-        if isinstance(file_data, dict) and 'header' in file_data:
-            formatted_output += f"### File: {file_data['header']['name']}\n"
-            formatted_output += f"**Type:** {file_data['header']['type']}\n"
-            formatted_output += f"**Size:** {file_data['header']['size']} bytes\n"
-            formatted_output += f"**Created:** {file_data['header']['creation_date']}\n"
-            formatted_output += f"**Modified:** {file_data['header']['modification_date']}\n"
-            formatted_output += "#### Content:\n"
-            formatted_output += f"```\n{file_data['content']}\n```\n\n"
-        else:
-            formatted_output += "Error in file data format.\n"
-    return formatted_output
 def extract_and_display(url):
     extracted_content = extract_repo_content(url, hf_token, hf_user)
     formatted_output = format_output(extracted_content, url)

 import gradio as gr
+from repo_utils import extract_repo_content
+from display_utils import format_output
+# Extract and display function
 def extract_and_display(url):
     extracted_content = extract_repo_content(url, hf_token, hf_user)
     formatted_output = format_output(extracted_content, url)