Spaces:
Running
on
Zero
Running
on
Zero
File size: 3,949 Bytes
98d1d12 4006c1a 0f701bd 4006c1a e33200c 98d1d12 1ca0012 98d1d12 4006c1a 98d1d12 4006c1a 98d1d12 6be117b 98d1d12 0f701bd 4006c1a e33200c 98d1d12 4006c1a 98d1d12 e33200c 4006c1a 0f701bd 4006c1a 0f701bd 1ca0012 0f701bd e33200c 0f701bd 4006c1a e33200c 4006c1a bbd42f8 4006c1a 0e324a0 4006c1a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
import os
import subprocess
import gradio as gr
from magika import Magika
# Content-type labels whose file contents are included in the output.
# Files with any other detected label only get their header reported.
SUPPORTED_FILE_TYPES = [
    "txt",
    "python",
    "markdown",
    "yaml",
    "json",
    "csv",
    "tsv",
    "xml",
    "html",
]
def validate_url(url):
    """Return True when *url* is a string starting with ``https://``.

    Only the scheme prefix is checked; the rest of the URL is not parsed.
    Non-string values (for example ``None``) are rejected with ``False``
    rather than raising ``AttributeError``.
    """
    return isinstance(url, str) and url.startswith("https://")
def clone_repo(url, repo_dir):
    """Clone the repository at *url* into *repo_dir* using ``git clone``.

    The child process inherits the current environment with
    ``GIT_LFS_SKIP_SMUDGE=1`` set, which asks git-lfs not to download LFS
    objects during checkout.

    Returns:
        ``(True, None)`` when git exits with status 0, otherwise
        ``(False, message)`` where *message* is git's stderr, or a short
        description when git itself could not be started.
    """
    env = os.environ.copy()
    env["GIT_LFS_SKIP_SMUDGE"] = "1"
    # Never block on an interactive credential prompt (e.g. private repos);
    # fail fast instead. An explicit setting in the environment wins.
    env.setdefault("GIT_TERMINAL_PROMPT", "0")
    try:
        result = subprocess.run(
            # "--" ends option parsing so neither argument can be read as a flag.
            ["git", "clone", "--", url, repo_dir],
            env=env,
            capture_output=True,
            text=True,
        )
    except OSError as exc:
        # git missing or not executable: keep the (ok, error) contract.
        return False, f"Unable to run git: {exc}"
    if result.returncode != 0:
        return False, result.stderr
    return True, None
def get_file_summary(file_path, file_type):
    """Build the header record for one file.

    The record holds the path relative to the current working directory
    (``name``), the supplied *file_type* (``type``) and the on-disk size in
    bytes (``size``).
    """
    byte_count = os.path.getsize(file_path)
    summary = dict(
        name=os.path.relpath(file_path),
        type=file_type,
        size=byte_count,
    )
    return summary
def read_file_content(file_path):
    """Return the whole file decoded as UTF-8 text.

    Bytes that are not valid UTF-8 are silently dropped
    (``errors="ignore"``) instead of raising a decode error.
    """
    handle = open(file_path, mode="r", encoding="utf-8", errors="ignore")
    with handle:
        text = handle.read()
    return text
def validate_file_types(directory):
    """Identify the content type of every file under *directory*.

    Each file is read in binary mode and passed to
    ``Magika.identify_bytes``; the resulting ``output.ct_label`` is stored
    per file. Git's own metadata directory (``.git``) is not scanned.

    Returns:
        A dict mapping each file path (as joined by ``os.walk``) to its
        detected label, or to ``"Error: <message>"`` when the file could
        not be read or identified.
    """
    m = Magika()
    file_types = {}
    for root, dirs, files in os.walk(directory):
        # Prune the exact ".git" directory in place so os.walk never descends
        # into it. A substring test on the path would also drop unrelated
        # directories such as ".github".
        dirs[:] = [name for name in dirs if name != ".git"]
        for file_name in files:
            file_path = os.path.join(root, file_name)
            try:
                with open(file_path, "rb") as file:
                    file_bytes = file.read()
                result = m.identify_bytes(file_bytes)
                file_types[file_path] = result.output.ct_label
            except Exception as e:
                # Best effort: record the failure and keep scanning.
                file_types[file_path] = f"Error: {str(e)}"
    return file_types
def extract_repo_content(url):
    """Clone the repository at *url* and collect a record for every file.

    Returns:
        A list of dicts, each with a ``header`` (``name``, ``type``,
        ``size``) and a ``content`` string. For an invalid URL or a failed
        clone the list holds a single record whose header type is
        ``"error"``. File contents are captured only for types listed in
        ``SUPPORTED_FILE_TYPES`` and sizes up to 1 MiB; other files get a
        placeholder message.
    """
    # Local import keeps this block self-contained; shutil replaces the
    # non-portable "rm -rf" subprocess call.
    import shutil

    if not validate_url(url):
        return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": "Invalid URL"}]

    # NOTE(review): this fixed path is shared by every call, so concurrent
    # requests would interfere with each other - confirm whether that matters.
    repo_dir = "./temp_repo"
    # Remove leftovers from an earlier run; a missing directory is fine.
    shutil.rmtree(repo_dir, ignore_errors=True)

    try:
        success, error = clone_repo(url, repo_dir)
        if not success:
            return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": f"Failed to clone repository: {error}"}]

        file_types = validate_file_types(repo_dir)
        extracted_content = []
        for file_path, file_type in file_types.items():
            try:
                file_summary = get_file_summary(file_path, file_type)
            except OSError as e:
                # e.g. a broken symlink: report it and keep going rather than
                # aborting the whole extraction.
                extracted_content.append({
                    "header": {"name": os.path.relpath(file_path), "type": file_type, "size": 0},
                    "content": f"Failed to read file content: {str(e)}",
                })
                continue

            content = {"header": file_summary}
            if file_type in SUPPORTED_FILE_TYPES and file_summary["size"] <= 1024 * 1024:
                try:
                    content["content"] = read_file_content(file_path)
                except Exception as e:
                    content["content"] = f"Failed to read file content: {str(e)}"
            else:
                content["content"] = "File too large or binary, content not captured."
            extracted_content.append(content)
        return extracted_content
    finally:
        # Always clean up the temporary clone, including on errors.
        shutil.rmtree(repo_dir, ignore_errors=True)
def format_output(extracted_content):
    """Render the extracted records as one Markdown-style text report.

    Every dict that has a ``header`` key becomes a section with the file
    name, type, size and fenced content; any other item contributes a
    single error line.
    """
    pieces = []
    for entry in extracted_content:
        if not (isinstance(entry, dict) and 'header' in entry):
            pieces.append("Error in file data format.\n")
            continue
        header = entry['header']
        pieces.append(
            f"### File: {header['name']}\n"
            f"**Type:** {header['type']}\n"
            f"**Size:** {header['size']} bytes\n"
            "#### Content:\n"
            f"```\n{entry['content']}\n```\n\n"
        )
    return "".join(pieces)
def extract_and_display(url):
    """Extract the repository behind *url* and return the formatted report text."""
    return format_output(extract_repo_content(url))
# Single-page UI: a URL box, an extract button and a copyable output box.
with gr.Blocks() as app:
    gr.Markdown("# Gradio Space/Model Content Extractor")
    url_input = gr.Textbox(label="Hugging Face Space/Model URL")
    output_display = gr.Textbox(
        show_copy_button=True,
        lines=20,
        placeholder="Output will be displayed here...",
    )
    extract_button = gr.Button("Extract Content")
    # Clicking the button runs the extraction and shows the report text.
    extract_button.click(
        fn=extract_and_display,
        inputs=url_input,
        outputs=output_display,
    )

app.launch()
|