Spaces:
Running
Running
File size: 7,795 Bytes
e433e13 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 |
import requests
import base64
import json
import mimetypes
import os
from huggingface_hub import HfApi
from pathlib import Path
GITHUB_API = "https://api.github.com/repos/"
def generate_file_tree(paths):
    """Render a list of file paths as a simple indented tree string.

    Paths are sorted, and each path is indented one space per directory
    level, showing only its final component.
    """
    lines = ["π Root"]
    for entry in sorted(paths):
        segments = entry.split('/')
        depth_indent = " " * (len(segments) - 1)
        lines.append(f"{depth_indent}π {segments[-1]}")
    return "\n".join(lines) + "\n\n"
def get_all_files(owner, repo, path="", is_hf=False):
    """Recursively fetch all file entries from a repository tree.

    Args:
        owner: Repository (or Space) owner/organization name.
        repo: Repository (or Space) name.
        path: Subdirectory to list; "" lists the root.
        is_hf: If True, query the Hugging Face Spaces tree API instead of
            the GitHub contents API.

    Returns:
        A flat list of file-entry dicts as returned by the API, or None
        when the listing could not be fetched or parsed.
    """
    if is_hf:
        api_url = f"https://huggingface.co/api/spaces/{owner}/{repo}/tree/main/{path}".rstrip('/')
    else:
        api_url = f"{GITHUB_API}{owner}/{repo}/contents/{path}".rstrip('/')
    try:
        response = requests.get(api_url, headers={"Accept": "application/json"}, timeout=10)
        response.raise_for_status()
        if not response.headers.get('Content-Type', '').startswith('application/json'):
            return None
        items = response.json()
        # Bug fix: GitHub can return a JSON *dict* here (an error payload, or
        # a single-file response) rather than a list. Iterating a dict yields
        # string keys, so item.get(...) raised AttributeError — which the
        # RequestException-only handler below did not catch.
        if not isinstance(items, list) or not items:
            return None
        files = []
        for item in items:
            if item.get('type') == 'file':
                files.append(item)
            elif item.get('type') == 'dir':
                sub_files = get_all_files(owner, repo, item['path'], is_hf)
                if sub_files:
                    files.extend(sub_files)
        return files
    except (requests.exceptions.RequestException, ValueError):
        # ValueError covers malformed JSON bodies from .json().
        return None
def get_hf_files(owner, repo):
    """Fetch all files from a Hugging Face Space.

    Lists the Space's files, probes each one's raw URL, and keeps only
    those whose Content-Type looks text-like (or octet-stream). Returns a
    list of {"path": ...} dicts, or [] on any failure.
    """
    hf_api = HfApi(token=os.getenv('HF_TOKEN'))
    try:
        paths = hf_api.list_repo_files(repo_id=f'{owner}/{repo}', repo_type="space")
        kept = []
        for file_path in paths:
            raw_url = f"https://huggingface.co/spaces/{owner}/{repo}/raw/main/{file_path}"
            resp = requests.get(raw_url, timeout=10)
            resp.raise_for_status()
            content_type = resp.headers.get('Content-Type', '')
            if content_type.startswith(('text/plain', 'application/octet-stream', 'text/')):
                kept.append({"path": file_path})
        return kept
    except Exception:
        return []
def get_repo_contents(url):
    """Parse a repository/Space URL and fetch its file listing.

    Returns (owner, repo, files, is_hf) on success. On failure returns
    (None, None, error_message_str, False) — callers detect the error case
    by checking whether the third element is a string.
    """
    try:
        segments = url.rstrip('/').split('/')
        owner, repo = segments[-2], segments[-1]
        if "huggingface.co" in url.lower():
            files = get_hf_files(owner, repo)
            if not files:
                raise Exception("No files found in the Hugging Face Space")
            return owner, repo, files, True
        files = get_all_files(owner, repo, "", False)
        if files is None:
            raise Exception("Failed to fetch GitHub repository contents")
        return owner, repo, files, False
    except Exception as e:
        return None, None, f"Error fetching repo contents: {str(e)}", False
def process_file_content(file_info, owner, repo, is_hf=False):
    """Fetch one file from GitHub or a Hugging Face Space and render it as markdown.

    Args:
        file_info: Dict with at least a 'path' key identifying the file.
        owner: Repository/Space owner.
        repo: Repository/Space name.
        is_hf: If True fetch from the HF raw endpoint; otherwise from the
            GitHub contents API (whose payload is base64-encoded).

    Returns:
        A markdown section: fenced code for text files (pretty-printed when
        valid JSON), a size note for binary files, or an error note if the
        fetch failed.
    """
    content = ""
    file_path = file_info['path']
    try:
        if is_hf:
            file_url = f"https://huggingface.co/spaces/{owner}/{repo}/raw/main/{file_path}"
            response = requests.get(file_url, timeout=10)
            response.raise_for_status()
            content_raw = response.content
        else:
            file_url = f"{GITHUB_API}{owner}/{repo}/contents/{file_path}"
            response = requests.get(file_url, headers={"Accept": "application/json"}, timeout=10)
            response.raise_for_status()
            data = response.json()
            # GitHub's contents API base64-encodes file bodies.
            content_raw = base64.b64decode(data['content']) if 'content' in data else b""
        size = len(content_raw)
        file_extension = file_path.split('.')[-1] if '.' in file_path else ''
        mime_type, _ = mimetypes.guess_type(file_path)
        is_text = (mime_type and mime_type.startswith('text')) or file_extension in ['py', 'md', 'txt', 'js', 'html', 'css', 'json'] or "Dockerfile" in file_path
        if is_text:
            # Bug fix: a strict decode raised UnicodeDecodeError on any stray
            # non-UTF-8 byte, and the broad handler below collapsed the whole
            # file into an error placeholder. errors='replace' keeps the
            # readable parts.
            text_content = content_raw.decode('utf-8', errors='replace')
            if file_extension == 'json':
                try:
                    json_data = json.loads(text_content)
                    formatted_json = json.dumps(json_data, indent=2)
                    content = f"### File: {file_path}\n```json\n{formatted_json}\n```\n\n"
                except json.JSONDecodeError:
                    content = f"### File: {file_path}\n```json\n{text_content}\n```\n[Note: Invalid JSON format]\n\n"
            else:
                content = f"### File: {file_path}\n```{file_extension or 'text'}\n{text_content}\n```\n\n"
        else:
            content = f"### File: {file_path}\n[Binary file - {size} bytes]\n\n"
    except Exception as e:
        content = f"### File: {file_path}\n[Error fetching file content: {str(e)}]\n\n"
    return content
def process_uploaded_file(file):
    """Render an uploaded file-like object as a markdown section.

    Args:
        file: Object with a .read() method returning bytes; an optional
            .filename attribute is used for labeling and type detection
            (falls back to 'unknown').

    Returns:
        A markdown string: fenced code for text files (pretty-printed when
        valid JSON), a size note for binary files, or an error note.
    """
    content = ""
    filename = getattr(file, 'filename', 'unknown')
    file_extension = filename.split('.')[-1] if '.' in filename else ''
    try:
        content_raw = file.read()
        size = len(content_raw)
        mime_type, _ = mimetypes.guess_type(filename)
        is_text = (mime_type and mime_type.startswith('text')) or file_extension in ['py', 'md', 'txt', 'js', 'html', 'css', 'json']
        if is_text:
            text_content = content_raw.decode('utf-8')
            if file_extension == 'json':
                try:
                    json_data = json.loads(text_content)
                    formatted_json = json.dumps(json_data, indent=2)
                    # Bug fix: headers previously hard-coded the literal
                    # "(unknown)" and never showed the computed filename.
                    content = f"### File: {filename}\n```json\n{formatted_json}\n```\n\n"
                except json.JSONDecodeError:
                    content = f"### File: {filename}\n```json\n{text_content}\n```\n[Note: Invalid JSON format]\n\n"
            else:
                content = f"### File: {filename}\n```{file_extension or 'text'}\n{text_content}\n```\n\n"
        else:
            content = f"### File: {filename}\n[Binary file - {size} bytes]\n\n"
    except Exception as e:
        content = f"### File: {filename}\n[Error processing file: {str(e)}]\n\n"
    return content
def create_markdown_document(url=None, files=None):
    """Create a markdown document from repo contents or uploaded files.

    Args:
        url: Optional GitHub repository or Hugging Face Space URL; when
            given, its contents are fetched and rendered.
        files: Optional iterable of uploaded file-like objects, used when
            url is not given.

    Returns:
        A markdown string with a file tree followed by every file's
        contents, or an "Error: ..." string on failure / missing input.
    """
    if url:
        owner, repo, contents, is_hf = get_repo_contents(url)
        if isinstance(contents, str):  # Error case: contents holds the message
            return f"Error: {contents}"
        markdown_content = f"# {'Space' if is_hf else 'Repository'}: {owner}/{repo}\n\n"
        markdown_content += "## File Structure\n```\n"
        markdown_content += generate_file_tree([item['path'] for item in contents])
        markdown_content += "```\n\n"
        markdown_content += f"Below are the contents of all files in the {'space' if is_hf else 'repository'}:\n\n"
        for item in contents:
            markdown_content += process_file_content(item, owner, repo, is_hf)
        return markdown_content
    if not files:
        # Bug fix: calling with neither url nor files previously raised a
        # TypeError (iterating None); return an explicit error instead.
        return "Error: No repository URL or uploaded files provided"
    markdown_content = "# Uploaded Files\n\n"
    markdown_content += "## File Structure\n```\n"
    # Use the same getattr fallback as process_uploaded_file so objects
    # without a .filename attribute don't crash the tree rendering.
    markdown_content += generate_file_tree([getattr(f, 'filename', 'unknown') for f in files])
    markdown_content += "```\n\n"
    markdown_content += "Below are the contents of all uploaded files:\n\n"
    for f in files:
        markdown_content += process_uploaded_file(f)
    return markdown_content