File size: 7,795 Bytes
e433e13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import requests
import base64
import json
import mimetypes
import os
from huggingface_hub import HfApi
from pathlib import Path

# Base URL of the GitHub REST API "repos" endpoints; callers in this module
# append "{owner}/{repo}/contents/{path}" to it.
GITHUB_API = "https://api.github.com/repos/"

def generate_file_tree(paths):
    """Render a list of '/'-separated paths as an indented text tree.

    The tree starts with a "Root" folder line. Each directory is printed once
    (with a folder icon) the first time a path inside it is encountered, and
    every file is printed with a document icon, indented two spaces per
    directory level.

    Args:
        paths: Iterable of file paths using '/' as the separator. Empty
            segments (leading, trailing or doubled slashes) are ignored.

    Returns:
        The tree as a single string terminated by a blank line.
    """
    # Written as escapes so the icons survive any re-encoding of this file.
    folder_icon = "\U0001F4C1"
    file_icon = "\U0001F4C4"

    # Normalise before sorting so every directory's entries are contiguous.
    normalized = sorted(
        "/".join(segment for segment in path.split("/") if segment)
        for path in paths
    )

    lines = [f"{folder_icon} Root"]
    open_dirs = []  # directory chain of the previously printed file
    for path in normalized:
        if not path:
            continue
        *dirs, name = path.split("/")

        # Directories shared with the previous entry are already printed.
        shared = 0
        for previous, current in zip(open_dirs, dirs):
            if previous != current:
                break
            shared += 1

        for depth in range(shared, len(dirs)):
            lines.append(f"{'  ' * depth}{folder_icon} {dirs[depth]}")
        open_dirs = dirs

        lines.append(f"{'  ' * len(dirs)}{file_icon} {name}")
    return "\n".join(lines) + "\n\n"

def get_all_files(owner, repo, path="", is_hf=False):
    """Recursively list every file below ``path`` in a repository.

    Args:
        owner: Account or organisation that owns the repository.
        repo: Repository (or Space) name.
        path: Directory to list, relative to the repository root.
        is_hf: When true, query the Hugging Face Spaces tree API instead of
            the GitHub contents API.

    Returns:
        A list of the file entries (dicts) returned by the API, or ``None``
        when the request fails, the response is not a JSON list, or the
        listing is empty. A sub-directory that cannot be listed is skipped
        rather than failing the whole walk.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import quote

    # Quote the path so characters such as '#', '?' or '%' in file names do
    # not get interpreted as URL syntax ('/' is kept as the separator).
    quoted_path = quote(path)
    if is_hf:
        api_url = f"https://huggingface.co/api/spaces/{owner}/{repo}/tree/main/{quoted_path}".rstrip('/')
    else:
        api_url = f"{GITHUB_API}{owner}/{repo}/contents/{quoted_path}".rstrip('/')

    try:
        response = requests.get(api_url, headers={"Accept": "application/json"}, timeout=10)
        response.raise_for_status()
        if not response.headers.get('Content-Type', '').startswith('application/json'):
            return None
        items = response.json()
    except (requests.exceptions.RequestException, ValueError):
        # ValueError covers JSON decoding failures on requests versions whose
        # decode error is not a RequestException subclass.
        return None

    # A dict (e.g. a single-file response) or an empty listing is not a
    # directory listing we can walk.
    if not isinstance(items, list) or not items:
        return None

    files = []
    for item in items:
        if not isinstance(item, dict):
            continue
        item_type = item.get('type')
        if item_type == 'file':
            files.append(item)
        elif item_type in ('dir', 'directory'):
            # GitHub labels folders 'dir'; the Hub tree API uses 'directory'.
            sub_path = item.get('path')
            if not sub_path:
                # Without a path we would re-list the root forever.
                continue
            sub_files = get_all_files(owner, repo, sub_path, is_hf)
            if sub_files:
                files.extend(sub_files)
    return files

def get_hf_files(owner, repo):
    """List the files of a Hugging Face Space that look like text content.

    The file names come from ``HfApi.list_repo_files``; each file's raw URL
    is then probed and the file is kept when its ``Content-Type`` starts with
    ``text/`` or ``application/octet-stream``.

    Args:
        owner: Account or organisation that owns the Space.
        repo: Space name.

    Returns:
        A list of ``{"path": <file path>}`` dicts, or an empty list when the
        Space cannot be listed.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import quote

    api = HfApi(token=os.getenv('HF_TOKEN'))
    try:
        file_list = api.list_repo_files(repo_id=f'{owner}/{repo}', repo_type="space")
    except Exception:
        # Callers treat an empty list as "nothing could be fetched".
        return []

    accepted_types = ('text/', 'application/octet-stream')
    processed_files = []
    for file_path in file_list:
        raw_url = f"https://huggingface.co/spaces/{owner}/{repo}/raw/main/{quote(file_path)}"
        try:
            # stream=True reads only the headers; the body is fetched later
            # by whoever renders the file, so downloading it here is wasted.
            with requests.get(raw_url, timeout=10, stream=True) as response:
                response.raise_for_status()
                content_type = response.headers.get('Content-Type', '')
        except requests.exceptions.RequestException:
            # One unreachable file must not hide the rest of the Space; keep
            # it listed so the failure is reported when its content is read.
            processed_files.append({"path": file_path})
            continue
        if content_type.startswith(accepted_types):
            processed_files.append({"path": file_path})
    return processed_files

def _parse_owner_repo(url, is_hf):
    """Extract ``(owner, repo)`` from a GitHub or Hugging Face URL."""
    # Query strings and fragments never carry the owner/repo pair.
    trimmed = url.strip().split('#', 1)[0].split('?', 1)[0]
    has_scheme = '://' in trimmed
    if has_scheme:
        trimmed = trimmed.split('://', 1)[1]
    segments = [segment for segment in trimmed.split('/') if segment]

    # Drop the host: always present after a scheme, and also recognised when
    # the URL was pasted without one (e.g. "github.com/owner/repo").
    known_hosts = ("github.com", "huggingface.co")
    if segments and (has_scheme or segments[0].lower().endswith(known_hosts)):
        segments = segments[1:]
    # Space URLs look like huggingface.co/spaces/<owner>/<repo>.
    if is_hf and segments and segments[0] == "spaces":
        segments = segments[1:]

    if len(segments) < 2:
        raise ValueError(f"Could not find an owner/repository pair in URL: {url}")
    # Anything after the pair (e.g. "/tree/main") is ignored.
    owner, repo = segments[0], segments[1]
    if not is_hf and repo.endswith(".git"):
        repo = repo[:-len(".git")]
    return owner, repo


def get_repo_contents(url):
    """Parse a repository URL and fetch the list of files it contains.

    URLs containing "huggingface.co" are treated as Hugging Face Spaces;
    everything else is treated as a GitHub repository. Trailing path parts
    such as "/tree/main", a ".git" suffix, query strings and fragments are
    ignored when locating the owner and repository name.

    Args:
        url: Repository URL, with or without a scheme, or a bare
            "owner/repo" pair.

    Returns:
        ``(owner, repo, files, is_hf)`` on success. On any failure the tuple
        is ``(None, None, <error message string>, False)``; callers detect
        the error case by the third element being a string.
    """
    try:
        is_hf = "huggingface.co" in url.lower()
        owner, repo = _parse_owner_repo(url, is_hf)
        if is_hf:
            files = get_hf_files(owner, repo)
            if not files:
                raise Exception("No files found in the Hugging Face Space")
        else:
            files = get_all_files(owner, repo, "", False)
            if files is None:
                raise Exception("Failed to fetch GitHub repository contents")
        return owner, repo, files, is_hf
    except Exception as e:
        # Boundary handler: every failure is reported through the return value.
        return None, None, f"Error fetching repo contents: {str(e)}", False

def process_file_content(file_info, owner, repo, is_hf=False):
    """Fetch one repository file and render it as a Markdown section.

    Text files are emitted inside a fenced code block (JSON is pretty-printed
    when it parses); anything else is summarised as a binary file with its
    size. Fetch or processing errors are reported inside the returned section
    instead of being raised.

    Args:
        file_info: Mapping with at least a ``'path'`` key (repository-relative
            file path).
        owner: Account or organisation that owns the repository.
        repo: Repository (or Space) name.
        is_hf: When true, read from the Hugging Face Space raw endpoint;
            otherwise use the GitHub contents API.

    Returns:
        The Markdown section for the file, starting with ``### File: <path>``.

    Raises:
        KeyError: If ``file_info`` has no ``'path'`` entry.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import quote

    content = ""
    file_path = file_info['path']
    text_extensions = {'py', 'md', 'txt', 'js', 'html', 'css', 'json'}

    try:
        # Quote so '#', '?' or '%' in a file name are not read as URL syntax.
        quoted_path = quote(file_path)
        if is_hf:
            file_url = f"https://huggingface.co/spaces/{owner}/{repo}/raw/main/{quoted_path}"
            response = requests.get(file_url, timeout=10)
            response.raise_for_status()
            content_raw = response.content
        else:
            file_url = f"{GITHUB_API}{owner}/{repo}/contents/{quoted_path}"
            response = requests.get(file_url, headers={"Accept": "application/json"}, timeout=10)
            response.raise_for_status()
            data = response.json()
            if not isinstance(data, dict):
                data = {}
            encoded = data.get('content') or ""
            if encoded:
                content_raw = base64.b64decode(encoded)
            elif data.get('size') and data.get('download_url'):
                # Large files come back with an empty 'content'; fetch the
                # raw bytes from the download URL instead of showing nothing.
                raw_response = requests.get(data['download_url'], timeout=10)
                raw_response.raise_for_status()
                content_raw = raw_response.content
            else:
                content_raw = b""

        size = len(content_raw)
        # Derive the extension from the file name only, so a dot in a
        # directory name is not mistaken for one; compare case-insensitively.
        base_name = file_path.rsplit('/', 1)[-1]
        file_extension = base_name.rsplit('.', 1)[-1].lower() if '.' in base_name else ''
        mime_type, _ = mimetypes.guess_type(file_path)
        is_text = (
            bool(mime_type and mime_type.startswith('text'))
            or file_extension in text_extensions
            or "Dockerfile" in base_name
        )

        if is_text:
            note = ""
            try:
                text_content = content_raw.decode('utf-8')
            except UnicodeDecodeError:
                # Show what can be read rather than reporting a fetch error.
                text_content = content_raw.decode('utf-8', errors='replace')
                note = "[Note: Invalid UTF-8 bytes were replaced]\n"
            if file_extension == 'json':
                try:
                    text_content = json.dumps(json.loads(text_content), indent=2, ensure_ascii=False)
                except json.JSONDecodeError:
                    note += "[Note: Invalid JSON format]\n"
            content = f"### File: {file_path}\n```{file_extension or 'text'}\n{text_content}\n```\n{note}\n"
        else:
            content = f"### File: {file_path}\n[Binary file - {size} bytes]\n\n"
    except Exception as e:
        # Boundary handler: a failing file must not abort the whole document.
        content = f"### File: {file_path}\n[Error fetching file content: {str(e)}]\n\n"

    return content

def process_uploaded_file(file):
    """Render an uploaded file as a Markdown section.

    Text files are emitted inside a fenced code block (JSON is pretty-printed
    when it parses); anything else is summarised as a binary file with its
    size. Read or processing errors are reported inside the returned section
    instead of being raised.

    Args:
        file: Object with a ``read()`` method returning ``bytes`` or ``str``
            and, optionally, a ``filename`` attribute. A missing or empty
            ``filename`` is shown as ``unknown``.

    Returns:
        The Markdown section for the file, starting with ``### File: <name>``.
    """
    content = ""
    # `or` also covers a filename attribute that exists but is None/empty.
    filename = str(getattr(file, 'filename', None) or 'unknown')
    # Derive the extension from the file name only, so a dot in a directory
    # name is not mistaken for one; compare case-insensitively.
    base_name = filename.replace('\\', '/').rsplit('/', 1)[-1]
    file_extension = base_name.rsplit('.', 1)[-1].lower() if '.' in base_name else ''
    text_extensions = {'py', 'md', 'txt', 'js', 'html', 'css', 'json'}

    try:
        content_raw = file.read()
        if isinstance(content_raw, str):
            # Text-mode file objects hand back str; normalise to bytes.
            content_raw = content_raw.encode('utf-8')
        size = len(content_raw)
        mime_type, _ = mimetypes.guess_type(filename)
        is_text = (
            bool(mime_type and mime_type.startswith('text'))
            or file_extension in text_extensions
            or "Dockerfile" in base_name
        )

        if is_text:
            note = ""
            try:
                text_content = content_raw.decode('utf-8')
            except UnicodeDecodeError:
                # Show what can be read rather than reporting an error.
                text_content = content_raw.decode('utf-8', errors='replace')
                note = "[Note: Invalid UTF-8 bytes were replaced]\n"
            if file_extension == 'json':
                try:
                    text_content = json.dumps(json.loads(text_content), indent=2, ensure_ascii=False)
                except json.JSONDecodeError:
                    note += "[Note: Invalid JSON format]\n"
            content = f"### File: {filename}\n```{file_extension or 'text'}\n{text_content}\n```\n{note}\n"
        else:
            content = f"### File: {filename}\n[Binary file - {size} bytes]\n\n"
    except Exception as e:
        # Boundary handler: a failing upload must not abort the whole document.
        content = f"### File: {filename}\n[Error processing file: {str(e)}]\n\n"

    return content

def create_markdown_document(url=None, files=None):
    """Build one Markdown document from a repository URL or uploaded files.

    When ``url`` is given it takes precedence: the repository (or Hugging
    Face Space) is listed and every file is rendered. Otherwise the uploaded
    ``files`` are rendered. The document contains a title, a file tree and
    one section per file.

    Args:
        url: Repository or Space URL. Optional.
        files: Iterable of uploaded file objects (see
            ``process_uploaded_file``). ``None`` is treated as no files.

    Returns:
        The Markdown document, or a string starting with ``Error:`` when the
        repository contents could not be fetched.
    """
    if url:
        owner, repo, contents, is_hf = get_repo_contents(url)
        if isinstance(contents, str):  # Error case: third element is the message
            return f"Error: {contents}"

        kind = 'Space' if is_hf else 'Repository'
        sections = [
            f"# {kind}: {owner}/{repo}\n\n",
            "## File Structure\n```\n",
            generate_file_tree([item['path'] for item in contents]),
            "```\n\n",
            f"Below are the contents of all files in the {kind.lower()}:\n\n",
        ]
        sections.extend(process_file_content(item, owner, repo, is_hf) for item in contents)
    else:
        # Materialise once: the uploads are walked twice (tree, then bodies),
        # which would exhaust a one-shot iterable; None means "no uploads".
        uploads = list(files) if files is not None else []
        # Tolerate a missing/empty filename the same way the per-file
        # renderer labels such uploads.
        names = [getattr(upload, 'filename', None) or 'unknown' for upload in uploads]
        sections = [
            "# Uploaded Files\n\n",
            "## File Structure\n```\n",
            generate_file_tree(names),
            "```\n\n",
            "Below are the contents of all uploaded files:\n\n",
        ]
        sections.extend(process_uploaded_file(upload) for upload in uploads)

    # Join once instead of growing a string with repeated +=.
    return "".join(sections)