File size: 4,996 Bytes
dbc6e65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import gradio as gr
import sys
import requests
import zipfile
import io
import ast


def is_file_type(file_path, file_extension):
    """Return True when ``file_path`` ends with ``file_extension``."""
    suffix_matches = file_path.endswith(file_extension)
    return suffix_matches


def is_likely_useful_file(file_path, lang="python"):
    """Decide whether a repository path is worth including in the output.

    A path is rejected when any of its ``/``-separated parts starts with a
    dot, when it contains ``test`` (case-insensitive), when it lies inside one
    of the excluded directories, or when it contains the name of a known
    utility, configuration, workflow, or documentation file.
    """
    skipped_dirs = ["docs", "examples", "tests", "test", "scripts", "utils", "benchmarks"]
    # Plain substrings: a hit anywhere in the path rejects the file.
    skipped_fragments = [".github", ".gitignore", "LICENSE"]

    if lang == "python":
        skipped_dirs += ["__pycache__"]
        skipped_fragments += ["hubconf.py", "setup.py", "stale.py", "gen-card-", "write_model_card"]
    elif lang == "go":
        skipped_dirs += ["vendor"]
        skipped_fragments += ["go.mod", "go.sum", "Makefile"]

    has_hidden_part = any(segment.startswith(".") for segment in file_path.split("/"))
    if has_hidden_part or "test" in file_path.lower():
        return False

    inside_skipped_dir = any(
        f"/{dir_name}/" in file_path or file_path.startswith(dir_name + "/")
        for dir_name in skipped_dirs
    )
    if inside_skipped_dir:
        return False

    return not any(fragment in file_path for fragment in skipped_fragments)


def is_test_file(file_content, lang):
    """Determine if the file content suggests it is a test file.

    For Python, the source is parsed and every ``import`` / ``from ... import``
    statement is inspected; the file counts as a test file when the top-level
    package of any imported module is ``unittest`` or ``pytest`` (so
    ``import unittest.mock`` and ``from unittest.mock import patch`` match as
    well as the bare module names).

    Args:
        file_content: Source text of the file.
        lang: Language name; only ``"python"`` is analysed.

    Returns:
        True when a test-framework import is found, otherwise False. Content
        that cannot be parsed, and any non-Python language, yields False.
    """
    # The "go" entry is kept for reference; no Go analysis is implemented.
    test_indicators = {"python": ["unittest", "pytest"], "go": ["testing"]}.get(lang, [])

    if lang != "python":
        return False

    try:
        module = ast.parse(file_content)
    except (SyntaxError, ValueError):
        # Unparseable source (ValueError covers null bytes on older Pythons):
        # treat as "not a test file" rather than failing the whole run.
        return False

    for node in ast.walk(module):
        if isinstance(node, ast.Import):
            imported = [alias.name for alias in node.names]
        elif isinstance(node, ast.ImportFrom) and node.module:
            # node.module is None for "from . import x"; nothing to compare.
            imported = [node.module]
        else:
            continue
        # Compare the top-level package so that submodules also match.
        if any(name.split(".")[0] in test_indicators for name in imported):
            return True

    return False


def has_sufficient_content(file_content, min_line_count=10):
    """Return True when the text has at least ``min_line_count`` substantive lines.

    Blank lines and lines whose first non-whitespace characters are ``#`` or
    ``//`` are not counted.
    """
    substantive = 0
    for raw_line in file_content.split("\n"):
        text = raw_line.strip()
        if not text:
            continue
        if text.startswith(("#", "//")):
            continue
        substantive += 1
    return substantive >= min_line_count


def _is_bare_string_stmt(stmt):
    """Return True for an expression statement that is only a string literal."""
    return (
        isinstance(stmt, ast.Expr)
        and isinstance(stmt.value, ast.Constant)
        and isinstance(stmt.value.value, str)
    )


def remove_comments_and_docstrings(source):
    """Remove comments and docstrings from the Python source code.

    The source is parsed to an AST and regenerated with ``ast.unparse``;
    ``#`` comments are not part of the AST, so they disappear in the round
    trip. Every statement that consists solely of a string literal (module,
    class and function docstrings, and strings used as block comments) is
    dropped. When that would leave a block empty, a ``pass`` statement is
    inserted so the result is still valid Python.

    Args:
        source: Python source text.

    Returns:
        The regenerated source without comments and docstrings.

    Raises:
        SyntaxError: If ``source`` is not valid Python.
    """
    tree = ast.parse(source)
    # ast.walk queues a node's children before yielding it, so replacing the
    # statement lists of the current node does not disturb the traversal.
    for node in ast.walk(tree):
        for field in ("body", "orelse", "finalbody"):
            stmts = getattr(node, field, None)
            if not isinstance(stmts, list):
                # e.g. Lambda.body / IfExp.body are single expressions.
                continue
            kept = [stmt for stmt in stmts if not _is_bare_string_stmt(stmt)]
            if len(kept) == len(stmts):
                continue
            # An emptied "else" simply vanishes and an empty module is fine;
            # every other block needs at least one statement.
            if not kept and field != "orelse" and not isinstance(node, ast.Module):
                kept = [ast.Pass()]
            setattr(node, field, kept)
    return ast.unparse(tree)


def download_repo(repo_url, branch_or_tag="master"):
    """Download a GitHub repository archive and concatenate its Python files.

    The zip archive for ``branch_or_tag`` is requested first as a branch
    (``refs/heads``) and, if that does not return HTTP 200, as a tag
    (``refs/tags``). Every ``.py`` entry that passes ``is_likely_useful_file``
    and is not detected as a test file by ``is_test_file`` is appended to the
    result, preceded by a ``# File: <path>`` header line.

    Args:
        repo_url: Base URL of the repository, e.g.
            ``https://github.com/<owner>/<repo>``. A trailing ``/`` is ignored.
        branch_or_tag: Name of the branch or tag to download.

    Returns:
        A single string containing all selected files.

    Raises:
        RuntimeError: If the archive could not be downloaded. An exception is
            raised instead of calling ``sys.exit`` so that a caller such as a
            web UI callback can report the failure without terminating.
    """
    lang = "python"
    base_url = repo_url.rstrip("/")

    # The argument may name either a branch or a tag, so try both ref kinds.
    for ref_kind in ("heads", "tags"):
        download_url = f"{base_url}/archive/refs/{ref_kind}/{branch_or_tag}.zip"
        print(download_url)
        # A timeout keeps an unresponsive server from blocking the caller forever.
        response = requests.get(download_url, timeout=60)
        if response.status_code == 200:
            break
    else:
        message = f"Failed to download the repository. Status code: {response.status_code}"
        print(message)
        raise RuntimeError(message)

    sections = []
    # The context manager guarantees the archive is closed.
    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
        entry_names = zip_file.namelist()
        print(entry_names)
        for file_path in entry_names:
            # Skip directories, non-language files, less likely useful files, hidden directories, and test files
            if (
                file_path.endswith("/")
                or not is_file_type(file_path, ".py")
                or not is_likely_useful_file(file_path, lang)
            ):
                print("Dir or non-lang or useless:", file_path)
                continue

            try:
                file_content = zip_file.read(file_path).decode("utf-8")
            except UnicodeDecodeError:
                # One badly encoded file should not abort the whole export.
                print("Not valid UTF-8, skipping:", file_path)
                continue

            # Skip test files based on content
            if is_test_file(file_content, lang):
                print("Test file:", file_path)
                continue
            print("Appending", file_path)

            header = f"// File: {file_path}\n" if lang == "go" else f"# File: {file_path}\n"
            sections.append(f"{header}{file_content}\n\n")

    # Join once instead of growing a string with repeated "+=".
    return "".join(sections)


def download_and_process(repo_url, branch_or_tag="master"):
    """Return the result of ``download_repo`` for the given repository and ref."""
    return download_repo(repo_url, branch_or_tag)


# Gradio UI definition. The two text inputs are listed in the same order as
# the parameters of download_and_process (repository URL, then branch/tag),
# and the function's return value is shown in a code component.
iface = gr.Interface(
    fn=download_and_process,
    inputs=[
        # Pre-filled with a default repository URL.
        gr.components.Textbox(label="GitHub Repository URL", value="https://github.com/cognitivecomputations/github2file"),
        # Pre-filled with "master", the same value as download_and_process's default.
        gr.components.Textbox(label="Branch or Tag", value="master"),
    ],
    outputs=gr.components.Code(
        label="Output File",
        language="python",
        # NOTE(review): presumably lets the user edit the displayed output — confirm against the Gradio docs.
        interactive=True,
    ),
)

# Launch the interface only when this file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()