dwb2023 committed on
Commit
0f701bd
1 Parent(s): dc7c719

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -17
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
2
  import subprocess
3
  import gradio as gr
 
4
 
5
  def clone_repo(url, repo_dir):
6
  env = os.environ.copy()
@@ -23,6 +24,23 @@ def read_file_content(file_path):
23
  with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
24
  return file.read()
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  def extract_repo_content(url):
27
  repo_dir = "./temp_repo"
28
  if os.path.exists(repo_dir):
@@ -32,24 +50,21 @@ def extract_repo_content(url):
32
  if not success:
33
  return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": error}]
34
 
 
35
  extracted_content = []
36
- for root, _, files in os.walk(repo_dir):
37
- if '.git' in root:
38
- continue # Skip the .git directory
39
- for file in files:
40
- file_path = os.path.join(root, file)
41
- file_summary = get_file_summary(file_path)
42
- content = {"header": file_summary}
43
-
44
- if file_summary["type"] == "text" and file_summary["size"] <= 1024 * 1024:
45
- try:
46
- content["content"] = read_file_content(file_path)
47
- except Exception as e:
48
- content["content"] = f"Failed to read file content: {str(e)}"
49
- else:
50
- content["content"] = "File too large or binary, content not captured."
51
-
52
- extracted_content.append(content)
53
 
54
  return extracted_content
55
 
 
1
  import os
2
  import subprocess
3
  import gradio as gr
4
+ from magika import Magika
5
 
6
  def clone_repo(url, repo_dir):
7
  env = os.environ.copy()
 
24
  with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
25
  return file.read()
26
 
27
+ def validate_file_types(directory):
28
+ m = Magika()
29
+ file_types = {}
30
+ for root, _, files in os.walk(directory):
31
+ if '.git' in root:
32
+ continue
33
+ for file_name in files:
34
+ file_path = os.path.join(root, file_name)
35
+ try:
36
+ with open(file_path, 'rb') as file:
37
+ file_bytes = file.read()
38
+ result = m.identify_bytes(file_bytes)
39
+ file_types[file_path] = result.output.ct_label
40
+ except Exception as e:
41
+ file_types[file_path] = f"Error: {str(e)}"
42
+ return file_types
43
+
44
  def extract_repo_content(url):
45
  repo_dir = "./temp_repo"
46
  if os.path.exists(repo_dir):
 
50
  if not success:
51
  return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": error}]
52
 
53
+ file_types = validate_file_types(repo_dir)
54
  extracted_content = []
55
+ for file_path, file_type in file_types.items():
56
+ file_summary = get_file_summary(file_path)
57
+ content = {"header": file_summary}
58
+
59
+ if file_type.startswith("text") and file_summary["size"] <= 1024 * 1024:
60
+ try:
61
+ content["content"] = read_file_content(file_path)
62
+ except Exception as e:
63
+ content["content"] = f"Failed to read file content: {str(e)}"
64
+ else:
65
+ content["content"] = "File too large or binary, content not captured."
66
+
67
+ extracted_content.append(content)
 
 
 
 
68
 
69
  return extracted_content
70