dwb2023 committed on
Commit
bcc36e8
•
1 Parent(s): 6669cbd

Update app.py

Browse files

rollback changes

Files changed (1) hide show
  1. app.py +43 -73
app.py CHANGED
@@ -1,19 +1,22 @@
1
  import os
2
  import subprocess
3
  import gradio as gr
4
- from tqdm import tqdm
5
- import chardet
6
- import logging
7
- import tempfile
8
- import concurrent.futures
9
 
10
- # Configure logging
11
- logging.basicConfig(level=logging.INFO)
12
- logger = logging.getLogger(__name__)
 
 
 
 
 
 
 
 
13
 
14
- # Configurable supported file types and size limit
15
  SUPPORTED_FILE_TYPES = ["txt", "python", "markdown", "yaml", "json", "csv", "tsv", "xml", "html"]
16
- MAX_FILE_SIZE = 32 * 1024 # 32 KB
17
 
18
  def validate_url(url):
19
  return url.startswith('https://')
@@ -21,6 +24,7 @@ def validate_url(url):
21
  def clone_repo(url, repo_dir, hf_token, hf_user):
22
  env = os.environ.copy()
23
  env['GIT_LFS_SKIP_SMUDGE'] = '1'
 
24
  token_url = url.replace('https://', f'https://{hf_user}:{hf_token}@')
25
  result = subprocess.run(["git", "clone", token_url, repo_dir], env=env, capture_output=True, text=True)
26
  if result.returncode != 0:
@@ -36,21 +40,14 @@ def get_file_summary(file_path, file_type):
36
  }
37
 
38
  def read_file_content(file_path):
39
- with open(file_path, "rb") as file:
40
- file_bytes = file.read()
41
- encoding = chardet.detect(file_bytes)["encoding"]
42
- try:
43
- content = file_bytes.decode(encoding)
44
- return content
45
- except (UnicodeDecodeError, TypeError):
46
- return None
47
 
48
- def validate_file_types(directory, supported_file_types):
49
- from magika import Magika
50
  m = Magika()
51
  file_types = {}
52
  for root, _, files in os.walk(directory):
53
- if any(dir_name in root for dir_name in ['.git', '__pycache__']):
54
  continue
55
  for file_name in files:
56
  file_path = os.path.join(root, file_name)
@@ -58,55 +55,38 @@ def validate_file_types(directory, supported_file_types):
58
  with open(file_path, 'rb') as file:
59
  file_bytes = file.read()
60
  result = m.identify_bytes(file_bytes)
61
- file_type = result.output.ct_label
62
- if file_type not in supported_file_types:
63
- file_type = "Unsupported"
64
- file_types[file_path] = file_type
65
  except Exception as e:
66
  file_types[file_path] = f"Error: {str(e)}"
67
  return file_types
68
 
69
- def process_file(file_path, file_type, max_file_size):
70
- file_summary = get_file_summary(file_path, file_type)
71
- content = {"header": file_summary}
72
-
73
- if file_type != "Unsupported" and file_summary["size"] <= max_file_size:
74
- try:
75
- file_content = read_file_content(file_path)
76
- if file_content is not None:
77
- content["content"] = file_content
78
- else:
79
- content["content"] = "Failed to read file content: Unsupported encoding or binary file."
80
- except Exception as e:
81
- content["content"] = f"Failed to read file content: {str(e)}"
82
- else:
83
- content["content"] = f"Skipped: {'File size exceeds limit.' if file_summary['size'] > max_file_size else 'Unsupported file type.'}"
84
-
85
- return content
86
-
87
- def extract_repo_content(url, hf_token, hf_user, supported_file_types, max_file_size):
88
  if not validate_url(url):
89
  return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": "Invalid URL"}]
90
 
91
- repo_dir = tempfile.mkdtemp(prefix="temp_repo_")
 
 
 
92
  success, error = clone_repo(url, repo_dir, hf_token, hf_user)
93
  if not success:
94
  return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": f"Failed to clone repository: {error}"}]
95
 
96
- file_types = validate_file_types(repo_dir, supported_file_types)
97
-
98
- with concurrent.futures.ThreadPoolExecutor() as executor:
99
- futures = []
100
- for file_path, file_type in file_types.items():
101
- future = executor.submit(process_file, file_path, file_type, max_file_size)
102
- futures.append(future)
103
 
104
- extracted_content = []
105
- with tqdm(total=len(futures), desc="Processing files") as progress_bar:
106
- for future in concurrent.futures.as_completed(futures):
107
- content = future.result()
108
- extracted_content.append(content)
109
- progress_bar.update(1)
 
 
 
110
 
111
  # Cleanup temporary directory
112
  subprocess.run(["rm", "-rf", repo_dir])
@@ -126,20 +106,12 @@ def format_output(extracted_content, repo_url):
126
  formatted_output += "Error in file data format.\n"
127
  return formatted_output
128
 
129
- def extract_and_display(url, supported_file_types, max_file_size):
130
- hf_token = os.getenv("HF_TOKEN")
131
- hf_user = os.getenv("SPACE_AUTHOR_NAME")
132
-
133
- if not hf_token:
134
- raise ValueError("HF_TOKEN environment variable is not set")
135
- if not hf_user:
136
- raise ValueError("SPACE_AUTHOR_NAME environment variable is not set")
137
-
138
- extracted_content = extract_repo_content(url, hf_token, hf_user, supported_file_types, max_file_size)
139
  formatted_output = format_output(extracted_content, url)
140
  return formatted_output
141
 
142
- app = gr.Blocks()
143
 
144
  with app:
145
  gr.Markdown("# Hugging Face Space / Model Repository Content Extractor")
@@ -153,11 +125,9 @@ with app:
153
  ],
154
  inputs=url_input
155
  )
156
- supported_file_types = gr.CheckboxGroup(SUPPORTED_FILE_TYPES, label="Supported File Types", info="Select the file types to include in the extraction.")
157
- max_file_size = gr.Slider(1, 1024, value=32, step=1, label="Max File Size (KB)", info="Files larger than this size will be skipped.")
158
- output_display = gr.Textbox(label="Extracted Repository Content", show_copy_button=True, lines=20, placeholder="Repository content will be extracted here...\n\nMetadata is captured for all files, but text content provided only for files less than the specified size limit.\n\n\n\nReview and search through the content here OR simply copy it for offline analysis!!. πŸ€–")
159
  extract_button = gr.Button("Extract Content")
160
 
161
- extract_button.click(fn=extract_and_display, inputs=[url_input, supported_file_types, max_file_size], outputs=output_display)
162
 
163
  app.launch()
 
1
  import os
2
  import subprocess
3
  import gradio as gr
4
+ from magika import Magika
5
+ from huggingface_hub import login
 
 
 
6
 
7
+ # Get the HF token and space author name from environment variables
8
+ hf_token = os.getenv("HF_TOKEN")
9
+ hf_user = os.getenv("SPACE_AUTHOR_NAME")
10
+
11
+ if not hf_token:
12
+ raise ValueError("HF_TOKEN environment variable is not set")
13
+ if not hf_user:
14
+ raise ValueError("SPACE_AUTHOR_NAME environment variable is not set")
15
+
16
+ # Perform login using the token
17
+ # login(token=hf_token, add_to_git_credential=True)
18
 
 
19
  SUPPORTED_FILE_TYPES = ["txt", "python", "markdown", "yaml", "json", "csv", "tsv", "xml", "html"]
 
20
 
21
  def validate_url(url):
22
  return url.startswith('https://')
 
24
  def clone_repo(url, repo_dir, hf_token, hf_user):
25
  env = os.environ.copy()
26
  env['GIT_LFS_SKIP_SMUDGE'] = '1'
27
+ # Construct the Git URL with the token and author name for authentication
28
  token_url = url.replace('https://', f'https://{hf_user}:{hf_token}@')
29
  result = subprocess.run(["git", "clone", token_url, repo_dir], env=env, capture_output=True, text=True)
30
  if result.returncode != 0:
 
40
  }
41
 
42
  def read_file_content(file_path):
43
+ with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
44
+ return file.read()
 
 
 
 
 
 
45
 
46
+ def validate_file_types(directory):
 
47
  m = Magika()
48
  file_types = {}
49
  for root, _, files in os.walk(directory):
50
+ if '.git' in root:
51
  continue
52
  for file_name in files:
53
  file_path = os.path.join(root, file_name)
 
55
  with open(file_path, 'rb') as file:
56
  file_bytes = file.read()
57
  result = m.identify_bytes(file_bytes)
58
+ file_types[file_path] = result.output.ct_label
 
 
 
59
  except Exception as e:
60
  file_types[file_path] = f"Error: {str(e)}"
61
  return file_types
62
 
63
+ def extract_repo_content(url, hf_token, hf_user):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  if not validate_url(url):
65
  return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": "Invalid URL"}]
66
 
67
+ repo_dir = "./temp_repo"
68
+ if os.path.exists(repo_dir):
69
+ subprocess.run(["rm", "-rf", repo_dir])
70
+
71
  success, error = clone_repo(url, repo_dir, hf_token, hf_user)
72
  if not success:
73
  return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": f"Failed to clone repository: {error}"}]
74
 
75
+ file_types = validate_file_types(repo_dir)
76
+ extracted_content = []
77
+ for file_path, file_type in file_types.items():
78
+ file_summary = get_file_summary(file_path, file_type)
79
+ content = {"header": file_summary}
 
 
80
 
81
+ if file_type in SUPPORTED_FILE_TYPES and file_summary["size"] <= 32 * 1024:
82
+ try:
83
+ content["content"] = read_file_content(file_path)
84
+ except Exception as e:
85
+ content["content"] = f"Failed to read file content: {str(e)}"
86
+ else:
87
+ content["content"] = "File too large or binary, content not captured."
88
+
89
+ extracted_content.append(content)
90
 
91
  # Cleanup temporary directory
92
  subprocess.run(["rm", "-rf", repo_dir])
 
106
  formatted_output += "Error in file data format.\n"
107
  return formatted_output
108
 
109
+ def extract_and_display(url):
110
+ extracted_content = extract_repo_content(url, hf_token, hf_user)
 
 
 
 
 
 
 
 
111
  formatted_output = format_output(extracted_content, url)
112
  return formatted_output
113
 
114
+ app = gr.Blocks(theme="sudeepshouche/minimalist")
115
 
116
  with app:
117
  gr.Markdown("# Hugging Face Space / Model Repository Content Extractor")
 
125
  ],
126
  inputs=url_input
127
  )
128
+ output_display = gr.Textbox(label="Extracted Repository Content", show_copy_button=True, lines=20, placeholder="Repository content will be extracted here...\n\nMetadata is captured for all files, but text content provided only for files less than 32 kb\n\n\n\nReview and search through the content here OR simply copy it for offline analysis!!. πŸ€–")
 
 
129
  extract_button = gr.Button("Extract Content")
130
 
131
+ extract_button.click(fn=extract_and_display, inputs=url_input, outputs=output_display)
132
 
133
  app.launch()