dwb2023 committed on
Commit
4b78f6f
1 Parent(s): 0efb4bc

Update file_utils.py

Browse files
Files changed (1) hide show
  1. file_utils.py +44 -42
file_utils.py CHANGED
@@ -1,49 +1,51 @@
1
  import os
2
- from magika import Magika
3
- from transformers import pipeline
4
 
5
- # Initialize the summarization pipeline
6
- summarizer = pipeline("summarization")
7
 
8
- SUPPORTED_FILE_TYPES = ["txt", "shell", "python", "markdown", "yaml", "json", "csv", "tsv", "xml", "html", "ini", "jsonl", "ipynb"]
 
 
 
 
 
 
 
9
 
10
def validate_file_types(directory):
    """Detect the content type of every file under *directory* using Magika.

    Paths containing ``.git`` are skipped.  Files that cannot be read map to
    an ``"Error: ..."`` string instead of a type label, so one bad file never
    aborts the whole walk.
    """
    identifier = Magika()
    detected = {}
    for root, _, filenames in os.walk(directory):
        # Skip the repository's git metadata tree.
        if '.git' in root:
            continue
        for filename in filenames:
            path = os.path.join(root, filename)
            try:
                with open(path, 'rb') as handle:
                    outcome = identifier.identify_bytes(handle.read())
                detected[path] = outcome.output.ct_label
            except Exception as exc:
                detected[path] = f"Error: {str(exc)}"
    return detected
27
def get_file_summary(file_path, file_type):
    """Build a metadata record for one file.

    Returns a dict with the file's repo-relative name, detected type, size in
    bytes, and creation/modification timestamps (epoch seconds).
    """
    stats = os.stat(file_path)
    return {
        "name": os.path.relpath(file_path),
        "type": file_type,
        "size": stats.st_size,
        "creation_date": stats.st_ctime,
        "modification_date": stats.st_mtime,
    }
37
def read_file_content(file_path, max_size=32*1024):
    """Return the text content of *file_path*, truncated to *max_size* chars.

    The file is decoded as UTF-8 with undecodable bytes ignored.  When the
    content exceeds *max_size* characters it is cut off and a truncation
    marker is appended.

    Args:
        file_path: path of the file to read.
        max_size: maximum number of characters to return (default 32 KiB).
    """
    with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
        # Read one character past the limit so we can tell whether the file
        # was longer, without loading arbitrarily large files into memory
        # (the original read the whole file and then truncated).
        content = file.read(max_size + 1)
    if len(content) > max_size:
        return content[:max_size] + "\n... [Content Truncated] ..."
    return content
45
def summarize_content(content):
    """Summarize *content* with the module-level pipeline.

    The text is split into fixed-size character chunks (the model has a
    bounded input length), each chunk is summarized independently, and the
    per-chunk summaries are joined with spaces.
    """
    chunk_size = 1000  # max input size for the summarization model
    pieces = []
    for start in range(0, len(content), chunk_size):
        chunk = content[start:start + chunk_size]
        pieces.append(summarizer(chunk)[0]['summary_text'])
    return " ".join(pieces)
import os
import shutil
import subprocess

from file_utils import (
    SUPPORTED_FILE_TYPES,
    get_file_summary,
    read_file_content,
    summarize_content,
    validate_file_types,
)
5
def validate_url(url):
    """Accept only HTTPS URLs (anything else is rejected)."""
    prefix = 'https://'
    return url[:len(prefix)] == prefix
8
def clone_repo(url, repo_dir, hf_token, hf_user):
    """Clone *url* into *repo_dir*, authenticating with HF credentials.

    The user/token pair is embedded into the HTTPS remote URL; LFS smudge is
    disabled so large blobs are not downloaded.

    Returns:
        (True, None) on success, (False, error_message) on failure.
    """
    env = os.environ.copy()
    env['GIT_LFS_SKIP_SMUDGE'] = '1'  # skip downloading LFS payloads
    token_url = url.replace('https://', f'https://{hf_user}:{hf_token}@')
    result = subprocess.run(["git", "clone", token_url, repo_dir],
                            env=env, capture_output=True, text=True)
    if result.returncode != 0:
        # git echoes the remote URL (credentials included) into stderr on
        # failure — redact the token so it never leaks into logs or the UI.
        stderr = result.stderr
        if hf_token:
            stderr = stderr.replace(f'{hf_user}:{hf_token}@', '***@')
        return False, stderr
    return True, None
17
def extract_repo_content(url, hf_token, hf_user):
    """Clone a repository and return a list of per-file content records.

    Each record is a dict with a ``"header"`` (name/type/size metadata from
    ``get_file_summary``) and, for supported small text files, ``"content"``
    and ``"summary"`` entries.  Validation or clone failures are reported as
    a single-element list containing an error record with the same shape.

    Args:
        url: HTTPS URL of the repository to clone.
        hf_token: access token used for authentication.
        hf_user: user name paired with the token.
    """
    if not validate_url(url):
        return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": "Invalid URL"}]

    repo_dir = "./temp_repo"
    # Portable cleanup of a stale clone (the original shelled out to `rm -rf`).
    if os.path.exists(repo_dir):
        shutil.rmtree(repo_dir, ignore_errors=True)

    success, error = clone_repo(url, repo_dir, hf_token, hf_user)
    if not success:
        return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": f"Failed to clone repository: {error}"}]

    try:
        file_types = validate_file_types(repo_dir)
        extracted_content = []
        for file_path, file_type in file_types.items():
            file_summary = get_file_summary(file_path, file_type)
            content = {"header": file_summary}

            # Only read/summarize small files of a supported text-like type;
            # everything else gets a placeholder.
            if file_type in SUPPORTED_FILE_TYPES and file_summary["size"] <= 32 * 1024:
                try:
                    file_content = read_file_content(file_path)
                    content["content"] = file_content
                    content["summary"] = summarize_content(file_content)
                except Exception as e:
                    # Best-effort: record the failure on this file and move on.
                    content["content"] = f"Failed to read file content: {str(e)}"
                    content["summary"] = ""
            else:
                content["content"] = "File too large or binary, content not captured."
                content["summary"] = ""

            extracted_content.append(content)
    finally:
        # Always remove the temporary clone, even if extraction raised mid-way.
        shutil.rmtree(repo_dir, ignore_errors=True)

    return extracted_content