dwb2023 committed on
Commit
a7fb4a4
1 Parent(s): f8ce262

Update file_utils.py

Browse files
Files changed (1) hide show
  1. file_utils.py +19 -17
file_utils.py CHANGED
@@ -1,7 +1,22 @@
1
  import os
2
  from magika import Magika
3
 
4
- SUPPORTED_FILE_TYPES = ["txt", "shell", "python", "markdown", "yaml", "json", "csv", "tsv", "xml", "html", "ini", "jsonl", "ipynb"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  def get_file_summary(file_path, file_type):
7
  size = os.path.getsize(file_path)
@@ -20,19 +35,6 @@ def read_file_content(file_path, max_size=32*1024):
20
  else:
21
  return file.read()
22
 
23
- def validate_file_types(directory):
24
- m = Magika()
25
- file_types = {}
26
- for root, _, files in os.walk(directory):
27
- if '.git' in root:
28
- continue
29
- for file_name in files:
30
- file_path = os.path.join(root, file_name)
31
- try:
32
- with open(file_path, 'rb') as file:
33
- file_bytes = file.read()
34
- result = m.identify_bytes(file_bytes)
35
- file_types[file_path] = result.output.ct_label
36
- except Exception as e:
37
- file_types[file_path] = f"Error: {str(e)}"
38
- return file_types
 
1
  import os
2
  from magika import Magika
3
 
4
def validate_file_types(directory):
    """Detect the content type of every file under *directory* with Magika.

    The tree is walked top-down. Git metadata directories (directories named
    exactly ``.git``) are pruned from the walk, so their contents are never
    opened or classified.

    Args:
        directory: Path of the directory tree to scan.

    Returns:
        A dict mapping each visited file path (``root`` joined with the file
        name) to the label found at ``result.output.ct_label`` for that
        file's bytes, or to the string ``"Error: <message>"`` when the file
        could not be read or identified.
    """
    m = Magika()
    file_types = {}
    for root, dirs, files in os.walk(directory):
        # Prune in place so os.walk never descends into the Git metadata
        # directory. Matching the exact directory name (instead of testing
        # for the substring '.git' anywhere in the path) keeps directories
        # such as '.github' in the scan, and keeps the scan working when
        # the tree itself lives under a path like 'project.git/'.
        dirs[:] = [d for d in dirs if d != '.git']
        for file_name in files:
            file_path = os.path.join(root, file_name)
            try:
                with open(file_path, 'rb') as file:
                    file_bytes = file.read()
                result = m.identify_bytes(file_bytes)
                file_types[file_path] = result.output.ct_label
            except Exception as e:
                # Deliberate best-effort: one unreadable or unidentifiable
                # file is recorded as an error entry rather than aborting
                # the whole scan.
                file_types[file_path] = f"Error: {str(e)}"
    return file_types
20
 
21
  def get_file_summary(file_path, file_type):
22
  size = os.path.getsize(file_path)
 
35
  else:
36
  return file.read()
37
 
38
def summarize_content(content):
    """Placeholder for content summarization; not implemented yet.

    Args:
        content: The content to summarize. Currently ignored.

    Returns:
        Always ``None`` until summarization logic is added.
    """
    # Implement your summarization logic here, potentially using transformers
    return None