Spaces:
Running
on
Zero
Running
on
Zero
Update file_utils.py
Browse files- file_utils.py +38 -44
file_utils.py
CHANGED
@@ -1,49 +1,43 @@
|
|
1 |
import os
|
2 |
-
|
3 |
-
|
4 |
|
5 |
-
def
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
hf_token = os.getenv("HF_TOKEN")
|
22 |
-
hf_user = os.getenv("SPACE_AUTHOR_NAME")
|
23 |
-
if not hf_token or not hf_user:
|
24 |
-
return "Error: HF_TOKEN or SPACE_AUTHOR_NAME environment variable is not set."
|
25 |
-
extracted_content = extract_repo_content(url, hf_token, hf_user)
|
26 |
-
formatted_output = format_output(extracted_content, url)
|
27 |
-
return formatted_output
|
28 |
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
["https://huggingface.co/google/paligemma-3b-mix-224"],
|
38 |
-
["https://huggingface.co/microsoft/Phi-3-vision-128k-instruct"],
|
39 |
-
["https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf"]
|
40 |
-
],
|
41 |
-
inputs=url_input
|
42 |
-
)
|
43 |
-
output_display = gr.Textbox(label="Extracted Repository Content", show_copy_button=True, lines=20, placeholder="Repository content will be extracted here...\n\nMetadata is captured for all files, but text content provided only for files less than 32 kb\n\n\n\nReview and search through the content here OR simply copy it for offline analysis!!. 🤖")
|
44 |
-
extract_button = gr.Button("Extract Content")
|
45 |
-
|
46 |
-
extract_button.click(fn=extract_and_display, inputs=url_input, outputs=output_display)
|
47 |
|
48 |
-
|
49 |
-
|
|
|
|
1 |
import os
|
2 |
+
from magika import Magika
|
3 |
+
import datetime
|
4 |
|
5 |
+
def validate_file_types(directory):
    """Identify the content type of every file under *directory* with Magika.

    The tree is walked recursively. Git metadata directories (directories
    named exactly ``.git``) are pruned from the walk, so their contents are
    neither visited nor reported.

    Args:
        directory: Root directory to scan.

    Returns:
        A dict mapping each visited file path (``root`` joined with the file
        name) to the label Magika reports for it, or to an
        ``"Error: <message>"`` string when the file could not be read or
        identified.
    """
    m = Magika()
    file_types = {}
    for root, dirs, files in os.walk(directory):
        # Prune in place so os.walk never descends into Git metadata.
        # Matching the exact directory name (instead of a substring test on
        # the whole path) keeps unrelated paths such as `.github/` or a
        # checkout located at `something.git/` in the results.
        dirs[:] = [d for d in dirs if d != '.git']
        for file_name in files:
            file_path = os.path.join(root, file_name)
            try:
                with open(file_path, 'rb') as file:
                    file_bytes = file.read()
                result = m.identify_bytes(file_bytes)
                # NOTE(review): `ct_label` appears to be the attribute name in
                # older Magika releases; confirm against the pinned version.
                file_types[file_path] = result.output.ct_label
            except Exception as e:
                # Deliberately best-effort: one unreadable or unidentifiable
                # file is recorded as an error string instead of aborting the
                # whole scan.
                file_types[file_path] = f"Error: {str(e)}"
    return file_types
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
+
def get_file_summary(file_path, file_type):
    """Build a metadata summary for a single file.

    Args:
        file_path: Path of the file to describe.
        file_type: Type label to store in the summary; passed through as-is.

    Returns:
        A dict with the keys ``name`` (path relative to the current working
        directory), ``type``, ``size`` (bytes), ``creation_date`` and
        ``modification_date`` (both formatted as ``YYYY-MM-DD HH:MM:SS UTC``).

    Raises:
        OSError: If the file cannot be stat'ed.

    Note:
        ``creation_date`` is derived from ``st_ctime``. On Unix that is the
        time of the last metadata change, not the creation time; only on
        Windows does it represent creation.
    """
    # A single stat call yields a consistent snapshot of size and timestamps
    # instead of three independent filesystem lookups.
    stats = os.stat(file_path)

    def _format_utc(timestamp):
        # Timezone-aware conversion; datetime.utcfromtimestamp() is deprecated
        # since Python 3.12. The formatted output is unchanged.
        moment = datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc)
        return moment.strftime('%Y-%m-%d %H:%M:%S UTC')

    return {
        "name": os.path.relpath(file_path),
        "type": file_type,
        "size": stats.st_size,
        "creation_date": _format_utc(stats.st_ctime),
        "modification_date": _format_utc(stats.st_mtime),
    }
|
33 |
|
34 |
+
def read_file_content(file_path, max_size=32*1024):
    """Return the text of a file, truncated to at most *max_size* bytes.

    The file is decoded as UTF-8 with undecodable bytes dropped. Line endings
    are normalised to ``\\n``. When the file is larger than *max_size* bytes,
    only the first *max_size* bytes are decoded and a truncation marker is
    appended.

    Args:
        file_path: Path of the file to read.
        max_size: Maximum number of bytes of file content to return.

    Returns:
        The decoded content, followed by ``"\\n... [Content Truncated] ..."``
        if the file exceeded *max_size* bytes.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Read raw bytes so the limit is applied in the same unit as the file
    # size. Reading `max_size` *characters* in text mode could return up to
    # four times as many bytes, or flag a fully returned file as truncated.
    # One extra byte is enough to know whether anything was cut off.
    with open(file_path, "rb") as file:
        raw = file.read(max_size + 1)

    truncated = len(raw) > max_size
    # errors="ignore" also discards a multi-byte sequence split by the cut.
    text = raw[:max_size].decode("utf-8", errors="ignore")
    # Keep the newline translation that text-mode reading used to perform.
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    if truncated:
        return text + "\n... [Content Truncated] ..."
    return text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
+
def summarize_content(content):
    """Placeholder for content summarization; not implemented yet.

    The *content* argument is currently ignored and the function always
    returns ``None``.
    """
    # TODO: add the summarization logic here.
    return None
|