Spaces:

dwb2023
/

hf_extractor

Running on Zero

App Files Files Community

dwb2023 committed on Jun 15, 2024

Commit

86f4186

verified ·

1 Parent(s): 72394b9

Update file_utils.py

Browse files

Files changed (1) hide show

file_utils.py +49 -0

file_utils.py CHANGED Viewed

	@@ -0,0 +1,49 @@

+import os
+from magika import Magika
+from transformers import pipeline
# Shared summarization pipeline used by summarize_content(). It is built once,
# when this module is imported. No model name is passed, so the choice of
# model is left to the transformers library.
summarizer = pipeline("summarization")
# File-type labels this module treats as supported. NOTE(review): presumably
# these are Magika content-type labels; nothing in the visible code reads
# this list, so confirm against the callers.
SUPPORTED_FILE_TYPES = [
    "txt",
    "shell",
    "python",
    "markdown",
    "yaml",
    "json",
    "csv",
    "tsv",
    "xml",
    "html",
    "ini",
    "jsonl",
    "ipynb",
]
def validate_file_types(directory):
    """Identify the content type of every file under *directory* with Magika.

    The tree rooted at ``directory`` is walked recursively. Directories named
    exactly ``.git`` are pruned from the walk; everything else is scanned.

    Args:
        directory: Root directory to scan.

    Returns:
        A dict mapping each file path (the walk root joined with the file
        name) to the ``ct_label`` Magika reports for its bytes, or to an
        ``"Error: <message>"`` string when the file could not be read or
        identified.
    """
    m = Magika()
    file_types = {}
    for root, dirs, files in os.walk(directory):
        # Prune by exact directory name. A substring test on the full path
        # would also drop look-alikes such as ``.github`` or ``repo.git``,
        # and would still make os.walk descend into the whole .git tree.
        dirs[:] = [d for d in dirs if d != ".git"]
        for file_name in files:
            file_path = os.path.join(root, file_name)
            try:
                with open(file_path, "rb") as file:
                    file_bytes = file.read()
                result = m.identify_bytes(file_bytes)
                file_types[file_path] = result.output.ct_label
            except Exception as e:
                # Deliberately broad: this is a best-effort scan, so one
                # unreadable or unidentifiable file is reported in the result
                # instead of aborting the whole walk.
                file_types[file_path] = f"Error: {e}"
    return file_types
def get_file_summary(file_path, file_type):
    """Build a metadata summary for a single file.

    Args:
        file_path: Path of the file to describe.
        file_type: Type label to record for the file; stored unchanged.

    Returns:
        A dict with the keys ``name`` (path relative to the current working
        directory), ``type``, ``size`` (bytes), ``creation_date`` and
        ``modification_date`` (both POSIX timestamps as floats).

    Raises:
        OSError: If ``file_path`` cannot be stat'ed (for example, it does
            not exist).
    """
    # A single stat call gives one consistent snapshot of size and
    # timestamps, instead of three separate lookups of the same file.
    stats = os.stat(file_path)
    try:
        name = os.path.relpath(file_path)
    except ValueError:
        # On Windows no relative path exists between different drives; keep
        # the path as given rather than failing the whole summary.
        name = file_path
    return {
        "name": name,
        "type": file_type,
        "size": stats.st_size,
        # st_ctime is the creation time on Windows but the last metadata
        # change on Unix, so "creation_date" is only approximate there.
        "creation_date": stats.st_ctime,
        "modification_date": stats.st_mtime,
    }
def read_file_content(file_path, max_size=32*1024):
    """Return the text of *file_path*, truncated to *max_size* characters.

    The file is decoded as UTF-8 and undecodable bytes are silently dropped
    (``errors="ignore"``).

    Args:
        file_path: Path of the file to read.
        max_size: Maximum number of characters to return before the
            truncation marker is appended. Defaults to 32 * 1024.

    Returns:
        The full content when it has at most ``max_size`` characters;
        otherwise the first ``max_size`` characters followed by
        ``"\\n... [Content Truncated] ..."``.
    """
    # Reading one character past the limit is enough to know whether the
    # content must be truncated, so a huge file is never loaded in full.
    # A negative limit keeps the read unbounded, as it always was.
    read_limit = max_size + 1 if max_size >= 0 else -1
    with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
        content = file.read(read_limit)
    if len(content) > max_size:
        return content[:max_size] + "\n... [Content Truncated] ..."
    return content
def summarize_content(content, max_chunk_size=1000):
    """Summarize *content* chunk by chunk and join the partial summaries.

    The text is cut into consecutive slices of ``max_chunk_size`` characters.
    Each non-blank slice is passed to the module-level ``summarizer`` and the
    resulting ``summary_text`` values are joined with single spaces.

    Args:
        content: Text to summarize.
        max_chunk_size: Slice length in characters (not tokens). Defaults
            to 1000.

    Returns:
        The space-joined chunk summaries, or ``""`` when ``content`` is empty
        or contains only whitespace.

    Raises:
        ValueError: If ``max_chunk_size`` is smaller than 1.
    """
    if max_chunk_size < 1:
        raise ValueError("max_chunk_size must be a positive integer")
    chunks = [
        content[start:start + max_chunk_size]
        for start in range(0, len(content), max_chunk_size)
    ]
    summaries = [
        # The chunk size is measured in characters while the model limit is
        # in tokens. truncation=True asks the pipeline to clip a chunk that
        # still tokenizes past the model's maximum input length.
        summarizer(chunk, truncation=True)[0]["summary_text"]
        for chunk in chunks
        # Whitespace-only chunks carry nothing to summarize; skip them
        # rather than spending a model call on them.
        if chunk.strip()
    ]
    return " ".join(summaries)