dwb2023 committed on
Commit
4b78f6f
1 Parent(s): 0efb4bc

Update file_utils.py

Browse files
Files changed (1) hide show
  1. file_utils.py +44 -42
file_utils.py CHANGED
@@ -1,49 +1,51 @@
1
  import os
2
- from magika import Magika
3
- from transformers import pipeline
4
 
5
- # Initialize the summarization pipeline
6
- summarizer = pipeline("summarization")
7
 
8
- SUPPORTED_FILE_TYPES = ["txt", "shell", "python", "markdown", "yaml", "json", "csv", "tsv", "xml", "html", "ini", "jsonl", "ipynb"]
 
 
 
 
 
 
 
9
 
10
def validate_file_types(directory):
    """Detect the content type of every file under *directory* using Magika.

    Paths containing ``.git`` are skipped.  Files that cannot be read map to
    an ``"Error: ..."`` string instead of a type label, so one bad file never
    aborts the whole walk.
    """
    identifier = Magika()
    detected = {}
    for root, _, filenames in os.walk(directory):
        # Skip the repository's git metadata tree.
        if '.git' in root:
            continue
        for filename in filenames:
            path = os.path.join(root, filename)
            try:
                with open(path, 'rb') as handle:
                    outcome = identifier.identify_bytes(handle.read())
                detected[path] = outcome.output.ct_label
            except Exception as exc:
                detected[path] = f"Error: {str(exc)}"
    return detected
27
def get_file_summary(file_path, file_type):
    """Build a metadata record for one file.

    Returns a dict with the file's repo-relative name, detected type, size in
    bytes, and creation/modification timestamps (epoch seconds).
    """
    stats = os.stat(file_path)
    return {
        "name": os.path.relpath(file_path),
        "type": file_type,
        "size": stats.st_size,
        "creation_date": stats.st_ctime,
        "modification_date": stats.st_mtime,
    }
37
def read_file_content(file_path, max_size=32*1024):
    """Return the text content of *file_path*, truncated to *max_size* chars.

    The file is decoded as UTF-8 with undecodable bytes ignored.  When the
    content exceeds *max_size* characters it is cut off and a truncation
    marker is appended.

    Args:
        file_path: path of the file to read.
        max_size: maximum number of characters to return (default 32 KiB).
    """
    with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
        # Read one character past the limit so we can tell whether the file
        # was longer, without loading arbitrarily large files into memory
        # (the original read the whole file and then truncated).
        content = file.read(max_size + 1)
    if len(content) > max_size:
        return content[:max_size] + "\n... [Content Truncated] ..."
    return content
45
def summarize_content(content):
    """Summarize *content* with the module-level pipeline.

    The text is split into fixed-size character chunks (the model has a
    bounded input length), each chunk is summarized independently, and the
    per-chunk summaries are joined with spaces.
    """
    chunk_size = 1000  # max input size for the summarization model
    pieces = []
    for start in range(0, len(content), chunk_size):
        chunk = content[start:start + chunk_size]
        pieces.append(summarizer(chunk)[0]['summary_text'])
    return " ".join(pieces)
import os
import shutil
import subprocess

from file_utils import (
    SUPPORTED_FILE_TYPES,
    get_file_summary,
    read_file_content,
    summarize_content,
    validate_file_types,
)
5
def validate_url(url):
    """Accept only HTTPS URLs (anything else is rejected)."""
    prefix = 'https://'
    return url[:len(prefix)] == prefix
8
def clone_repo(url, repo_dir, hf_token, hf_user):
    """Clone *url* into *repo_dir*, authenticating with HF credentials.

    The user/token pair is embedded into the HTTPS remote URL; LFS smudge is
    disabled so large blobs are not downloaded.

    Returns:
        (True, None) on success, (False, error_message) on failure.
    """
    env = os.environ.copy()
    env['GIT_LFS_SKIP_SMUDGE'] = '1'  # skip downloading LFS payloads
    token_url = url.replace('https://', f'https://{hf_user}:{hf_token}@')
    result = subprocess.run(["git", "clone", token_url, repo_dir],
                            env=env, capture_output=True, text=True)
    if result.returncode != 0:
        # git echoes the remote URL (credentials included) into stderr on
        # failure — redact the token so it never leaks into logs or the UI.
        stderr = result.stderr
        if hf_token:
            stderr = stderr.replace(f'{hf_user}:{hf_token}@', '***@')
        return False, stderr
    return True, None
17
def extract_repo_content(url, hf_token, hf_user):
    """Clone a repository and return a list of per-file content records.

    Each record is a dict with a ``"header"`` (name/type/size metadata from
    ``get_file_summary``) and, for supported small text files, ``"content"``
    and ``"summary"`` entries.  Validation or clone failures are reported as
    a single-element list containing an error record with the same shape.

    Args:
        url: HTTPS URL of the repository to clone.
        hf_token: access token used for authentication.
        hf_user: user name paired with the token.
    """
    if not validate_url(url):
        return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": "Invalid URL"}]

    repo_dir = "./temp_repo"
    # Portable cleanup of a stale clone (the original shelled out to `rm -rf`).
    if os.path.exists(repo_dir):
        shutil.rmtree(repo_dir, ignore_errors=True)

    success, error = clone_repo(url, repo_dir, hf_token, hf_user)
    if not success:
        return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": f"Failed to clone repository: {error}"}]

    try:
        file_types = validate_file_types(repo_dir)
        extracted_content = []
        for file_path, file_type in file_types.items():
            file_summary = get_file_summary(file_path, file_type)
            content = {"header": file_summary}

            # Only read/summarize small files of a supported text-like type;
            # everything else gets a placeholder.
            if file_type in SUPPORTED_FILE_TYPES and file_summary["size"] <= 32 * 1024:
                try:
                    file_content = read_file_content(file_path)
                    content["content"] = file_content
                    content["summary"] = summarize_content(file_content)
                except Exception as e:
                    # Best-effort: record the failure on this file and move on.
                    content["content"] = f"Failed to read file content: {str(e)}"
                    content["summary"] = ""
            else:
                content["content"] = "File too large or binary, content not captured."
                content["summary"] = ""

            extracted_content.append(content)
    finally:
        # Always remove the temporary clone, even if extraction raised mid-way.
        shutil.rmtree(repo_dir, ignore_errors=True)

    return extracted_content