Spaces:

dwb2023
/

hf_extractor

Running on Zero

App Files Files Community

dwb2023 committed on Jun 15, 2024

Commit

56dbc6b

verified ·

1 Parent(s): fb7f43d

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -9

app.py CHANGED Viewed

@@ -13,10 +13,7 @@ if not hf_token:
 if not hf_user:
     raise ValueError("SPACE_AUTHOR_NAME environment variable is not set")
-# Perform login using the token
-# login(token=hf_token, add_to_git_credential=True)
-SUPPORTED_FILE_TYPES = ["txt", "shell", "python", "markdown", "yaml", "json", "csv", "tsv", "xml", "html", "ini"]
 def validate_url(url):
     return url.startswith('https://')
@@ -24,7 +21,6 @@ def validate_url(url):
 def clone_repo(url, repo_dir, hf_token, hf_user):
     env = os.environ.copy()
     env['GIT_LFS_SKIP_SMUDGE'] = '1'
-    # Construct the Git URL with the token and author name for authentication
     token_url = url.replace('https://', f'https://{hf_user}:{hf_token}@')
     result = subprocess.run(["git", "clone", token_url, repo_dir], env=env, capture_output=True, text=True)
     if result.returncode != 0:
@@ -37,11 +33,16 @@ def get_file_summary(file_path, file_type):
         "name": os.path.relpath(file_path),
         "type": file_type,
         "size": size,
     }
-def read_file_content(file_path):
     with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
-        return file.read()
 def validate_file_types(directory):
     m = Magika()
@@ -88,7 +89,6 @@ def extract_repo_content(url, hf_token, hf_user):
         extracted_content.append(content)
-    # Cleanup temporary directory
     subprocess.run(["rm", "-rf", repo_dir])
     return extracted_content
@@ -100,6 +100,8 @@ def format_output(extracted_content, repo_url):
             formatted_output += f"### File: {file_data['header']['name']}\n"
             formatted_output += f"**Type:** {file_data['header']['type']}\n"
             formatted_output += f"**Size:** {file_data['header']['size']} bytes\n"
             formatted_output += "#### Content:\n"
             formatted_output += f"```\n{file_data['content']}\n```\n\n"
         else:
@@ -130,4 +132,4 @@ with app:
     extract_button.click(fn=extract_and_display, inputs=url_input, outputs=output_display)
-app.launch()

 if not hf_user:
     raise ValueError("SPACE_AUTHOR_NAME environment variable is not set")
+SUPPORTED_FILE_TYPES = ["txt", "shell", "python", "markdown", "yaml", "json", "csv", "tsv", "xml", "html", "ini", "jsonl", "ipynb"]
 def validate_url(url):
     return url.startswith('https://')
 def clone_repo(url, repo_dir, hf_token, hf_user):
     env = os.environ.copy()
     env['GIT_LFS_SKIP_SMUDGE'] = '1'
     token_url = url.replace('https://', f'https://{hf_user}:{hf_token}@')
     result = subprocess.run(["git", "clone", token_url, repo_dir], env=env, capture_output=True, text=True)
     if result.returncode != 0:
         "name": os.path.relpath(file_path),
         "type": file_type,
         "size": size,
+        "creation_date": os.path.getctime(file_path),
+        "modification_date": os.path.getmtime(file_path)
     }
+def read_file_content(file_path, max_size=32*1024):
     with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
+        if os.path.getsize(file_path) > max_size:
+            return file.read(max_size) + "\n... [Content Truncated] ..."
+        else:
+            return file.read()
 def validate_file_types(directory):
     m = Magika()
         extracted_content.append(content)
     subprocess.run(["rm", "-rf", repo_dir])
     return extracted_content
             formatted_output += f"### File: {file_data['header']['name']}\n"
             formatted_output += f"**Type:** {file_data['header']['type']}\n"
             formatted_output += f"**Size:** {file_data['header']['size']} bytes\n"
+            formatted_output += f"**Created:** {file_data['header']['creation_date']}\n"
+            formatted_output += f"**Modified:** {file_data['header']['modification_date']}\n"
             formatted_output += "#### Content:\n"
             formatted_output += f"```\n{file_data['content']}\n```\n\n"
         else:
     extract_button.click(fn=extract_and_display, inputs=url_input, outputs=output_display)
+app.launch()