Spaces:

dwb2023
/

hf_extractor

Running on Zero

App Files Files Community

hf_extractor / app.py

dwb2023

Update app.py

56dbc6b verified 5 months ago

raw

history blame

5.7 kB

	import os
	import subprocess
	import gradio as gr
	from magika import Magika
	from huggingface_hub import login

	# Credentials are supplied through the environment: the access token and
	# the name of the Space author.
	hf_token = os.environ.get("HF_TOKEN")
	hf_user = os.environ.get("SPACE_AUTHOR_NAME")

	# Refuse to start when either value is missing or empty.
	if not hf_token:
	    raise ValueError("HF_TOKEN environment variable is not set")
	if not hf_user:
	    raise ValueError("SPACE_AUTHOR_NAME environment variable is not set")

	# Detected content-type labels whose text content is included in the output.
	SUPPORTED_FILE_TYPES = [
	    "txt",
	    "shell",
	    "python",
	    "markdown",
	    "yaml",
	    "json",
	    "csv",
	    "tsv",
	    "xml",
	    "html",
	    "ini",
	    "jsonl",
	    "ipynb",
	]

	def validate_url(url):
	    """Return True when *url* is a string that begins with ``https://``.

	    Values that are not ``str`` (for example ``None``) are rejected with
	    ``False`` rather than raising ``AttributeError``.
	    """
	    return isinstance(url, str) and url.startswith('https://')

	def clone_repo(url, repo_dir, hf_token, hf_user):
	    """Clone the git repository at *url* into *repo_dir*.

	    The clone runs with ``GIT_LFS_SKIP_SMUDGE=1`` so Git LFS content is not
	    downloaded during checkout.

	    Credentials are attached only when *url* is an ``https`` URL whose host is
	    ``huggingface.co`` (or a subdomain of it). The URL is supplied by the user,
	    so sending the token to any other host would leak it. Other URLs are
	    cloned anonymously.

	    Args:
	        url: Repository URL to clone.
	        repo_dir: Destination directory for the checkout.
	        hf_token: Access token used as the HTTP password.
	        hf_user: User name used as the HTTP user.

	    Returns:
	        ``(True, None)`` on success, otherwise ``(False, message)`` where
	        *message* is git's stderr with the token redacted, or a description
	        of why git could not be started.
	    """
	    from urllib.parse import quote, urlsplit, urlunsplit

	    # Percent-encode so characters such as "@" or ":" cannot break the URL.
	    quoted_user = quote(str(hf_user), safe="")
	    quoted_token = quote(str(hf_token), safe="")

	    clone_url = url
	    try:
	        parts = urlsplit(url)
	        host = (parts.hostname or "").lower()
	        port = parts.port
	    except ValueError:
	        # Malformed authority or port: treat as untrusted and send no credentials.
	        parts = None
	    if parts is not None and parts.scheme == "https":
	        if host == "huggingface.co" or host.endswith(".huggingface.co"):
	            # Rebuild the authority from the parsed host so the credentials
	            # can only ever be bound to the host that was checked above.
	            netloc = f"{quoted_user}:{quoted_token}@{host}"
	            if port is not None:
	                netloc += f":{port}"
	            clone_url = urlunsplit(
	                (parts.scheme, netloc, parts.path, parts.query, parts.fragment)
	            )

	    env = os.environ.copy()
	    env['GIT_LFS_SKIP_SMUDGE'] = '1'
	    # Never block waiting for interactive credentials.
	    env['GIT_TERMINAL_PROMPT'] = '0'

	    try:
	        # "--" stops git from interpreting the URL or path as an option.
	        result = subprocess.run(
	            ["git", "clone", "--", clone_url, repo_dir],
	            env=env,
	            capture_output=True,
	            text=True,
	        )
	    except OSError as exc:
	        # e.g. the git executable is not installed.
	        return False, f"Unable to run git: {exc}"

	    if result.returncode != 0:
	        # git may echo the credentialed URL; never hand the token to the caller.
	        message = result.stderr or ""
	        for secret in (str(hf_token), quoted_token):
	            if secret:
	                message = message.replace(secret, "***")
	        return False, message
	    return True, None

	def get_file_summary(file_path, file_type):
	    """Build the metadata header describing one file.

	    Args:
	        file_path: Path of the file to describe.
	        file_type: Content-type label to record for the file.

	    Returns:
	        A dict with the keys ``name`` (path relative to the current working
	        directory), ``type``, ``size`` (bytes), ``creation_date`` (the file's
	        ``st_ctime``) and ``modification_date`` (the file's ``st_mtime``).
	    """
	    info = os.stat(file_path)
	    return dict(
	        name=os.path.relpath(file_path),
	        type=file_type,
	        size=info.st_size,
	        creation_date=info.st_ctime,
	        modification_date=info.st_mtime,
	    )

	def read_file_content(file_path, max_size=32*1024):
	    """Read a text file, returning at most *max_size* characters.

	    The file is decoded as UTF-8 and undecodable bytes are dropped. When more
	    text remains after *max_size* characters, a truncation marker is appended.

	    Args:
	        file_path: Path of the file to read.
	        max_size: Maximum number of characters to return before truncating.

	    Returns:
	        The file's text, followed by ``"\\n... [Content Truncated] ..."`` only
	        when content was actually left unread.
	    """
	    with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
	        text = file.read(max_size)
	        # Probe for leftover text instead of comparing the on-disk byte size
	        # with a character count: a multi-byte file can exceed max_size bytes
	        # and still fit entirely, and must not be labelled as truncated.
	        if file.read(1):
	            return text + "\n... [Content Truncated] ..."
	        return text

	def validate_file_types(directory):
	    """Detect the content type of every regular file under *directory*.

	    The ``.git`` metadata directory is not visited, and symbolic links are
	    skipped so that a checkout cannot make the caller read files that live
	    outside of it.

	    Args:
	        directory: Root directory to scan recursively.

	    Returns:
	        A dict mapping each file path to the label reported by Magika, or to
	        ``"Error: <message>"`` when the file could not be read or identified.
	    """
	    m = Magika()
	    file_types = {}
	    for root, dirs, files in os.walk(directory):
	        # Prune by exact directory name, in place, so os.walk never descends
	        # into git metadata. A substring test on the path would also drop
	        # unrelated directories such as ".github".
	        dirs[:] = [d for d in dirs if d != '.git']
	        for file_name in files:
	            file_path = os.path.join(root, file_name)
	            if os.path.islink(file_path):
	                # A link may point anywhere on the host (or nowhere at all).
	                continue
	            try:
	                with open(file_path, 'rb') as file:
	                    file_bytes = file.read()
	                result = m.identify_bytes(file_bytes)
	                file_types[file_path] = result.output.ct_label
	            except Exception as e:
	                # Best effort: record the failure and keep scanning.
	                file_types[file_path] = f"Error: {str(e)}"
	    return file_types

	def extract_repo_content(url, hf_token, hf_user):
	    """Clone the repository at *url* and collect metadata and text per file.

	    The repository is cloned into ``./temp_repo``, which is removed before the
	    clone and again afterwards, including when processing fails part-way.

	    Args:
	        url: Repository URL; must pass ``validate_url``.
	        hf_token: Access token forwarded to ``clone_repo``.
	        hf_user: User name forwarded to ``clone_repo``.

	    Returns:
	        A list of ``{"header": {...}, "content": str}`` dicts, one per file.
	        For an invalid URL or a failed clone the list holds a single entry
	        whose header has ``name == "Error"`` and whose content explains the
	        failure.
	    """
	    import shutil

	    def error_result(message):
	        # The header carries the same keys as a real file header so that
	        # consumers can treat every entry uniformly.
	        header = {
	            "name": "Error",
	            "type": "error",
	            "size": 0,
	            "creation_date": None,
	            "modification_date": None,
	        }
	        return [{"header": header, "content": message}]

	    if not validate_url(url):
	        return error_result("Invalid URL")

	    repo_dir = "./temp_repo"
	    max_size = 32 * 1024
	    # Drop any checkout left behind by an earlier run.
	    shutil.rmtree(repo_dir, ignore_errors=True)

	    try:
	        success, error = clone_repo(url, repo_dir, hf_token, hf_user)
	        if not success:
	            return error_result(f"Failed to clone repository: {error}")

	        extracted_content = []
	        for file_path, file_type in validate_file_types(repo_dir).items():
	            try:
	                file_summary = get_file_summary(file_path, file_type)
	            except OSError as e:
	                # One unreadable entry must not abort the whole extraction.
	                extracted_content.append({
	                    "header": {
	                        "name": os.path.relpath(file_path),
	                        "type": file_type,
	                        "size": 0,
	                        "creation_date": None,
	                        "modification_date": None,
	                    },
	                    "content": f"Failed to read file metadata: {str(e)}",
	                })
	                continue

	            content = {"header": file_summary}
	            if file_type in SUPPORTED_FILE_TYPES and file_summary["size"] <= max_size:
	                try:
	                    content["content"] = read_file_content(file_path)
	                except Exception as e:
	                    content["content"] = f"Failed to read file content: {str(e)}"
	            else:
	                content["content"] = "File too large or binary, content not captured."
	            extracted_content.append(content)

	        return extracted_content
	    finally:
	        # Always remove the checkout, even when cloning or reading raised.
	        shutil.rmtree(repo_dir, ignore_errors=True)

	def format_output(extracted_content, repo_url):
	    """Render extracted repository content as a single text report.

	    Args:
	        extracted_content: Iterable of ``{"header": {...}, "content": ...}``
	            dicts. Items that are not dicts with a ``header`` key are reported
	            as malformed.
	        repo_url: Repository URL shown on the first line of the report.

	    Returns:
	        The report text. The ``Created`` / ``Modified`` lines are emitted only
	        when the header provides those values, so error entries (whose header
	        has no dates) are rendered instead of raising ``KeyError``.
	    """
	    pieces = [f"# Repository URL: {repo_url}\n\n"]
	    for file_data in extracted_content:
	        if not (isinstance(file_data, dict) and 'header' in file_data):
	            pieces.append("Error in file data format.\n")
	            continue

	        header = file_data['header']
	        pieces.append(f"### File: {header['name']}\n")
	        pieces.append(f"Type: {header['type']}\n")
	        pieces.append(f"Size: {header['size']} bytes\n")

	        # Dates are optional: error entries do not carry them.
	        created = header.get('creation_date')
	        if created is not None:
	            pieces.append(f"Created: {created}\n")
	        modified = header.get('modification_date')
	        if modified is not None:
	            pieces.append(f"Modified: {modified}\n")

	        pieces.append("#### Content:\n")
	        pieces.append(f"```\n{file_data['content']}\n```\n\n")
	    # Join once instead of growing a string with repeated "+=".
	    return "".join(pieces)

	def extract_and_display(url):
	    """Extract the repository at *url* and return it as a formatted report.

	    Uses the module-level ``hf_token`` and ``hf_user`` for the extraction.
	    """
	    return format_output(extract_repo_content(url, hf_token, hf_user), url)

	# Build the UI: a URL box with examples, an output box and an extract button.
	with gr.Blocks(theme="sudeepshouche/minimalist") as app:
	    gr.Markdown("# Hugging Face Space / Model Repository Content Extractor")
	    url_input = gr.Textbox(
	        label="https:// URL of Repository",
	        placeholder="Enter the repository URL here OR select an example below...",
	    )
	    url_examples = gr.Examples(
	        examples=[
	            ["https://huggingface.co/spaces/big-vision/paligemma-hf"],
	            ["https://huggingface.co/google/paligemma-3b-mix-224"],
	            ["https://huggingface.co/microsoft/Phi-3-vision-128k-instruct"],
	            ["https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf"],
	        ],
	        inputs=url_input,
	    )
	    output_display = gr.Textbox(
	        label="Extracted Repository Content",
	        show_copy_button=True,
	        lines=20,
	        placeholder=(
	            "Repository content will be extracted here...\n\n"
	            "Metadata is captured for all files, but text content provided only for files less than 32 kb\n\n\n\n"
	            "Review and search through the content here OR simply copy it for offline analysis!!. 🤖"
	        ),
	    )
	    extract_button = gr.Button("Extract Content")

	    # Clicking the button runs the extraction and shows the report.
	    extract_button.click(fn=extract_and_display, inputs=url_input, outputs=output_display)

	app.launch()