Spaces:

dwb2023
/

hf_extractor

Running on Zero

App Files Files Community

hf_extractor / app.py

dwb2023

Update app.py

e33200c verified 6 months ago

raw

history blame

3.95 kB

	import os
	import subprocess
	import tempfile

	import gradio as gr
	from magika import Magika

	# Detected content-type labels whose file content is captured in the report;
	# files detected as any other type are listed with a header only.
	SUPPORTED_FILE_TYPES = [
	    "txt",
	    "python",
	    "markdown",
	    "yaml",
	    "json",
	    "csv",
	    "tsv",
	    "xml",
	    "html",
	]

	def validate_url(url):
	    """Return True when *url* is a string that begins with ``https://``.

	    Non-string values (for example ``None``) are rejected with ``False``
	    rather than raising ``AttributeError``.
	    """
	    return isinstance(url, str) and url.startswith('https://')

	def clone_repo(url, repo_dir):
	    """Clone the git repository at *url* into *repo_dir*.

	    Args:
	        url: Repository location handed to ``git clone``.
	        repo_dir: Destination directory for the working tree.

	    Returns:
	        ``(True, None)`` on success, otherwise ``(False, message)`` where
	        *message* is git's stderr output, or the OS error text when the
	        ``git`` executable could not be started.
	    """
	    env = os.environ.copy()
	    # Check out LFS pointer files only instead of downloading the payloads.
	    env['GIT_LFS_SKIP_SMUDGE'] = '1'
	    # Never wait for interactive credentials; fail fast instead of hanging.
	    env['GIT_TERMINAL_PROMPT'] = '0'

	    # "--" ends option parsing, so a url starting with "-" cannot be read
	    # by git as a command-line option.
	    command = ["git", "clone", "--", url, repo_dir]
	    try:
	        result = subprocess.run(command, env=env, capture_output=True, text=True)
	    except OSError as exc:
	        # git missing or not executable: report it like any other failure.
	        return False, str(exc)

	    if result.returncode != 0:
	        return False, result.stderr
	    return True, None

	def get_file_summary(file_path, file_type):
	    """Build the header record for one file.

	    Returns a dict with the path relative to the current working directory
	    (``name``), the supplied *file_type* (``type``) and the size in bytes
	    (``size``).
	    """
	    byte_count = os.stat(file_path).st_size
	    return dict(name=os.path.relpath(file_path), type=file_type, size=byte_count)

	def read_file_content(file_path):
	    """Return the full text of *file_path*, decoded as UTF-8.

	    Bytes that are not valid UTF-8 are dropped instead of raising.
	    """
	    with open(file_path, mode="r", encoding="utf-8", errors="ignore") as handle:
	        text = handle.read()
	    return text

	def validate_file_types(directory):
	    """Detect the content type of every file under *directory* with Magika.

	    Directories named exactly ``.git`` are skipped; look-alikes such as
	    ``.github`` are scanned. Symbolic links are recorded but never opened.

	    Returns:
	        dict mapping each file path to the detected ``ct_label``, to
	        ``"symlink"`` for symbolic links, or to ``"Error: <message>"`` when
	        the file could not be read or identified.
	    """
	    m = Magika()
	    file_types = {}
	    for root, dirs, files in os.walk(directory):
	        # Prune in place so os.walk never descends into git metadata. A
	        # substring test on the path would also drop ".github" and every
	        # path that merely contains ".git".
	        dirs[:] = [name for name in dirs if name != '.git']
	        for file_name in files:
	            file_path = os.path.join(root, file_name)
	            if os.path.islink(file_path):
	                # The tree comes from an untrusted clone: a link may point at
	                # a device or a file outside the repository, so do not read it.
	                file_types[file_path] = "symlink"
	                continue
	            try:
	                with open(file_path, 'rb') as file:
	                    file_bytes = file.read()
	                result = m.identify_bytes(file_bytes)
	                file_types[file_path] = result.output.ct_label
	            except Exception as e:
	                # Best effort: one unreadable file must not abort the scan.
	                file_types[file_path] = f"Error: {str(e)}"
	    return file_types

	def extract_repo_content(url):
	    """Clone the repository at *url* and collect one entry per file.

	    Args:
	        url: Repository URL; must pass ``validate_url``.

	    Returns:
	        A list of dicts, each with a ``"header"`` (``name``, ``type``,
	        ``size``) and a ``"content"`` string. Content is captured only for
	        regular files of a supported type that are at most 1 MiB; other
	        files get an explanatory placeholder. On an invalid URL or a failed
	        clone the list holds a single error entry.
	    """
	    if not validate_url(url):
	        return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": "Invalid URL"}]

	    max_content_bytes = 1024 * 1024
	    extracted_content = []

	    # A private scratch directory per call: concurrent requests cannot
	    # clobber each other's clone, and the context manager removes it on
	    # every exit path, including exceptions.
	    with tempfile.TemporaryDirectory() as work_dir:
	        repo_dir = os.path.join(work_dir, "temp_repo")

	        success, error = clone_repo(url, repo_dir)
	        if not success:
	            return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": f"Failed to clone repository: {error}"}]

	        file_types = validate_file_types(repo_dir)
	        for file_path, file_type in file_types.items():
	            try:
	                file_summary = get_file_summary(file_path, file_type)
	            except OSError:
	                # e.g. a dangling symlink: still list the file, with size 0.
	                file_summary = {"name": file_path, "type": file_type, "size": 0}
	            # Report names as "temp_repo/<path inside repo>" regardless of
	            # where the scratch directory lives.
	            file_summary["name"] = os.path.relpath(file_path, work_dir)
	            content = {"header": file_summary}

	            if os.path.islink(file_path):
	                # The repository is untrusted; a link may point outside the
	                # clone, so its target is never read into the report.
	                content["content"] = "Symbolic link, content not captured."
	            elif file_type in SUPPORTED_FILE_TYPES and file_summary["size"] <= max_content_bytes:
	                try:
	                    content["content"] = read_file_content(file_path)
	                except Exception as e:
	                    content["content"] = f"Failed to read file content: {str(e)}"
	            else:
	                content["content"] = "File too large or binary, content not captured."

	            extracted_content.append(content)

	    return extracted_content

	def format_output(extracted_content):
	    """Render extracted file entries as one Markdown-style text report.

	    Args:
	        extracted_content: Iterable of entries, each expected to be a dict
	            with a ``"header"`` dict (``name``, ``type``, ``size``) and a
	            ``"content"`` value.

	    Returns:
	        The concatenated report. An entry that is not shaped as described
	        contributes the line ``"Error in file data format."`` instead of
	        raising.
	    """
	    required_header_keys = ('name', 'type', 'size')
	    sections = []
	    for file_data in extracted_content:
	        header = file_data.get('header') if isinstance(file_data, dict) else None
	        well_formed = (
	            isinstance(header, dict)
	            and 'content' in file_data
	            and all(key in header for key in required_header_keys)
	        )
	        if not well_formed:
	            sections.append("Error in file data format.\n")
	            continue
	        sections.append(
	            f"### File: {header['name']}\n"
	            f"Type: {header['type']}\n"
	            f"Size: {header['size']} bytes\n"
	            "#### Content:\n"
	            f"```\n{file_data['content']}\n```\n\n"
	        )
	    # Join once instead of growing a string with += on every entry.
	    return "".join(sections)

	def extract_and_display(url):
	    """Extract the repository at *url* and return its contents as formatted text."""
	    return format_output(extract_repo_content(url))

	# Assemble the UI: a URL box, a read-only output box and a trigger button.
	with gr.Blocks() as app:
	    gr.Markdown("# Gradio Space/Model Content Extractor")
	    url_input = gr.Textbox(label="Hugging Face Space/Model URL")
	    output_display = gr.Textbox(
	        show_copy_button=True,
	        lines=20,
	        placeholder="Output will be displayed here...",
	    )
	    extract_button = gr.Button("Extract Content")

	    # Clicking the button sends the URL through extract_and_display and
	    # shows the returned report in the output box.
	    extract_button.click(fn=extract_and_display, inputs=url_input, outputs=output_display)

	app.launch()