import os

from magika import Magika
from transformers import pipeline

# Initialize the summarization pipeline
summarizer = pipeline("summarization")

SUPPORTED_FILE_TYPES = ["txt", "shell", "python", "markdown", "yaml", "json", "csv", "tsv", "xml", "html", "ini", "jsonl", "ipynb"]


def validate_file_types(directory):
    """Walk the directory tree and detect each file's content type with Magika."""
    m = Magika()
    file_types = {}
    for root, _, files in os.walk(directory):
        if '.git' in root:
            continue
        for file_name in files:
            file_path = os.path.join(root, file_name)
            try:
                with open(file_path, 'rb') as file:
                    file_bytes = file.read()
                result = m.identify_bytes(file_bytes)
                file_types[file_path] = result.output.ct_label
            except Exception as e:
                file_types[file_path] = f"Error: {str(e)}"
    return file_types


def get_file_summary(file_path, file_type):
    """Return basic metadata (relative name, type, size, timestamps) for a file."""
    size = os.path.getsize(file_path)
    return {
        "name": os.path.relpath(file_path),
        "type": file_type,
        "size": size,
        "creation_date": os.path.getctime(file_path),
        "modification_date": os.path.getmtime(file_path)
    }


def read_file_content(file_path, max_size=32 * 1024):
    """Read a file as UTF-8 text, truncating anything beyond max_size characters."""
    with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
        content = file.read()
    if len(content) > max_size:
        return content[:max_size] + "\n... [Content Truncated] ..."
    else:
        return content


def summarize_content(content):
    """Summarize content chunk by chunk and join the partial summaries."""
    max_chunk_size = 1000  # max input size for the summarization model
    chunks = [content[i:i + max_chunk_size] for i in range(0, len(content), max_chunk_size)]
    summaries = [summarizer(chunk)[0]['summary_text'] for chunk in chunks]
    return " ".join(summaries)
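

# Illustrative usage sketch, not part of the functions above: it shows one way the
# pieces might be composed, detecting file types first and then summarizing only the
# supported ones. The "./repo" path is a hypothetical example directory.
if __name__ == "__main__":
    detected = validate_file_types("./repo")  # hypothetical target directory
    for path, file_type in detected.items():
        if file_type not in SUPPORTED_FILE_TYPES:
            continue  # skip unsupported types and entries recorded as errors
        summary = get_file_summary(path, file_type)
        summary["content_summary"] = summarize_content(read_file_content(path))
        print(summary["name"], summary["type"], summary["size"])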