import os

from magika import Magika
from transformers import pipeline

# Initialize the summarization pipeline
summarizer = pipeline("summarization")

SUPPORTED_FILE_TYPES = ["txt", "shell", "python", "markdown", "yaml", "json", "csv", "tsv", "xml", "html", "ini", "jsonl", "ipynb"]


def validate_file_types(directory):
    """Walk the directory tree and detect each file's content type with Magika."""
    m = Magika()
    file_types = {}
    for root, _, files in os.walk(directory):
        if '.git' in root:
            continue
        for file_name in files:
            file_path = os.path.join(root, file_name)
            try:
                with open(file_path, 'rb') as file:
                    file_bytes = file.read()
                result = m.identify_bytes(file_bytes)
                file_types[file_path] = result.output.ct_label
            except Exception as e:
                file_types[file_path] = f"Error: {str(e)}"
    return file_types


def get_file_summary(file_path, file_type):
    """Return basic metadata (relative name, type, size, timestamps) for a file."""
    size = os.path.getsize(file_path)
    return {
        "name": os.path.relpath(file_path),
        "type": file_type,
        "size": size,
        "creation_date": os.path.getctime(file_path),
        "modification_date": os.path.getmtime(file_path)
    }


def read_file_content(file_path, max_size=32 * 1024):
    """Read a file as UTF-8 text, truncating anything beyond max_size characters."""
    with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
        content = file.read()
    if len(content) > max_size:
        return content[:max_size] + "\n... [Content Truncated] ..."
    else:
        return content


def summarize_content(content):
    """Summarize content chunk by chunk and join the partial summaries."""
    max_chunk_size = 1000  # max input size for the summarization model
    chunks = [content[i:i + max_chunk_size] for i in range(0, len(content), max_chunk_size)]
    summaries = [summarizer(chunk)[0]['summary_text'] for chunk in chunks]
    return " ".join(summaries)
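

# Illustrative usage sketch, not part of the functions above: it shows one way the
# pieces might be composed, detecting file types first and then summarizing only the
# supported ones. The "./repo" path is a hypothetical example directory.
if __name__ == "__main__":
    detected = validate_file_types("./repo")  # hypothetical target directory
    for path, file_type in detected.items():
        if file_type not in SUPPORTED_FILE_TYPES:
            continue  # skip unsupported types and entries recorded as errors
        summary = get_file_summary(path, file_type)
        summary["content_summary"] = summarize_content(read_file_content(path))
        print(summary["name"], summary["type"], summary["size"])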