"""Phonemize transcripts listed in manifest files and build a phoneme vocabulary.

Reads tab-separated manifests, phonemizes each utterance's transcript with
`TextTokenizer`, writes one phoneme file per utterance, and saves the set of
all phonemes seen to `vocab.txt`.
"""
import sys
import os
import json
import glob
from multiprocessing import Pool  # used by the optional parallel path in main()

import tqdm
import fire

sys.path.insert(0, "../../")  # make the repo root importable for data.tokenizer
from data.tokenizer import TextTokenizer, tokenize_text


def write_jsonl(data, fn):
    """Writes a list of dicts to `fn`, one JSON object per line."""
    with open(fn, "w") as file:
        for entry in data:
            file.write(json.dumps(entry, ensure_ascii=False) + "\n")


def read_jsonl(file_path):
    """Reads a JSONL file into a list of dicts."""
    cur_data = []
    with open(file_path, "r", encoding="utf-8-sig") as file:
        for line in file:
            cur_data.append(json.loads(line.strip()))
    return cur_data


def phonemize_and_save(text, fn, text_tokenizer):
    """Phonemizes the text and saves the result to a file."""
    phn = tokenize_text(text_tokenizer, text)
    os.makedirs(os.path.dirname(fn), exist_ok=True)
    with open(fn, "w") as f:
        f.write(" ".join(phn))
    return set(phn)


def process_item(
    item,
    root,
    sub_root,
    audio_folder,
    phn_folder,
    audio_ext,
    text_ext,
    phn_ext,
    text_tokenizer,
):
    """Worker function to process a single manifest item."""
    text_path = os.path.join(
        root, sub_root, audio_folder, item[0].replace(audio_ext, text_ext)
    )
    if not os.path.exists(text_path):
        return {"missing_text": text_path, "success": False, "cur_phn_set": set()}
    with open(text_path, "r") as f:
        text = [line.strip() for line in f.readlines()]
    text = " ".join(text)
    phn_path = os.path.join(
        root, sub_root, phn_folder, item[0].replace(audio_ext, phn_ext)
    )
    cur_phn_set = phonemize_and_save(text, phn_path, text_tokenizer)
    return {"missing_text": None, "success": True, "cur_phn_set": cur_phn_set}


def process_item_star(args):
    """Unpacks arguments for `process_item` to work with `imap`."""
    return process_item(*args)
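
# On-disk layout assumed by the path handling above; inferred from the code,
# with illustrative filenames (the real dataset may use different names and
# may carry extra tab-separated columns, which are ignored):
#
#   {root}/{sub_root}/{manifest_folder}/xxx.txt
#       one tab-separated row per utterance; column 0 is the audio path
#       relative to the audio folder, e.g. "spk1/utt42.mp3\t<extra cols>"
#   {root}/{sub_root}/{audio_folder}/spk1/utt42.txt   transcript read here
#   {root}/{sub_root}/{phn_folder}/spk1/utt42.txt     phoneme file written here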
def main(
    root="/data/scratch/pyp/datasets/emilia",
    sub_root="preprocessed",
    manifest_folder="manifest_for_codec",
    audio_folder="audio",
    phn_folder="phoneme",
    audio_ext=".mp3",
    text_ext=".txt",
    phn_ext=".txt",
    num_workers=8,
):
    """Main function to process phoneme generation."""
    # Initialize the tokenizer
    text_tokenizer = TextTokenizer()

    # Read all manifest files (ending with .txt)
    all_fns = glob.glob(f"{root}/{sub_root}/{manifest_folder}/*.txt")
    print(f"found {len(all_fns)} manifest files")
    print(f"{all_fns[:3]=}")
    data = []
    for fn in all_fns:
        with open(fn, "r") as f:
            data += [line.strip().split("\t") for line in f]

    vocab = set()

    ################## parallel processing ##################
    # # Prepare arguments for the worker function
    # tasks = [
    #     (
    #         item,
    #         root,
    #         sub_root,
    #         audio_folder,
    #         phn_folder,
    #         audio_ext,
    #         text_ext,
    #         phn_ext,
    #         text_tokenizer,
    #     )
    #     for item in data
    # ]
    # # Parallel processing with progress monitoring
    # results = []
    # with Pool(num_workers) as pool:
    #     for result in tqdm.tqdm(
    #         pool.imap_unordered(process_item_star, tasks),
    #         total=len(tasks),
    #         desc="Processing items",
    #     ):
    #         results.append(result)
    # missing_text = [result["missing_text"] for result in results if not result["success"]]
    # for result in results:
    #     if result["success"]:
    #         vocab.update(result["cur_phn_set"])
    ################## parallel processing ##################

    ################## sequential processing ##################
    missing_text = []
    for item in tqdm.tqdm(data):
        text_path = os.path.join(
            root, sub_root, audio_folder, item[0].replace(audio_ext, text_ext)
        )
        if not os.path.exists(text_path):
            missing_text.append(text_path)
            continue
        try:
            with open(text_path, "r") as f:
                text = [line.strip() for line in f.readlines()]
            text = " ".join(text)
        except Exception as e:
            print(f"Error reading {text_path}: {e}")
            continue
        cur_phn_set = phonemize_and_save(
            text,
            os.path.join(
                root, sub_root, phn_folder, item[0].replace(audio_ext, phn_ext)
            ),
            text_tokenizer,
        )
        vocab.update(cur_phn_set)
    ################## sequential processing ##################

    # Save the sorted phoneme vocabulary
    vocab = sorted(vocab)
    with open(os.path.join(root, sub_root, "vocab.txt"), "w") as f:
        f.write("\n".join(vocab))

    # Report missing text files
    print(f"Missing text files: {len(missing_text)}")
    if missing_text:
        # Print the first 10 missing files as an example
        print("Some missing files:", missing_text[:10])


if __name__ == "__main__":
    fire.Fire(main)
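
# Example invocation (the script name is a placeholder; python-fire maps
# main()'s keyword arguments onto CLI flags, and the paths shown are
# illustrative stand-ins for your own dataset location):
#
#   python gen_phonemes.py \
#       --root=/path/to/emilia \
#       --sub_root=preprocessed \
#       --num_workers=8
#
# num_workers only takes effect if the parallel path in main() is re-enabled.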