import os
import re
from typing import List, Tuple, Optional, Dict
import logging
import threading
# from . import to_translate
import datetime
from . import common, to_translate
import time
import fitz  # PyMuPDF
import shapely.geometry as sg
from shapely.geometry.base import BaseGeometry
from shapely.validation import explain_validity
import markdown
import pdfkit
import codecs
# from weasyprint import HTML
from pymdownx import superfences
from bs4 import BeautifulSoup
from PIL import Image

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# This Default Prompt Using Chinese and could be changed to other languages.

DEFAULT_PROMPT = """使用markdown语法，将图片中识别到的文字转换为markdown格式输出。你必须做到：
1. 输出和使用识别到的图片的相同的语言，例如，识别到英语的字段，输出的内容必须是英语。
2. 不要解释和输出无关的文字，直接输出图片中的内容。例如，严禁输出 “以下是我根据图片内容生成的markdown文本：”这样的例子，而是应该直接输出markdown。
3. 内容不要包含在```markdown ```中、段落公式使用 $$ $$ 的形式、行内公式使用 $ $ 的形式、忽略掉长直线、忽略掉页码。
再次强调，不要解释和输出无关的文字，直接输出图片中的内容。
"""
DEFAULT_RECT_PROMPT = """图片中用红色框和名称(%s)标注出了一些区域。如果区域是表格或者图片，使用 ![]() 的形式插入到输出内容中，否则直接输出文字内容。
"""
DEFAULT_ROLE_PROMPT = """你是一个PDF文档解析器，使用markdown和latex语法输出图片的内容。
"""


def _is_near(rect1: BaseGeometry, rect2: BaseGeometry, distance: float = 20) -> bool:
    """
    Check if two rectangles are near each other if the distance between them is less than the target.
    """
    return rect1.buffer(0.1).distance(rect2.buffer(0.1)) < distance


def _is_horizontal_near(rect1: BaseGeometry, rect2: BaseGeometry, distance: float = 100) -> bool:
    """
    Check if two rectangles are near horizontally if one of them is a horizontal line.
    """
    result = False
    if abs(rect1.bounds[3] - rect1.bounds[1]) < 0.1 or abs(rect2.bounds[3] - rect2.bounds[1]) < 0.1:
        if abs(rect1.bounds[0] - rect2.bounds[0]) < 0.1 and abs(rect1.bounds[2] - rect2.bounds[2]) < 0.1:
            result = abs(rect1.bounds[3] - rect2.bounds[3]) < distance
    return result


def _union_rects(rect1: BaseGeometry, rect2: BaseGeometry) -> BaseGeometry:
    """
    Union two rectangles.
    """
    return sg.box(*(rect1.union(rect2).bounds))


def _merge_rects(rect_list: List[BaseGeometry], distance: float = 20, horizontal_distance: Optional[float] = None) -> \
        List[BaseGeometry]:
    """
    Merge rectangles in the list if the distance between them is less than the target.
    """
    merged = True
    while merged:
        merged = False
        new_rect_list = []
        while rect_list:
            rect = rect_list.pop(0)
            for other_rect in rect_list:
                if _is_near(rect, other_rect, distance) or (
                        horizontal_distance and _is_horizontal_near(rect, other_rect, horizontal_distance)):
                    rect = _union_rects(rect, other_rect)
                    rect_list.remove(other_rect)
                    merged = True
            new_rect_list.append(rect)
        rect_list = new_rect_list
    return rect_list


def _adsorb_rects_to_rects(source_rects: List[BaseGeometry], target_rects: List[BaseGeometry], distance: float = 10) -> \
        Tuple[List[BaseGeometry], List[BaseGeometry]]:
    """
    Adsorb a set of rectangles to another set of rectangles.
    """
    new_source_rects = []
    for text_area_rect in source_rects:
        adsorbed = False
        for index, rect in enumerate(target_rects):
            if _is_near(text_area_rect, rect, distance):
                rect = _union_rects(text_area_rect, rect)
                target_rects[index] = rect
                adsorbed = True
                break
        if not adsorbed:
            new_source_rects.append(text_area_rect)
    return new_source_rects, target_rects


def _parse_rects(page: fitz.Page) -> List[Tuple[float, float, float, float]]:
    """
    Parse drawings in the page and merge adjacent rectangles.
    """

    # 提取画的内容
    drawings = page.get_drawings()

    # 忽略掉长度小于30的水平直线
    is_short_line = lambda x: abs(x['rect'][3] - x['rect'][1]) < 1 and abs(x['rect'][2] - x['rect'][0]) < 30
    drawings = [drawing for drawing in drawings if not is_short_line(drawing)]

    # 转换为shapely的矩形
    rect_list = [sg.box(*drawing['rect']) for drawing in drawings]

    # 提取图片区域
    images = page.get_image_info()
    image_rects = [sg.box(*image['bbox']) for image in images]

    # 合并drawings和images
    rect_list += image_rects

    merged_rects = _merge_rects(rect_list, distance=10, horizontal_distance=100)
    merged_rects = [rect for rect in merged_rects if explain_validity(rect) == 'Valid Geometry']

    # 将大文本区域和小文本区域分开处理: 大文本相小合并，小文本靠近合并
    is_large_content = lambda x: (len(x[4]) / max(1, len(x[4].split('\n')))) > 5
    small_text_area_rects = [sg.box(*x[:4]) for x in page.get_text('blocks') if not is_large_content(x)]
    large_text_area_rects = [sg.box(*x[:4]) for x in page.get_text('blocks') if is_large_content(x)]
    _, merged_rects = _adsorb_rects_to_rects(large_text_area_rects, merged_rects, distance=0.1) # 完全相交
    _, merged_rects = _adsorb_rects_to_rects(small_text_area_rects, merged_rects, distance=5) # 靠近

    # 再次自身合并
    merged_rects = _merge_rects(merged_rects, distance=10)

    # 过滤比较小的矩形
    merged_rects = [rect for rect in merged_rects if rect.bounds[2] - rect.bounds[0] > 20 and rect.bounds[3] - rect.bounds[1] > 20]

    return [rect.bounds for rect in merged_rects]


def _parse_pdf_to_images(pdf_path: str, output_dir: str = './') -> List[Tuple[str, List[str]]]:
    """
    Parse PDF to images and save to output_dir.
    """
    # 打开PDF文件
    pdf_document = fitz.open(pdf_path)
    image_infos = []

    for page_index, page in enumerate(pdf_document):
        logging.info(f'parse page: {page_index}')
        rect_images = []
        rects = _parse_rects(page)
        for index, rect in enumerate(rects):
            fitz_rect = fitz.Rect(rect)
            # 保存页面为图片
            pix = page.get_pixmap(clip=fitz_rect, matrix=fitz.Matrix(4, 4))
            name = f'{page_index}_{index}.png'
            pix.save(os.path.join(output_dir, name))
            rect_images.append(name)
            # # 在页面上绘制红色矩形
            big_fitz_rect = fitz.Rect(fitz_rect.x0 - 1, fitz_rect.y0 - 1, fitz_rect.x1 + 1, fitz_rect.y1 + 1)
            # 空心矩形
            page.draw_rect(big_fitz_rect, color=(1, 0, 0), width=1)
            # 画矩形区域(实心)
            # page.draw_rect(big_fitz_rect, color=(1, 0, 0), fill=(1, 0, 0))
            # 在矩形内的左上角写上矩形的索引name，添加一些偏移量
            text_x = fitz_rect.x0 + 2
            text_y = fitz_rect.y0 + 10
            text_rect = fitz.Rect(text_x, text_y - 9, text_x + 80, text_y + 2)
            # 绘制白色背景矩形
            page.draw_rect(text_rect, color=(1, 1, 1), fill=(1, 1, 1))
            # 插入带有白色背景的文字
            page.insert_text((text_x, text_y), name, fontsize=10, color=(1, 0, 0))
        page_image_with_rects = page.get_pixmap(matrix=fitz.Matrix(3, 3))
        page_image = os.path.join(output_dir, f'{page_index}.png')
        page_compress_image = os.path.join(output_dir, f'{page_index}-compress.png')
        page_image_with_rects.save(page_image)
        compress_image(page_image,page_compress_image)
        # image_infos.append((page_image, rect_images))
        image_infos.append({'text': page_image,'type':'pdf_img', 'complete': False, 'content': ''})

    pdf_document.close()
    return image_infos


def _gpt_parse_images(
        image_infos: List[Tuple[str, List[str]]],
        prompt_dict: Optional[Dict] = None,
        **args
) -> str:
    """
    Parse images to markdown content.
    """
    if isinstance(prompt_dict, dict) and 'prompt' in prompt_dict:
        prompt = prompt_dict['prompt']
        logging.info("prompt is provided, using user prompt.")
    else:
        prompt = DEFAULT_PROMPT
        logging.info("prompt is not provided, using default prompt.")
    if isinstance(prompt_dict, dict) and 'rect_prompt' in prompt_dict:
        rect_prompt = prompt_dict['rect_prompt']
        logging.info("rect_prompt is provided, using user prompt.")
    else:
        rect_prompt = DEFAULT_RECT_PROMPT
        logging.info("rect_prompt is not provided, using default prompt.")
    if isinstance(prompt_dict, dict) and 'role_prompt' in prompt_dict:
        role_prompt = prompt_dict['role_prompt']
        logging.info("role_prompt is provided, using user prompt.")
    else:
        role_prompt = DEFAULT_ROLE_PROMPT
        logging.info("role_prompt is not provided, using default prompt.")

    for image_index,image_info in enumerate(image_infos):
        user_prompt = prompt
        # if rect_images:
        #     user_prompt += rect_prompt + ', '.join(rect_images)
        image_infos[image_index]['user_prompt']=user_prompt



    # output_path = os.path.join(output_dir, 'output.md')
    # with open(output_path, 'w', encoding='utf-8') as f:
    #     f.write('\n\n'.join(contents))

    # return '\n\n'.join(contents)

def start(trans):
    # 从 trans 中获取文件路径和输出目录
    pdf_path = trans['file_path']
    output_dir = trans['target_path_dir']

    # 允许的最大线程
    threads = trans.get('threads', 10)
    max_threads = max(1, int(threads))

    # 当前执行的索引位置
    run_index = 0
    start_time = datetime.datetime.now()

    # 解析 PDF 文件
    image_infos = _parse_pdf_to_images(pdf_path, output_dir=output_dir)

    _gpt_parse_images(
        image_infos=image_infos,
        prompt_dict=None,
    )

    trans['role_prompt']=DEFAULT_ROLE_PROMPT

    # 使用 threading 方式处理
    max_run = min(max_threads, len(image_infos))
    before_active_count = threading.activeCount()
    event = threading.Event()

    while run_index <= len(image_infos) - 1:
        if threading.activeCount() < max_run + before_active_count:
            if not event.is_set():
                thread = threading.Thread(target=to_translate.get, args=(trans, event, image_infos, run_index))
                thread.start()
                run_index += 1
            else:
                return False

    while True:
        complete = True
        for image_info in image_infos:
            if not image_info['complete']:
                complete = False
        if complete:
            break
        else:
            time.sleep(1)

    # print(image_infos)
    # 处理完成后，写入结果
    try:
        # c = canvas.Canvas(trans['target_file'], pagesize=letter)
        # text = c.beginText(40, 750)  # 设置文本开始的位置
        # text.setFont("Helvetica", 12)  # 设置字体和大小
        md_file = os.path.join(output_dir, 'output.md')
        with open(md_file, 'w', encoding='utf-8') as file:
            for image_info in image_infos:
            # text.textLine(image_info['text'])  # 添加文本行
            # text.textLine("")  # 添加空行作为分隔
            # write_pdf(c, image_info['text']);
                file.write(image_info['text'] + '\n')
        # write_to_pdf(md_file, trans['target_file'])
        html_to_pdf(output_dir, md_file, trans['target_file'])
        # c.save()  # 保存 PDF 文件
    except Exception as e:
        print(f"生成pdf失败： {md_file}: {e}")
        return False

    end_time = datetime.datetime.now()
    spend_time = common.display_spend(start_time, end_time)
    # translate.complete(trans, len(image_infos), spend_time)
    return True

def compress_image(image_file,compress_image_file):
    img=Image.open(image_file)
    img_resized=img.resize((img.width//2, img.height//2), resample=Image.Resampling.NEAREST)
    img_resized.save(compress_image_file,quality=30)


def html_to_pdf(output_dir, md_file, pdf_file):
    extensions = [
        'toc',  # 目录，[toc]
        'extra',  # 缩写词、属性列表、释义列表、围栏式代码块、脚注、在HTML的Markdown、表格
    ]
    third_party_extensions = [
        'mdx_math',  # KaTeX数学公式，$E=mc^2$和$$E=mc^2$$
        'markdown_checklist.extension',  # checklist，- [ ]和- [x]
        'pymdownx.magiclink',  # 自动转超链接，
        'pymdownx.caret',  # 上标下标，
        'pymdownx.superfences',  # 多种块功能允许嵌套，各种图表
        'pymdownx.betterem',  # 改善强调的处理(粗体和斜体)
        'pymdownx.mark',  # 亮色突出文本
        'pymdownx.highlight',  # 高亮显示代码
        'pymdownx.tasklist',  # 任务列表
        'pymdownx.tilde',  # 删除线
    ]
    extensions.extend(third_party_extensions)
    extension_configs = {
        'mdx_math': {
            'enable_dollar_delimiter': True  # 允许单个$
        },
        'pymdownx.superfences': {
            "custom_fences": [
                {
                    'name': 'mermaid',  # 开启流程图等图
                    'class': 'mermaid',
                    'format': superfences.fence_div_format
                }
            ]
        },
        'pymdownx.highlight': {
            'linenums': True,  # 显示行号
            'linenums_style': 'pymdownx-inline'  # 代码和行号分开
        },
        'pymdownx.tasklist': {
            'clickable_checkbox': True,  # 任务列表可点击
        }
    }
    with codecs.open(md_file, "r", encoding="utf-8") as f:
        md_content = f.read()
    
    html_file = os.path.join(output_dir, 'output.html')
    html_final_file = os.path.join(output_dir, 'output-final.html')
    html_content = markdown.markdown(md_content, extensions=extensions, extension_configs=extension_configs)
    with codecs.open(html_file, "w", encoding="utf-8") as f:
        # 加入文件头防止中文乱码
        f.write('<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>')
        f.write('<script type="text/javascript" src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-MML-AM_SVG"></script>')
        f.write(html_content)

     
    # 优化html中的图片信息
    with codecs.open(html_file, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, features="lxml")
        image_content = soup.find_all("img")
        for i in image_content:
            i["style"] = "max-width:100%; overflow:hidden;"
        with codecs.open(html_final_file, "w", encoding="utf-8") as g:
            g.write(soup.prettify())
     
    pdfkit.from_file(html_final_file, pdf_file)