"""Bloom's Taxonomy Based Exam Paper Developer.

Streamlit app: upload a syllabus (PDF / DOCX / TXT / image), extract its text
(including OCR on embedded images), then generate exam questions and answers
with a Groq-hosted LLM via LangChain.
"""

import os
import logging
from io import BytesIO

import streamlit as st
import pytesseract
import pdfplumber
import docx
from PIL import Image
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# Load environment variables (expects GROQ_API_KEY in .env)
load_dotenv()

# Initialize logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Initialize LLM.
# SECURITY FIX: the API key was previously hard-coded in source. Read it from
# the environment instead so the secret never ships with the code — rotate the
# old key, it must be considered compromised.
llm = ChatGroq(
    temperature=0.5,
    groq_api_key=os.getenv("GROQ_API_KEY"),
    model_name="llama3-8b-8192",
)

# OCR Configuration for Pytesseract
pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract"  # Adjust to your system's path


def extract_text_from_images(images, lang="eng"):
    """Run Tesseract OCR over a list of PIL images and return the combined text.

    Args:
        images: iterable of PIL.Image objects.
        lang: Tesseract language code (e.g. "eng", "spa").

    Returns:
        OCR text, one image per line, stripped. Images that fail OCR are
        logged and skipped rather than aborting the whole batch.
    """
    ocr_text = ""
    for image in images:
        try:
            ocr_text += pytesseract.image_to_string(image, lang=lang).strip() + "\n"
        except Exception as e:
            logging.error(f"Error in OCR: {e}")
    return ocr_text.strip()


def extract_pdf_data(pdf_path):
    """Extract text, tables and embedded images from a PDF.

    Args:
        pdf_path: path or file-like object accepted by pdfplumber.open().

    Returns:
        dict with keys "text" (str), "tables" (list of row lists) and
        "images" (list of PIL.Image). On error an empty/partial dict is
        returned and the error is logged.
    """
    data = {"text": "", "tables": [], "images": []}
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # Separate pages with a newline so words at page breaks
                # don't run together.
                data["text"] += (page.extract_text() or "") + "\n"
                for table in page.extract_tables():
                    data["tables"].append(table)
                for image in page.images:
                    # BUG FIX: pdfplumber has no pdf.extract_image() (that is
                    # PyMuPDF's API). Decode each image's raw content stream
                    # instead, best-effort: some PDF filters produce bytes
                    # PIL cannot decode, so skip those rather than aborting
                    # the whole extraction.
                    try:
                        raw = image["stream"].get_data()
                        data["images"].append(Image.open(BytesIO(raw)))
                    except Exception as img_err:
                        logging.warning(f"Skipping undecodable PDF image: {img_err}")
    except Exception as e:
        logging.error(f"Error processing PDF: {e}")
    return data


def extract_docx_data(docx_file):
    """Return the non-empty paragraph text of a DOCX file, newline-joined.

    Returns "" (and logs) on failure.
    """
    try:
        doc = docx.Document(docx_file)
        return "\n".join(para.text.strip() for para in doc.paragraphs if para.text.strip())
    except Exception as e:
        logging.error(f"Error extracting DOCX content: {e}")
        return ""


def extract_text_file_data(text_file):
    """Decode an uploaded plain-text file as UTF-8 and return it stripped.

    Returns "" (and logs) on failure.
    """
    try:
        return text_file.read().decode("utf-8").strip()
    except Exception as e:
        logging.error(f"Error extracting TXT content: {e}")
        return ""


def process_content(file_data, file_type, lang="eng"):
    """Dispatch extraction by file type and append OCR text from any images.

    Args:
        file_data: uploaded file object (or path for PDFs).
        file_type: lowercase extension: "pdf", "docx", "txt", "png", "jpg"
            or "jpeg".
        lang: OCR language code forwarded to Tesseract.

    Returns:
        Extracted text plus OCR text, joined by a newline.
    """
    text = ""
    images = []
    if file_type == "pdf":
        pdf_data = extract_pdf_data(file_data)
        text = process_pdf_content(pdf_data)
        images = pdf_data["images"]
    elif file_type == "docx":
        text = extract_docx_data(file_data)
    elif file_type == "txt":
        text = extract_text_file_data(file_data)
    elif file_type in ["png", "jpg", "jpeg"]:
        images.append(Image.open(file_data))
    ocr_text = extract_text_from_images(images, lang)
    return text + "\n" + ocr_text


def process_pdf_content(pdf_data):
    """Flatten extract_pdf_data() output into one text blob.

    Combines page text, OCR of embedded images, and tables rendered as
    pipe-separated rows.
    """
    ocr_text = extract_text_from_images(pdf_data["images"])
    combined_text = pdf_data["text"] + ocr_text
    table_text = ""
    for table in pdf_data["tables"]:
        table_rows = [" | ".join(str(cell) if cell else "" for cell in row) for row in table]
        table_text += "\n".join(table_rows) + "\n"
    return (combined_text + "\n" + table_text).strip()


def generate_questions(question_type, subject_name, instructor, class_name,
                       institution, syllabus_context, num_questions, difficulty_level):
    """Ask the LLM for exam questions derived from the syllabus.

    Args:
        question_type: e.g. "MCQs", "Short Questions".
        subject_name / instructor / class_name / institution: header metadata.
        syllabus_context: extracted syllabus text the questions must be based on.
        num_questions: how many questions to request.
        difficulty_level: dict mapping Bloom levels ("Remember", ... "Create")
            to counts; missing keys default to 0.

    Returns:
        The raw LLM response string, or "" on error (logged).
    """
    # BUG FIX: the original f-string-interpolated the syllabus text directly
    # into the template, so any literal '{' or '}' in the syllabus made
    # ChatPromptTemplate raise a KeyError at invoke time. All dynamic text is
    # now passed as proper template variables.
    prompt_template = """
    Based on the following syllabus content, generate {num_questions} {question_type} questions.
    Ensure the questions are directly derived from the provided syllabus content.
    Subject: {subject_name}
    Instructor: {instructor}
    Class: {class_name}
    Institution: {institution}
    Syllabus Content: {syllabus_context}
    Difficulty Levels:
    - Remember: {remember}
    - Understand: {understand}
    - Apply: {apply}
    - Analyze: {analyze}
    - Evaluate: {evaluate}
    - Create: {create}
    Format questions as follows:
    Q1. ________________
    Q2. ________________
    ...
    """
    chain = ChatPromptTemplate.from_template(prompt_template) | llm | StrOutputParser()
    try:
        return chain.invoke({
            "num_questions": num_questions,
            "question_type": question_type,
            "subject_name": subject_name,
            "instructor": instructor,
            "class_name": class_name,
            "institution": institution,
            "syllabus_context": syllabus_context,
            "remember": difficulty_level.get("Remember", 0),
            "understand": difficulty_level.get("Understand", 0),
            "apply": difficulty_level.get("Apply", 0),
            "analyze": difficulty_level.get("Analyze", 0),
            "evaluate": difficulty_level.get("Evaluate", 0),
            "create": difficulty_level.get("Create", 0),
        })
    except Exception as e:
        logging.error(f"Error generating {question_type} questions: {e}")
        return ""


def generate_answers(questions, syllabus_context):
    """Ask the LLM for answers to previously generated questions.

    Answers are constrained to the given syllabus content. Returns the raw
    LLM response string, or "" on error (logged).
    """
    # Same template-variable fix as generate_questions: never interpolate
    # untrusted text into the template string itself.
    prompt = """
    Based on the provided syllabus content, generate detailed answers for the following questions.
    The answers must only be based on the syllabus content.
    Syllabus Content: {syllabus_context}
    Questions: {questions}
    Format answers as follows:
    Answer 1: ________________
    Answer 2: ________________
    ...
    """
    chain = ChatPromptTemplate.from_template(prompt) | llm | StrOutputParser()
    try:
        return chain.invoke({"syllabus_context": syllabus_context, "questions": questions})
    except Exception as e:
        logging.error(f"Error generating answers: {e}")
        return ""


# ----------------------------- Streamlit app ------------------------------ #

st.title("Bloom's Taxonomy Based Exam Paper Developer")

# Sidebar Clear Data Button
if st.sidebar.button("Clear All Data"):
    st.session_state.clear()
    st.success("All data has been cleared. You can now upload a new syllabus.")

# Syllabus Upload with Automatic Clearing ("jpeg" added to match what
# process_content already supports).
uploaded_file = st.sidebar.file_uploader(
    "Upload Syllabus (PDF, DOCX, TXT, Image)",
    type=["pdf", "docx", "txt", "png", "jpg", "jpeg"],
)

# Sidebar Inputs for Subject Name, Instructor, Class, and Institution
subject_name = st.sidebar.text_input("Enter Subject Name", "Subject Name")
instructor_name = st.sidebar.text_input("Enter Instructor Name", "Instructor Name")
class_name = st.sidebar.text_input("Enter Class Name", "Class Name")
institution_name = st.sidebar.text_input("Enter Institution Name", "Institution Name")

# Language Option for OCR
ocr_lang = st.sidebar.selectbox("Select OCR Language", ["eng", "spa", "fra", "deu", "ita"])

if uploaded_file:
    # Clear session state when a new file is uploaded
    if "uploaded_filename" in st.session_state and st.session_state.uploaded_filename != uploaded_file.name:
        st.session_state.clear()
        st.success("Previous data cleared. Processing new file...")
    st.session_state.uploaded_filename = uploaded_file.name

    # BUG FIX: the file type was previously derived from the MIME type
    # (uploaded_file.type.split("/")[-1]), which yields
    # "vnd.openxmlformats-officedocument.wordprocessingml.document" for DOCX
    # and "plain" for TXT — so those uploads were always rejected. Use the
    # filename extension instead.
    file_type = uploaded_file.name.rsplit(".", 1)[-1].lower()

    # Validate file type
    if file_type not in ["pdf", "docx", "txt", "png", "jpg", "jpeg"]:
        st.error("Unsupported file type. Please upload PDF, DOCX, TXT, or image files.")
    else:
        syllabus_text = process_content(uploaded_file, file_type, lang=ocr_lang)
        st.session_state.syllabus_text = syllabus_text

# Preview of Syllabus
if "syllabus_text" in st.session_state:
    st.subheader("Syllabus Preview:")
    st.text_area("Extracted Content", st.session_state.syllabus_text[:1000], height=300)
else:
    st.warning("Please upload a syllabus to begin.")

# Question Type Selection
question_type = st.sidebar.radio(
    "Select Question Type",
    ("MCQs", "Short Questions", "Long Questions", "Fill in the Blanks", "Case Studies", "Diagram-based"),
)
difficulty_levels = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]
difficulty = {level: st.sidebar.slider(level, 0, 5, 1) for level in difficulty_levels}
num_questions = st.sidebar.number_input("Number of Questions", min_value=1, max_value=50, value=10)

if st.sidebar.button("Generate Questions"):
    if "syllabus_text" in st.session_state:
        with st.spinner(f"Generating {question_type}..."):
            syllabus_context = st.session_state.syllabus_text
            st.session_state.generated_questions = generate_questions(
                question_type, subject_name, instructor_name, class_name,
                institution_name, syllabus_context, num_questions, difficulty,
            )
            st.text_area(f"Generated {question_type}", value=st.session_state.generated_questions, height=400)
    else:
        st.error("Please upload a syllabus before generating questions.")

if st.sidebar.button("Generate Answers for Questions"):
    # ROBUSTNESS FIX: also require syllabus_text — it can be absent even when
    # questions exist (e.g. after "Clear All Data"); the original raised a
    # KeyError here.
    if "generated_questions" in st.session_state and "syllabus_text" in st.session_state:
        with st.spinner("Generating answers..."):
            syllabus_context = st.session_state.syllabus_text
            st.session_state.generated_answers = generate_answers(
                st.session_state.generated_questions, syllabus_context,
            )
            st.text_area("Generated Answers", value=st.session_state.generated_answers, height=400)
    else:
        st.error("Generate questions first before generating answers.")

if "generated_questions" in st.session_state:
    st.sidebar.download_button(
        label="Download Questions",
        data=st.session_state.generated_questions,
        file_name=f"{subject_name}_questions.txt",
        mime="text/plain",
    )

if "generated_answers" in st.session_state:
    st.sidebar.download_button(
        label="Download Answers",
        data=st.session_state.generated_answers,
        file_name=f"{subject_name}_answers.txt",
        mime="text/plain",
    )

st.markdown("""
---
**Advanced Test Paper Generator** - powered by LangChain, Pinecone, and Streamlit.
""")