"""Streamlit app that generates exam questions and answers from a syllabus.

The user uploads a syllabus (PDF, DOCX or TXT) and optional supplementary
images. Text is extracted (with OCR for images), shown for review, and sent
to a Groq-hosted LLM to produce questions and, on request, answers. The
result can be downloaded as DOCX, PDF or TXT.
"""

import logging
import os
from io import BytesIO
from pathlib import Path

import docx
import pdfplumber
import pytesseract
import streamlit as st
from docx import Document
from dotenv import load_dotenv
from fpdf import FPDF
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from PIL import Image

# Load environment variables
load_dotenv()

# Initialize logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Initialize LLM
# NOTE: the API key is read from the environment (or .env) instead of being
# committed to source control. Any key that was previously hard-coded in this
# file should be treated as leaked and revoked.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    st.error("GROQ_API_KEY is not set. Add it to your environment or .env file.")
    st.stop()

llm = ChatGroq(temperature=0.5, groq_api_key=GROQ_API_KEY, model_name="llama3-8b-8192")

# OCR Configuration for Pytesseract; TESSERACT_CMD overrides the default path.
pytesseract.pytesseract.tesseract_cmd = os.getenv("TESSERACT_CMD", r"/usr/bin/tesseract")

SYLLABUS_EXTENSIONS = ("pdf", "docx", "txt")
IMAGE_EXTENSIONS = ("png", "jpg", "jpeg")

# The prompts use real template placeholders. Values are passed to invoke(),
# so braces inside the syllabus or the questions are never parsed as variables.
QUESTION_PROMPT = ChatPromptTemplate.from_template(
    """Based on the following syllabus content, generate {num_questions} {question_type} questions.
Ensure the questions are directly derived from the provided syllabus content.

Subject: {subject_name}
Instructor: {instructor}
Class: {class_name}
Institution: {institution}

Syllabus Content: {syllabus_context}

Difficulty Levels:
- Remember: {remember}
- Understand: {understand}
- Apply: {apply}
- Analyze: {analyze}
- Evaluate: {evaluate}
- Create: {create}

Format questions as follows:
Q1. ________________
Q2. ________________
...
"""
)

ANSWER_PROMPT = ChatPromptTemplate.from_template(
    """Based on the provided syllabus content, generate detailed answers for the following questions.
The answers must only be based on the syllabus content.

Syllabus Content: {syllabus_context}

Questions:
{questions}

Format answers as follows:
Answer 1: ________________
Answer 2: ________________
...
"""
)


def extract_text_from_images(images, lang="eng"):
    """Run OCR over each image and return the recognised text.

    Args:
        images: Iterable of images accepted by ``pytesseract.image_to_string``.
        lang: Tesseract language code.

    Returns:
        The per-image texts joined by newlines. Images that fail OCR are
        logged and skipped.
    """
    chunks = []
    for image in images:
        try:
            chunks.append(pytesseract.image_to_string(image, lang=lang).strip())
        except Exception as exc:  # best effort: one bad image must not abort the rest
            logger.error("Error in OCR: %s", exc)
    return "\n".join(chunks).strip()


def _open_pdf_image(image):
    """Decode one pdfplumber image entry into a PIL image, or return None."""
    try:
        return Image.open(BytesIO(image["stream"].get_data()))
    except Exception as exc:  # raw bitmaps and unsupported encodings are skipped
        logger.warning("Skipping PDF image that could not be decoded: %s", exc)
        return None


def extract_pdf_data(pdf_path):
    """Extract text, tables and embedded images from a PDF.

    Args:
        pdf_path: Path or file-like object accepted by ``pdfplumber.open``.

    Returns:
        A dict with keys ``"text"`` (page texts joined by newlines),
        ``"tables"`` (list of tables as returned by ``extract_tables``) and
        ``"images"`` (list of PIL images). On a PDF-level error the data
        collected so far is returned and the error is logged.
    """
    data = {"text": "", "tables": [], "images": []}
    page_texts = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_texts.append(page.extract_text() or "")
                data["tables"].extend(page.extract_tables())
                # Decode images one at a time so a single undecodable image
                # does not discard the remaining pages.
                for image in page.images:
                    decoded = _open_pdf_image(image)
                    if decoded is not None:
                        data["images"].append(decoded)
    except Exception as exc:
        logger.error("Error processing PDF: %s", exc)
    data["text"] = "\n".join(text for text in page_texts if text)
    return data


def extract_docx_data(docx_file):
    """Return the non-empty paragraphs of a DOCX file joined by newlines.

    Returns an empty string (and logs the error) if the file cannot be read.
    """
    try:
        document = docx.Document(docx_file)
        paragraphs = (para.text.strip() for para in document.paragraphs)
        return "\n".join(text for text in paragraphs if text)
    except Exception as exc:
        logger.error("Error extracting DOCX content: %s", exc)
        return ""


def extract_text_file_data(text_file):
    """Return the stripped UTF-8 text of an uploaded plain-text file.

    The stream is rewound first because the same upload object is read again
    on every Streamlit rerun. Returns an empty string (and logs the error) if
    reading or decoding fails.
    """
    try:
        if hasattr(text_file, "seek"):
            text_file.seek(0)
        # "utf-8-sig" also accepts plain UTF-8 and drops a leading BOM.
        return text_file.read().decode("utf-8-sig").strip()
    except Exception as exc:
        logger.error("Error extracting TXT content: %s", exc)
        return ""


def process_pdf_content(pdf_data, lang="eng"):
    """Combine PDF body text, OCR text of its images, and its tables.

    Args:
        pdf_data: Dict as produced by ``extract_pdf_data``.
        lang: Tesseract language code used for the embedded images.

    Returns:
        A single string; table rows are rendered with ``" | "`` between cells.
    """
    ocr_text = extract_text_from_images(pdf_data["images"], lang)
    table_blocks = [
        "\n".join(" | ".join(str(cell) if cell else "" for cell in row) for row in table)
        for table in pdf_data["tables"]
    ]
    sections = [pdf_data["text"], ocr_text, *table_blocks]
    return "\n".join(section for section in sections if section).strip()


def process_content(file_data, file_type, lang="eng"):
    """Extract text from an uploaded file according to its type.

    Args:
        file_data: File-like object holding the upload.
        file_type: One of ``"pdf"``, ``"docx"``, ``"txt"``, ``"png"``,
            ``"jpg"``, ``"jpeg"``.
        lang: Tesseract language code for any OCR that is needed.

    Returns:
        The extracted text, or an empty string for an unknown type.
    """
    if file_type == "pdf":
        # PDF images are OCR'd once, inside process_pdf_content, using `lang`.
        return process_pdf_content(extract_pdf_data(file_data), lang=lang)
    if file_type == "docx":
        return extract_docx_data(file_data)
    if file_type == "txt":
        return extract_text_file_data(file_data)
    if file_type in IMAGE_EXTENSIONS:
        return extract_text_from_images([Image.open(file_data)], lang)
    return ""


def generate_questions(question_type, subject_name, instructor, class_name, institution,
                       syllabus_context, num_questions, difficulty_level):
    """Ask the LLM for exam questions derived from the syllabus.

    Args:
        question_type: Label such as ``"Multiple Choice"``.
        subject_name, instructor, class_name, institution: Header details
            included in the prompt.
        syllabus_context: Extracted syllabus text.
        num_questions: Number of questions to request.
        difficulty_level: Mapping of Bloom level name to its weight; missing
            levels default to 0.

    Returns:
        The model output as a string, or ``""`` if the call failed.
    """
    chain = QUESTION_PROMPT | llm | StrOutputParser()
    try:
        return chain.invoke(
            {
                "num_questions": num_questions,
                "question_type": question_type,
                "subject_name": subject_name,
                "instructor": instructor,
                "class_name": class_name,
                "institution": institution,
                "syllabus_context": syllabus_context,
                "remember": difficulty_level.get("Remember", 0),
                "understand": difficulty_level.get("Understand", 0),
                "apply": difficulty_level.get("Apply", 0),
                "analyze": difficulty_level.get("Analyze", 0),
                "evaluate": difficulty_level.get("Evaluate", 0),
                "create": difficulty_level.get("Create", 0),
            }
        )
    except Exception as exc:
        logger.error("Error generating %s questions: %s", question_type, exc)
        return ""


def generate_answers(questions, syllabus_context):
    """Ask the LLM for answers to ``questions`` based only on the syllabus.

    Returns:
        The model output as a string, or ``""`` if the call failed.
    """
    chain = ANSWER_PROMPT | llm | StrOutputParser()
    try:
        return chain.invoke({"syllabus_context": syllabus_context, "questions": questions})
    except Exception as exc:
        logger.error("Error generating answers: %s", exc)
        return ""


def download_as_docx(content, file_name="output.docx"):
    """Render ``content`` as a DOCX document, one paragraph per line.

    ``file_name`` is unused and kept only for interface compatibility.

    Returns:
        A ``BytesIO`` positioned at the start of the document.
    """
    document = Document()
    for line in content.split("\n"):
        document.add_paragraph(line)
    buffer = BytesIO()
    document.save(buffer)
    buffer.seek(0)
    return buffer


def download_as_pdf(content, file_name="output.pdf"):
    """Render ``content`` as a PDF with wrapped lines.

    ``file_name`` is unused and kept only for interface compatibility.

    Returns:
        A ``BytesIO`` positioned at the start of the document.
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    for line in content.split("\n"):
        # Core fonts only cover Latin-1; replace anything else (curly quotes,
        # dashes, ...) instead of raising while writing the PDF.
        printable = line.encode("latin-1", "replace").decode("latin-1")
        if not printable.strip():
            pdf.ln(10)
            continue
        # Start each paragraph at the left margin so the full width is available.
        pdf.set_x(pdf.l_margin)
        pdf.multi_cell(0, 10, printable)
    # dest="S" returns str on PyFPDF and a bytearray on fpdf2.
    rendered = pdf.output(dest="S")
    payload = rendered.encode("latin-1") if isinstance(rendered, str) else bytes(rendered)
    return BytesIO(payload)


# Streamlit app with enhanced UI and multi-image upload support
st.title("Bloom's Taxonomy Based Exam Paper Developer")
st.markdown(
    """
### A powerful tool to generate exam questions and answers using AI, based on syllabus content and Bloom's Taxonomy principles.
"""
)

# Sidebar Clear Data Button
if st.sidebar.button("Clear All Data"):
    st.session_state.clear()
    st.success("All data has been cleared. You can now upload a new syllabus.")

# Upload Syllabus and Multiple Images
uploaded_file = st.sidebar.file_uploader(
    "Upload Syllabus (PDF, DOCX, TXT)", type=["pdf", "docx", "txt"]
)
uploaded_images = st.sidebar.file_uploader(
    "Upload Supplementary Images (PNG, JPG, JPEG)",
    type=["png", "jpg", "jpeg"],
    accept_multiple_files=True,
)

# Sidebar Inputs for Subject Name, Instructor, Class, and Institution
subject_name = st.sidebar.text_input("Enter Subject Name", "Subject Name")
instructor_name = st.sidebar.text_input("Enter Instructor Name", "Instructor Name")
class_name = st.sidebar.text_input("Enter Class Name", "Class Name")
institution_name = st.sidebar.text_input("Enter Institution Name", "Institution Name")

# Language Option for OCR
ocr_lang = st.sidebar.selectbox("Select OCR Language", ["eng", "spa", "fra", "deu", "ita"])

# Process uploaded file and images
if uploaded_file or uploaded_images:
    # uploaded_file is None when only images were uploaded.
    current_filename = uploaded_file.name if uploaded_file else None

    # Clear session state when a different syllabus file is uploaded
    if (
        "uploaded_filename" in st.session_state
        and st.session_state.uploaded_filename != current_filename
    ):
        st.session_state.clear()
        st.success("Previous data cleared. Processing new file...")
    st.session_state.uploaded_filename = current_filename

    # The text is rebuilt from scratch on every run, so OCR output from the
    # supplementary images is not appended again on each rerun.
    extracted_parts = []

    # Process syllabus file
    if uploaded_file:
        # Use the file extension: the MIME subtype is "plain" for TXT and a
        # long vendor string for DOCX, so it never equals "txt" or "docx".
        file_type = Path(uploaded_file.name).suffix.lower().lstrip(".")
        if file_type in SYLLABUS_EXTENSIONS:
            extracted_parts.append(process_content(uploaded_file, file_type, lang=ocr_lang))
        else:
            st.error("Unsupported file type. Please upload PDF, DOCX, or TXT files.")

    # Process images
    if uploaded_images:
        extracted_parts.append(
            extract_text_from_images([Image.open(img) for img in uploaded_images], lang=ocr_lang)
        )

    if extracted_parts:
        st.session_state.syllabus_text = "\n".join(extracted_parts)

# Preview of Syllabus
if "syllabus_text" in st.session_state:
    st.markdown("### Preview of Extracted Syllabus Content")
    st.text_area("Extracted Syllabus Content", st.session_state.syllabus_text, height=300)

# Inputs for Question Generation
if "syllabus_text" in st.session_state:
    st.markdown("### Generate Questions")
    question_type = st.selectbox(
        "Select Question Type", ["Multiple Choice", "Short Answer", "Essay"]
    )
    num_questions = st.number_input("Number of Questions", min_value=1, max_value=50, value=10)
    difficulty_levels = {
        "Remember": st.slider("Remember (%)", 0, 100, 20),
        "Understand": st.slider("Understand (%)", 0, 100, 20),
        "Apply": st.slider("Apply (%)", 0, 100, 20),
        "Analyze": st.slider("Analyze (%)", 0, 100, 20),
        "Evaluate": st.slider("Evaluate (%)", 0, 100, 10),
        "Create": st.slider("Create (%)", 0, 100, 10),
    }

    if st.button("Generate Questions"):
        with st.spinner("Generating questions..."):
            questions = generate_questions(
                question_type,
                subject_name,
                instructor_name,
                class_name,
                institution_name,
                st.session_state.syllabus_text,
                num_questions,
                difficulty_levels,
            )
        if questions:
            st.session_state.generated_questions = questions
            # Answers from an earlier question set no longer match.
            st.session_state.pop("generated_answers", None)
            st.success("Questions generated successfully!")
        else:
            st.error("Question generation failed. Check the logs and try again.")

# Display Generated Questions
if "generated_questions" in st.session_state:
    st.markdown("### Generated Questions")
    st.text_area("Questions", st.session_state.generated_questions, height=300)

    if st.button("Generate Answers"):
        with st.spinner("Generating answers..."):
            answers = generate_answers(
                st.session_state.generated_questions,
                st.session_state.syllabus_text,
            )
        if answers:
            st.session_state.generated_answers = answers
            st.success("Answers generated successfully!")
        else:
            st.error("Answer generation failed. Check the logs and try again.")

# Display Generated Answers
if "generated_answers" in st.session_state:
    st.markdown("### Generated Answers")
    st.text_area("Answers", st.session_state.generated_answers, height=300)

# Download Options
if "generated_questions" in st.session_state or "generated_answers" in st.session_state:
    st.markdown("### Download Options")
    download_choice = st.radio("Select Download Format", ["DOCX", "PDF", "TXT"])
    content_to_download = ""
    if "generated_questions" in st.session_state:
        content_to_download += (
            "Generated Questions:\n" + st.session_state.generated_questions + "\n\n"
        )
    if "generated_answers" in st.session_state:
        content_to_download += "Generated Answers:\n" + st.session_state.generated_answers

    # The download button is rendered directly. Nesting it under a regular
    # st.button makes it disappear on the rerun that follows any interaction.
    if download_choice == "DOCX":
        st.download_button(
            label="Download as DOCX",
            data=download_as_docx(content_to_download),
            file_name="exam_content.docx",
            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )
    elif download_choice == "PDF":
        st.download_button(
            label="Download as PDF",
            data=download_as_pdf(content_to_download),
            file_name="exam_content.pdf",
            mime="application/pdf",
        )
    elif download_choice == "TXT":
        st.download_button(
            label="Download as TXT",
            data=BytesIO(content_to_download.encode("utf-8")),
            file_name="exam_content.txt",
            mime="text/plain",
        )