"""Bloom's Taxonomy Based Exam Paper Developer.

Streamlit app: upload a syllabus (PDF / DOCX / TXT / image), extract its text
(including OCR on embedded images), then generate exam questions and answers
with a Groq-hosted LLM via LangChain.
"""

import os
import logging
from io import BytesIO

import streamlit as st
import pytesseract
import pdfplumber
import docx
from PIL import Image
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# Load environment variables (expects GROQ_API_KEY in .env)
load_dotenv()

# Initialize logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Initialize LLM.
# SECURITY FIX: the API key was previously hard-coded in source. Read it from
# the environment instead so the secret never ships with the code — rotate the
# old key, it must be considered compromised.
llm = ChatGroq(
    temperature=0.5,
    groq_api_key=os.getenv("GROQ_API_KEY"),
    model_name="llama3-8b-8192",
)

# OCR Configuration for Pytesseract
pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract"  # Adjust to your system's path


def extract_text_from_images(images, lang="eng"):
    """Run Tesseract OCR over a list of PIL images and return the combined text.

    Args:
        images: iterable of PIL.Image objects.
        lang: Tesseract language code (e.g. "eng", "spa").

    Returns:
        OCR text, one image per line, stripped. Images that fail OCR are
        logged and skipped rather than aborting the whole batch.
    """
    ocr_text = ""
    for image in images:
        try:
            ocr_text += pytesseract.image_to_string(image, lang=lang).strip() + "\n"
        except Exception as e:
            logging.error(f"Error in OCR: {e}")
    return ocr_text.strip()


def extract_pdf_data(pdf_path):
    """Extract text, tables and embedded images from a PDF.

    Args:
        pdf_path: path or file-like object accepted by pdfplumber.open().

    Returns:
        dict with keys "text" (str), "tables" (list of row lists) and
        "images" (list of PIL.Image). On error an empty/partial dict is
        returned and the error is logged.
    """
    data = {"text": "", "tables": [], "images": []}
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # Separate pages with a newline so words at page breaks
                # don't run together.
                data["text"] += (page.extract_text() or "") + "\n"
                for table in page.extract_tables():
                    data["tables"].append(table)
                for image in page.images:
                    # BUG FIX: pdfplumber has no pdf.extract_image() (that is
                    # PyMuPDF's API). Decode each image's raw content stream
                    # instead, best-effort: some PDF filters produce bytes
                    # PIL cannot decode, so skip those rather than aborting
                    # the whole extraction.
                    try:
                        raw = image["stream"].get_data()
                        data["images"].append(Image.open(BytesIO(raw)))
                    except Exception as img_err:
                        logging.warning(f"Skipping undecodable PDF image: {img_err}")
    except Exception as e:
        logging.error(f"Error processing PDF: {e}")
    return data


def extract_docx_data(docx_file):
    """Return the non-empty paragraph text of a DOCX file, newline-joined.

    Returns "" (and logs) on failure.
    """
    try:
        doc = docx.Document(docx_file)
        return "\n".join(para.text.strip() for para in doc.paragraphs if para.text.strip())
    except Exception as e:
        logging.error(f"Error extracting DOCX content: {e}")
        return ""


def extract_text_file_data(text_file):
    """Decode an uploaded plain-text file as UTF-8 and return it stripped.

    Returns "" (and logs) on failure.
    """
    try:
        return text_file.read().decode("utf-8").strip()
    except Exception as e:
        logging.error(f"Error extracting TXT content: {e}")
        return ""


def process_content(file_data, file_type, lang="eng"):
    """Dispatch extraction by file type and append OCR text from any images.

    Args:
        file_data: uploaded file object (or path for PDFs).
        file_type: lowercase extension: "pdf", "docx", "txt", "png", "jpg"
            or "jpeg".
        lang: OCR language code forwarded to Tesseract.

    Returns:
        Extracted text plus OCR text, joined by a newline.
    """
    text = ""
    images = []
    if file_type == "pdf":
        pdf_data = extract_pdf_data(file_data)
        text = process_pdf_content(pdf_data)
        images = pdf_data["images"]
    elif file_type == "docx":
        text = extract_docx_data(file_data)
    elif file_type == "txt":
        text = extract_text_file_data(file_data)
    elif file_type in ["png", "jpg", "jpeg"]:
        images.append(Image.open(file_data))
    ocr_text = extract_text_from_images(images, lang)
    return text + "\n" + ocr_text


def process_pdf_content(pdf_data):
    """Flatten extract_pdf_data() output into one text blob.

    Combines page text, OCR of embedded images, and tables rendered as
    pipe-separated rows.
    """
    ocr_text = extract_text_from_images(pdf_data["images"])
    combined_text = pdf_data["text"] + ocr_text
    table_text = ""
    for table in pdf_data["tables"]:
        table_rows = [" | ".join(str(cell) if cell else "" for cell in row) for row in table]
        table_text += "\n".join(table_rows) + "\n"
    return (combined_text + "\n" + table_text).strip()


def generate_questions(question_type, subject_name, instructor, class_name,
                       institution, syllabus_context, num_questions, difficulty_level):
    """Ask the LLM for exam questions derived from the syllabus.

    Args:
        question_type: e.g. "MCQs", "Short Questions".
        subject_name / instructor / class_name / institution: header metadata.
        syllabus_context: extracted syllabus text the questions must be based on.
        num_questions: how many questions to request.
        difficulty_level: dict mapping Bloom levels ("Remember", ... "Create")
            to counts; missing keys default to 0.

    Returns:
        The raw LLM response string, or "" on error (logged).
    """
    # BUG FIX: the original f-string-interpolated the syllabus text directly
    # into the template, so any literal '{' or '}' in the syllabus made
    # ChatPromptTemplate raise a KeyError at invoke time. All dynamic text is
    # now passed as proper template variables.
    prompt_template = """
    Based on the following syllabus content, generate {num_questions} {question_type} questions.
    Ensure the questions are directly derived from the provided syllabus content.
    Subject: {subject_name}
    Instructor: {instructor}
    Class: {class_name}
    Institution: {institution}
    Syllabus Content: {syllabus_context}
    Difficulty Levels:
    - Remember: {remember}
    - Understand: {understand}
    - Apply: {apply}
    - Analyze: {analyze}
    - Evaluate: {evaluate}
    - Create: {create}
    Format questions as follows:
    Q1. ________________
    Q2. ________________
    ...
    """
    chain = ChatPromptTemplate.from_template(prompt_template) | llm | StrOutputParser()
    try:
        return chain.invoke({
            "num_questions": num_questions,
            "question_type": question_type,
            "subject_name": subject_name,
            "instructor": instructor,
            "class_name": class_name,
            "institution": institution,
            "syllabus_context": syllabus_context,
            "remember": difficulty_level.get("Remember", 0),
            "understand": difficulty_level.get("Understand", 0),
            "apply": difficulty_level.get("Apply", 0),
            "analyze": difficulty_level.get("Analyze", 0),
            "evaluate": difficulty_level.get("Evaluate", 0),
            "create": difficulty_level.get("Create", 0),
        })
    except Exception as e:
        logging.error(f"Error generating {question_type} questions: {e}")
        return ""


def generate_answers(questions, syllabus_context):
    """Ask the LLM for answers to previously generated questions.

    Answers are constrained to the given syllabus content. Returns the raw
    LLM response string, or "" on error (logged).
    """
    # Same template-variable fix as generate_questions: never interpolate
    # untrusted text into the template string itself.
    prompt = """
    Based on the provided syllabus content, generate detailed answers for the following questions.
    The answers must only be based on the syllabus content.
    Syllabus Content: {syllabus_context}
    Questions: {questions}
    Format answers as follows:
    Answer 1: ________________
    Answer 2: ________________
    ...
    """
    chain = ChatPromptTemplate.from_template(prompt) | llm | StrOutputParser()
    try:
        return chain.invoke({"syllabus_context": syllabus_context, "questions": questions})
    except Exception as e:
        logging.error(f"Error generating answers: {e}")
        return ""


# ----------------------------- Streamlit app ------------------------------ #

st.title("Bloom's Taxonomy Based Exam Paper Developer")

# Sidebar Clear Data Button
if st.sidebar.button("Clear All Data"):
    st.session_state.clear()
    st.success("All data has been cleared. You can now upload a new syllabus.")

# Syllabus Upload with Automatic Clearing ("jpeg" added to match what
# process_content already supports).
uploaded_file = st.sidebar.file_uploader(
    "Upload Syllabus (PDF, DOCX, TXT, Image)",
    type=["pdf", "docx", "txt", "png", "jpg", "jpeg"],
)

# Sidebar Inputs for Subject Name, Instructor, Class, and Institution
subject_name = st.sidebar.text_input("Enter Subject Name", "Subject Name")
instructor_name = st.sidebar.text_input("Enter Instructor Name", "Instructor Name")
class_name = st.sidebar.text_input("Enter Class Name", "Class Name")
institution_name = st.sidebar.text_input("Enter Institution Name", "Institution Name")

# Language Option for OCR
ocr_lang = st.sidebar.selectbox("Select OCR Language", ["eng", "spa", "fra", "deu", "ita"])

if uploaded_file:
    # Clear session state when a new file is uploaded
    if "uploaded_filename" in st.session_state and st.session_state.uploaded_filename != uploaded_file.name:
        st.session_state.clear()
        st.success("Previous data cleared. Processing new file...")
    st.session_state.uploaded_filename = uploaded_file.name

    # BUG FIX: the file type was previously derived from the MIME type
    # (uploaded_file.type.split("/")[-1]), which yields
    # "vnd.openxmlformats-officedocument.wordprocessingml.document" for DOCX
    # and "plain" for TXT — so those uploads were always rejected. Use the
    # filename extension instead.
    file_type = uploaded_file.name.rsplit(".", 1)[-1].lower()

    # Validate file type
    if file_type not in ["pdf", "docx", "txt", "png", "jpg", "jpeg"]:
        st.error("Unsupported file type. Please upload PDF, DOCX, TXT, or image files.")
    else:
        syllabus_text = process_content(uploaded_file, file_type, lang=ocr_lang)
        st.session_state.syllabus_text = syllabus_text

# Preview of Syllabus
if "syllabus_text" in st.session_state:
    st.subheader("Syllabus Preview:")
    st.text_area("Extracted Content", st.session_state.syllabus_text[:1000], height=300)
else:
    st.warning("Please upload a syllabus to begin.")

# Question Type Selection
question_type = st.sidebar.radio(
    "Select Question Type",
    ("MCQs", "Short Questions", "Long Questions", "Fill in the Blanks", "Case Studies", "Diagram-based"),
)
difficulty_levels = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]
difficulty = {level: st.sidebar.slider(level, 0, 5, 1) for level in difficulty_levels}
num_questions = st.sidebar.number_input("Number of Questions", min_value=1, max_value=50, value=10)

if st.sidebar.button("Generate Questions"):
    if "syllabus_text" in st.session_state:
        with st.spinner(f"Generating {question_type}..."):
            syllabus_context = st.session_state.syllabus_text
            st.session_state.generated_questions = generate_questions(
                question_type, subject_name, instructor_name, class_name,
                institution_name, syllabus_context, num_questions, difficulty,
            )
            st.text_area(f"Generated {question_type}", value=st.session_state.generated_questions, height=400)
    else:
        st.error("Please upload a syllabus before generating questions.")

if st.sidebar.button("Generate Answers for Questions"):
    # ROBUSTNESS FIX: also require syllabus_text — it can be absent even when
    # questions exist (e.g. after "Clear All Data"); the original raised a
    # KeyError here.
    if "generated_questions" in st.session_state and "syllabus_text" in st.session_state:
        with st.spinner("Generating answers..."):
            syllabus_context = st.session_state.syllabus_text
            st.session_state.generated_answers = generate_answers(
                st.session_state.generated_questions, syllabus_context,
            )
            st.text_area("Generated Answers", value=st.session_state.generated_answers, height=400)
    else:
        st.error("Generate questions first before generating answers.")

if "generated_questions" in st.session_state:
    st.sidebar.download_button(
        label="Download Questions",
        data=st.session_state.generated_questions,
        file_name=f"{subject_name}_questions.txt",
        mime="text/plain",
    )

if "generated_answers" in st.session_state:
    st.sidebar.download_button(
        label="Download Answers",
        data=st.session_state.generated_answers,
        file_name=f"{subject_name}_answers.txt",
        mime="text/plain",
    )

st.markdown("""
---
**Advanced Test Paper Generator** - powered by LangChain, Pinecone, and Streamlit.
""")