import streamlit as st
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv
import pytesseract
from PIL import Image
import pdfplumber
import docx
from io import BytesIO
import logging
import os
from docx import Document
from fpdf import FPDF
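# Assumed runtime dependencies (inferred from the imports above, not pinned in this file):
# streamlit, langchain-groq, langchain-core, python-dotenv, pytesseract, Pillow, pdfplumber,
# python-docx, fpdf2.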
# Load environment variables
load_dotenv()
# Initialize logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# Initialize LLM. The Groq API key is read from the GROQ_API_KEY environment variable,
# e.g. a .env entry of the form GROQ_API_KEY=<your-key>, loaded by load_dotenv() above.
llm = ChatGroq(temperature=0.5, groq_api_key=os.getenv("GROQ_API_KEY"), model_name="llama3-8b-8192")
# OCR Configuration for Pytesseract
pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract"  # Adjust to your system's path
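# On Windows the Tesseract executable typically lives at
# r"C:\Program Files\Tesseract-OCR\tesseract.exe" (default installer path; verify locally).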
# Enhanced OCR with configurable language option and multi-image support
def extract_text_from_images(images, lang="eng"):
    ocr_text = ""
    for image in images:
        try:
            ocr_text += pytesseract.image_to_string(image, lang=lang).strip() + "\n"
        except Exception as e:
            logging.error(f"Error in OCR: {e}")
    return ocr_text.strip()
# Function to extract text, images, and tables from PDF
def extract_pdf_data(pdf_file):
    data = {"text": "", "tables": [], "images": []}
    try:
        with pdfplumber.open(pdf_file) as pdf:
            for page in pdf.pages:
                data["text"] += (page.extract_text() or "") + "\n"
                for table in page.extract_tables():
                    data["tables"].append(table)
                for image in page.images:
                    try:
                        # pdfplumber exposes image bounding boxes rather than decoded image
                        # streams, so crop the page to the bbox and render it as a PIL image.
                        bbox = (image["x0"], image["top"], image["x1"], image["bottom"])
                        data["images"].append(page.crop(bbox).to_image(resolution=150).original)
                    except Exception as e:
                        logging.error(f"Error extracting embedded image: {e}")
    except Exception as e:
        logging.error(f"Error processing PDF: {e}")
    return data
# Function to extract text from DOCX files
def extract_docx_data(docx_file):
    try:
        doc = docx.Document(docx_file)
        text = "\n".join([para.text.strip() for para in doc.paragraphs if para.text.strip()])
        return text
    except Exception as e:
        logging.error(f"Error extracting DOCX content: {e}")
        return ""
# Function to extract text from plain text files
def extract_text_file_data(text_file):
    try:
        return text_file.read().decode("utf-8").strip()
    except Exception as e:
        logging.error(f"Error extracting TXT content: {e}")
        return ""
# Function to process extracted content (PDF, DOCX, etc.)
def process_content(file_data, file_type, lang="eng"):
    text = ""
    images = []
    if file_type == "pdf":
        pdf_data = extract_pdf_data(file_data)
        # process_pdf_content() already runs OCR on the embedded images, so they are
        # not added to `images` here (that would OCR them a second time).
        text = process_pdf_content(pdf_data, lang)
    elif file_type == "docx":
        text = extract_docx_data(file_data)
    elif file_type == "txt":
        text = extract_text_file_data(file_data)
    elif file_type in ["png", "jpg", "jpeg"]:
        images.append(Image.open(file_data))
    ocr_text = extract_text_from_images(images, lang)
    return (text + "\n" + ocr_text).strip()
# Function to process PDF content
def process_pdf_content(pdf_data, lang="eng"):
    ocr_text = extract_text_from_images(pdf_data["images"], lang)
    combined_text = pdf_data["text"] + "\n" + ocr_text
    table_text = ""
    for table in pdf_data["tables"]:
        table_rows = [" | ".join(str(cell) if cell else "" for cell in row) for row in table]
        table_text += "\n".join(table_rows) + "\n"
    return (combined_text + "\n" + table_text).strip()
# Function to generate questions
def generate_questions(question_type, subject_name, instructor, class_name, institution, syllabus_context, num_questions, difficulty_level):
    # The syllabus content is supplied via invoke() rather than the f-string, so that
    # braces in the extracted text are not misread as prompt-template variables.
    prompt_template = f"""
    Based on the following syllabus content, generate {num_questions} {question_type} questions. Ensure the questions are directly derived from the provided syllabus content.
    Subject: {subject_name}
    Instructor: {instructor}
    Class: {class_name}
    Institution: {institution}
    Syllabus Content: {{syllabus_context}}
    Difficulty Levels:
    - Remember: {difficulty_level.get('Remember', 0)}
    - Understand: {difficulty_level.get('Understand', 0)}
    - Apply: {difficulty_level.get('Apply', 0)}
    - Analyze: {difficulty_level.get('Analyze', 0)}
    - Evaluate: {difficulty_level.get('Evaluate', 0)}
    - Create: {difficulty_level.get('Create', 0)}
    Format questions as follows:
    Q1. ________________
    Q2. ________________
    ...
    """
    chain = (ChatPromptTemplate.from_template(prompt_template) | llm | StrOutputParser())
    try:
        return chain.invoke({"syllabus_context": syllabus_context})
    except Exception as e:
        logging.error(f"Error generating {question_type} questions: {e}")
        return ""
# Function to generate answers
def generate_answers(questions, syllabus_context):
    # Both the syllabus and the questions are supplied via invoke(), so any braces
    # they contain are not treated as template variables.
    prompt = """
    Based on the provided syllabus content, generate detailed answers for the following questions. The answers must only be based on the syllabus content.
    Syllabus Content: {syllabus_context}
    Questions:
    {questions}
    Format answers as follows:
    Answer 1: ________________
    Answer 2: ________________
    ...
    """
    chain = (ChatPromptTemplate.from_template(prompt) | llm | StrOutputParser())
    try:
        return chain.invoke({"syllabus_context": syllabus_context, "questions": questions})
    except Exception as e:
        logging.error(f"Error generating answers: {e}")
        return ""
# Function to download as DOCX
def download_as_docx(content, file_name="output.docx"):
    doc = Document()
    for line in content.split("\n"):
        doc.add_paragraph(line)
    buffer = BytesIO()
    doc.save(buffer)
    buffer.seek(0)
    return buffer
# Function to download as PDF
def download_as_pdf(content, file_name="output.pdf"):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    for line in content.split("\n"):
        # Core PDF fonts only cover Latin-1, so replace unsupported characters;
        # multi_cell wraps long lines instead of overflowing the page width.
        safe_line = line.encode("latin-1", "replace").decode("latin-1")
        pdf.multi_cell(0, 10, safe_line)
    # With the fpdf2 package, output() returns the document as a bytearray when called
    # without a destination; wrap it in BytesIO for st.download_button.
    return BytesIO(bytes(pdf.output()))
# Streamlit app with enhanced UI and multi-image upload support
st.title("Bloom's Taxonomy Based Exam Paper Developer")
st.markdown("""
### A powerful tool to generate exam questions and answers using AI, based on syllabus content and Bloom's Taxonomy principles.
""")
# Sidebar Clear Data Button
if st.sidebar.button("Clear All Data"):
    st.session_state.clear()
    st.success("All data has been cleared. You can now upload a new syllabus.")
# Upload Syllabus and Multiple Images
uploaded_file = st.sidebar.file_uploader(
    "Upload Syllabus (PDF, DOCX, TXT)",
    type=["pdf", "docx", "txt"]
)
uploaded_images = st.sidebar.file_uploader(
    "Upload Supplementary Images (PNG, JPG, JPEG)",
    type=["png", "jpg", "jpeg"],
    accept_multiple_files=True
)
# Sidebar Inputs for Subject Name, Instructor, Class, and Institution
subject_name = st.sidebar.text_input("Enter Subject Name", "Subject Name")
instructor_name = st.sidebar.text_input("Enter Instructor Name", "Instructor Name")
class_name = st.sidebar.text_input("Enter Class Name", "Class Name")
institution_name = st.sidebar.text_input("Enter Institution Name", "Institution Name")
# Language Option for OCR
ocr_lang = st.sidebar.selectbox("Select OCR Language", ["eng", "spa", "fra", "deu", "ita"])
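# Non-English OCR choices require the matching Tesseract traineddata to be installed
# (e.g. the tesseract-ocr-spa package on Debian/Ubuntu); "eng" normally ships by default.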
# Process uploaded file and images
if uploaded_file or uploaded_images:
    # Clear session state when a different syllabus file is uploaded
    if uploaded_file and "uploaded_filename" in st.session_state and st.session_state.uploaded_filename != uploaded_file.name:
        st.session_state.clear()
        st.success("Previous data cleared. Processing new file...")
    st.session_state.uploaded_filename = uploaded_file.name if uploaded_file else None
    # Process syllabus file
    syllabus_text = ""
    if uploaded_file:
        # Derive the type from the file extension (the MIME type reported for DOCX/TXT
        # uploads does not end in "docx"/"txt").
        file_type = uploaded_file.name.split(".")[-1].lower()
        if file_type in ["pdf", "docx", "txt"]:
            uploaded_file.seek(0)  # rewind in case the buffer was read on a previous rerun
            syllabus_text = process_content(uploaded_file, file_type, lang=ocr_lang)
        else:
            st.error("Unsupported file type. Please upload PDF, DOCX, or TXT files.")
    # Process supplementary images
    if uploaded_images:
        image_text = extract_text_from_images([Image.open(img) for img in uploaded_images], lang=ocr_lang)
        syllabus_text = (syllabus_text + "\n" + image_text).strip()
    # Rebuild the stored syllabus text from scratch each run so image OCR output is not
    # appended repeatedly across Streamlit reruns.
    if syllabus_text.strip():
        st.session_state.syllabus_text = syllabus_text
# Preview of Syllabus
if "syllabus_text" in st.session_state:
    st.markdown("### Preview of Extracted Syllabus Content")
    st.text_area("Extracted Syllabus Content", st.session_state.syllabus_text, height=300)
# Inputs for Question Generation
if "syllabus_text" in st.session_state:
    st.markdown("### Generate Questions")
    question_type = st.selectbox("Select Question Type", ["Multiple Choice", "Short Answer", "Essay"])
    num_questions = st.number_input("Number of Questions", min_value=1, max_value=50, value=10)
    difficulty_levels = {
        "Remember": st.slider("Remember (%)", 0, 100, 20),
        "Understand": st.slider("Understand (%)", 0, 100, 20),
        "Apply": st.slider("Apply (%)", 0, 100, 20),
        "Analyze": st.slider("Analyze (%)", 0, 100, 20),
        "Evaluate": st.slider("Evaluate (%)", 0, 100, 10),
        "Create": st.slider("Create (%)", 0, 100, 10),
    }
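    # Optional sanity check (assumes the six sliders are meant to sum to 100%): warn when
    # the Bloom's level percentages do not add up, so the intended weighting is unambiguous.
    if sum(difficulty_levels.values()) != 100:
        st.warning("The difficulty level percentages do not sum to 100%.")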
if st.button("Generate Questions"): | |
with st.spinner("Generating questions..."): | |
questions = generate_questions( | |
question_type, | |
subject_name, | |
instructor_name, | |
class_name, | |
institution_name, | |
st.session_state.syllabus_text, | |
num_questions, | |
difficulty_levels, | |
) | |
st.session_state.generated_questions = questions | |
st.success("Questions generated successfully!") | |
# Display Generated Questions
if "generated_questions" in st.session_state:
    st.markdown("### Generated Questions")
    st.text_area("Questions", st.session_state.generated_questions, height=300)
    if st.button("Generate Answers"):
        with st.spinner("Generating answers..."):
            answers = generate_answers(
                st.session_state.generated_questions,
                st.session_state.syllabus_text,
            )
            st.session_state.generated_answers = answers
            st.success("Answers generated successfully!")
# Display Generated Answers
if "generated_answers" in st.session_state:
    st.markdown("### Generated Answers")
    st.text_area("Answers", st.session_state.generated_answers, height=300)
# Download Options
if "generated_questions" in st.session_state or "generated_answers" in st.session_state:
    st.markdown("### Download Options")
    download_choice = st.radio("Select Download Format", ["DOCX", "PDF", "TXT"])
    content_to_download = ""
    if "generated_questions" in st.session_state:
        content_to_download += "Generated Questions:\n" + st.session_state.generated_questions + "\n\n"
    if "generated_answers" in st.session_state:
        content_to_download += "Generated Answers:\n" + st.session_state.generated_answers
    if st.button("Download"):
        if download_choice == "DOCX":
            buffer = download_as_docx(content_to_download)
            st.download_button(
                label="Download as DOCX",
                data=buffer,
                file_name="exam_content.docx",
                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            )
        elif download_choice == "PDF":
            buffer = download_as_pdf(content_to_download)
            st.download_button(
                label="Download as PDF",
                data=buffer,
                file_name="exam_content.pdf",
                mime="application/pdf",
            )
        elif download_choice == "TXT":
            buffer = BytesIO(content_to_download.encode("utf-8"))
            st.download_button(
                label="Download as TXT",
                data=buffer,
                file_name="exam_content.txt",
                mime="text/plain",
            )