# Advanced_Paper / app.py
# (Repository page header captured with the file -- not part of the program:)
#   ahm14's picture
#   Rename final_test3.py to app.py
#   f724766 verified
import logging
import os
from io import BytesIO

import docx
import pdfplumber
import pytesseract
import streamlit as st
from docx import Document
from dotenv import load_dotenv
from fpdf import FPDF
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from PIL import Image
# Load environment variables (e.g. GROQ_API_KEY) from a local .env file, if one exists
load_dotenv()
# Initialize logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# Initialize LLM
# SECURITY: the API key is read from the environment instead of being embedded in
# source. The key that used to be hard-coded on this line has been exposed through
# version control and must be revoked and replaced.
llm = ChatGroq(
    temperature=0.5,
    groq_api_key=os.getenv("GROQ_API_KEY"),
    model_name="llama3-8b-8192",
)
# OCR configuration for Pytesseract: TESSERACT_CMD overrides the default binary path
pytesseract.pytesseract.tesseract_cmd = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
# OCR helper: runs Tesseract over a batch of images with a configurable language
def extract_text_from_images(images, lang="eng"):
    """Run OCR over every image and return the combined text.

    Args:
        images: Iterable of images to hand to ``pytesseract.image_to_string``.
        lang: Language code forwarded to the OCR call.

    Returns:
        The stripped OCR result of each image followed by a newline, all
        concatenated and stripped once more. An image whose OCR raises is
        logged and contributes nothing.
    """
    recognized_chunks = []
    for img in images:
        try:
            raw_text = pytesseract.image_to_string(img, lang=lang)
            recognized_chunks.append(raw_text.strip() + "\n")
        except Exception as e:
            logging.error(f"Error in OCR: {e}")
    return "".join(recognized_chunks).strip()
# Functions to extract text, tables, and embedded images from a PDF
def _pdf_image_to_pil(page, image):
    """Convert one entry of ``page.images`` into a PIL image.

    The embedded stream is tried first, which works when the stream is itself
    a complete image file. If PIL cannot decode it, the image's bounding box
    is rendered from the page instead.

    Raises:
        Exception: whatever the rendering fallback raises; the caller logs it.
    """
    try:
        pil_image = Image.open(BytesIO(image["stream"].get_data()))
        pil_image.load()  # Image.open is lazy; decode now so a bad stream fails here
        return pil_image
    except Exception:
        bbox = (image["x0"], image["top"], image["x1"], image["bottom"])
        return page.crop(bbox).to_image(resolution=200).original


def extract_pdf_data(pdf_path):
    """Extract text, tables and embedded images from a PDF.

    Args:
        pdf_path: Path or file-like object accepted by ``pdfplumber.open``.

    Returns:
        A dict with keys ``"text"`` (non-empty page texts joined by newlines),
        ``"tables"`` (one entry per extracted table) and ``"images"`` (PIL
        images). On failure the error is logged and whatever was collected so
        far is returned; the function does not raise.
    """
    data = {"text": "", "tables": [], "images": []}
    page_texts = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_texts.append(page.extract_text() or "")
                data["tables"].extend(page.extract_tables())
                for image in page.images:
                    # One undecodable image must not abort the rest of the document.
                    try:
                        data["images"].append(_pdf_image_to_pil(page, image))
                    except Exception as e:
                        logging.error(f"Error extracting PDF image: {e}")
    except Exception as e:
        logging.error(f"Error processing PDF: {e}")
    # Separate pages with a newline so words at page boundaries do not fuse.
    data["text"] = "\n".join(text for text in page_texts if text)
    return data
# Function to extract paragraph text from DOCX files
def extract_docx_data(docx_file):
    """Return the non-blank paragraphs of a DOCX file, one per line.

    Args:
        docx_file: Path or file-like object passed to ``docx.Document``.

    Returns:
        The stripped text of every non-empty paragraph joined by newlines, or
        an empty string (after logging) if the document cannot be read.
    """
    try:
        document = docx.Document(docx_file)
        kept_lines = []
        for paragraph in document.paragraphs:
            stripped = paragraph.text.strip()
            if stripped:
                kept_lines.append(stripped)
        return "\n".join(kept_lines)
    except Exception as e:
        logging.error(f"Error extracting DOCX content: {e}")
        return ""
# Function to extract text from plain text files
def extract_text_file_data(text_file):
    """Read a text file object and return its stripped contents.

    Args:
        text_file: Object with a ``read()`` method returning ``bytes``
            (decoded as UTF-8) or ``str`` (used as-is).

    Returns:
        The stripped text, or an empty string (after logging) if the content
        cannot be read or is not valid UTF-8.
    """
    try:
        raw = text_file.read()
        if isinstance(raw, (bytes, bytearray)):
            # "utf-8-sig" also consumes a leading byte-order mark, which plain
            # "utf-8" would leave in the text as "\ufeff" (strip() keeps it).
            raw = bytes(raw).decode("utf-8-sig")
        return raw.strip()
    except Exception as e:
        logging.error(f"Error extracting TXT content: {e}")
        return ""
# Functions to turn an uploaded file (PDF, DOCX, TXT, or image) into plain text
def process_content(file_data, file_type, lang="eng"):
    """Extract text from an uploaded file according to its type.

    Args:
        file_data: The uploaded file object (or path) to read.
        file_type: One of "pdf", "docx", "txt", "png", "jpg", "jpeg"; any
            other value yields no text.
        lang: OCR language used for images and for images embedded in PDFs.

    Returns:
        The extracted text, a newline, then the OCR text of a directly
        uploaded image (empty for the other file types).
    """
    text = ""
    ocr_text = ""
    if file_type == "pdf":
        # process_pdf_content already OCRs the embedded images; running OCR on
        # them a second time here would duplicate that text in the result.
        text = process_pdf_content(extract_pdf_data(file_data), lang=lang)
    elif file_type == "docx":
        text = extract_docx_data(file_data)
    elif file_type == "txt":
        text = extract_text_file_data(file_data)
    elif file_type in ("png", "jpg", "jpeg"):
        ocr_text = extract_text_from_images([Image.open(file_data)], lang)
    return text + "\n" + ocr_text


def process_pdf_content(pdf_data, lang="eng"):
    """Flatten extracted PDF data into a single text block.

    Args:
        pdf_data: Dict with "text", "tables" and "images" keys, as produced by
            ``extract_pdf_data``.
        lang: OCR language for the embedded images.

    Returns:
        Page text, then OCR text of the images (when any), then each table
        rendered one row per line with cells separated by " | "; stripped.
    """
    images = pdf_data["images"]
    ocr_text = extract_text_from_images(images, lang) if images else ""
    sections = [pdf_data["text"]]
    if ocr_text:
        # Kept on its own line so OCR output does not fuse with the last word
        # of the page text.
        sections.append(ocr_text)
    combined_text = "\n".join(sections)
    table_text = ""
    for table in pdf_data["tables"]:
        table_rows = [
            " | ".join("" if cell is None else str(cell) for cell in row)
            for row in table
        ]
        table_text += "\n".join(table_rows) + "\n"
    return (combined_text + "\n" + table_text).strip()
# Function to generate questions
def generate_questions(question_type, subject_name, instructor, class_name, institution, syllabus_context, num_questions, difficulty_level):
    """Ask the LLM for exam questions derived from the syllabus.

    Args:
        question_type: Kind of question to request (e.g. "Essay").
        subject_name: Subject shown in the prompt.
        instructor: Instructor shown in the prompt.
        class_name: Class shown in the prompt.
        institution: Institution shown in the prompt.
        syllabus_context: Syllabus text the questions must be based on.
        num_questions: Number of questions to request.
        difficulty_level: Mapping of Bloom's level name to its weight;
            missing levels (or a ``None`` mapping) count as 0.

    Returns:
        The model's reply as a string, or "" (after logging) if the call fails.
    """
    # The values are supplied as template variables instead of being baked in
    # with an f-string: ChatPromptTemplate parses "{...}" in its template, so
    # braces inside the syllabus text would otherwise be taken for missing
    # variables and make every invocation fail.
    prompt_template = """
Based on the following syllabus content, generate {num_questions} {question_type} questions. Ensure the questions are directly derived from the provided syllabus content.
Subject: {subject_name}
Instructor: {instructor}
Class: {class_name}
Institution: {institution}
Syllabus Content: {syllabus_context}
Difficulty Levels:
- Remember: {remember}
- Understand: {understand}
- Apply: {apply}
- Analyze: {analyze}
- Evaluate: {evaluate}
- Create: {create}
Format questions as follows:
Q1. ________________
Q2. ________________
...
"""
    levels = difficulty_level or {}
    inputs = {
        "num_questions": num_questions,
        "question_type": question_type,
        "subject_name": subject_name,
        "instructor": instructor,
        "class_name": class_name,
        "institution": institution,
        "syllabus_context": syllabus_context,
        "remember": levels.get("Remember", 0),
        "understand": levels.get("Understand", 0),
        "apply": levels.get("Apply", 0),
        "analyze": levels.get("Analyze", 0),
        "evaluate": levels.get("Evaluate", 0),
        "create": levels.get("Create", 0),
    }
    chain = ChatPromptTemplate.from_template(prompt_template) | llm | StrOutputParser()
    try:
        return chain.invoke(inputs)
    except Exception as e:
        logging.error(f"Error generating {question_type} questions: {e}")
        return ""
# Function to generate answers
def generate_answers(questions, syllabus_context):
    """Ask the LLM to answer previously generated questions from the syllabus.

    Args:
        questions: The question text to answer.
        syllabus_context: Syllabus text the answers must be based on.

    Returns:
        The model's reply as a string, or "" (after logging) if the call fails.
    """
    # Both inputs are free text (one of them model-generated) and may contain
    # "{" or "}". They are passed as template variables so ChatPromptTemplate
    # never parses them as part of the template itself.
    prompt = """
Based on the provided syllabus content, generate detailed answers for the following questions. The answers must only be based on the syllabus content.
Syllabus Content: {syllabus_context}
Questions:
{questions}
Format answers as follows:
Answer 1: ________________
Answer 2: ________________
...
"""
    chain = ChatPromptTemplate.from_template(prompt) | llm | StrOutputParser()
    try:
        return chain.invoke({"syllabus_context": syllabus_context, "questions": questions})
    except Exception as e:
        logging.error(f"Error generating answers: {e}")
        return ""
# Function to build an in-memory DOCX from text
def download_as_docx(content, file_name="output.docx"):
    """Render *content* as a DOCX document held in memory.

    Args:
        content: Text to write; each newline-separated line becomes a paragraph.
        file_name: Not used by this function; kept for the existing signature.

    Returns:
        A ``BytesIO`` positioned at the start of the saved document.
    """
    document = Document()
    for paragraph_text in content.split("\n"):
        document.add_paragraph(paragraph_text)
    stream = BytesIO()
    document.save(stream)
    stream.seek(0)
    return stream
# Function to build an in-memory PDF from text
def download_as_pdf(content, file_name="output.pdf"):
    """Render *content* as a PDF held in memory.

    Args:
        content: Text to write, one source line per paragraph. Lines wider
            than the page are wrapped; characters outside Latin-1 are replaced
            with "?".
        file_name: Not used by this function; kept for the existing signature.

    Returns:
        A ``BytesIO`` positioned at the start of the PDF bytes.
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    for line in content.split("\n"):
        # The built-in fonts only cover Latin-1; typical LLM output (curly
        # quotes, dashes, bullets) would otherwise raise while rendering.
        printable = line.encode("latin-1", "replace").decode("latin-1")
        if not printable.strip():
            pdf.ln(10)  # keep blank lines as vertical spacing
            continue
        # Start every paragraph at the left margin: some fpdf versions leave
        # the cursor at the right edge after multi_cell().
        pdf.set_x(pdf.l_margin)
        pdf.multi_cell(0, 10, printable)  # wraps, unlike cell(), which overflows the page
    # dest="S" returns the document instead of writing it out.
    # NOTE(review): expected to be str (Latin-1) on legacy PyFPDF and a
    # bytearray on fpdf2 -- confirm against the installed package.
    rendered = pdf.output(dest="S")
    if isinstance(rendered, str):
        rendered = rendered.encode("latin-1")
    return BytesIO(bytes(rendered))
# Streamlit app with enhanced UI and multi-image upload support.
# Streamlit re-runs this whole section on every widget interaction, so everything
# that must survive a rerun is kept in st.session_state.
st.title("Bloom's Taxonomy Based Exam Paper Developer")
st.markdown("""
### A powerful tool to generate exam questions and answers using AI, based on syllabus content and Bloom's Taxonomy principles.
""")
# Sidebar Clear Data Button
if st.sidebar.button("Clear All Data"):
    st.session_state.clear()
    st.success("All data has been cleared. You can now upload a new syllabus.")
# Upload Syllabus and Multiple Images
uploaded_file = st.sidebar.file_uploader(
    "Upload Syllabus (PDF, DOCX, TXT)",
    type=["pdf", "docx", "txt"]
)
uploaded_images = st.sidebar.file_uploader(
    "Upload Supplementary Images (PNG, JPG, JPEG)",
    type=["png", "jpg", "jpeg"],
    accept_multiple_files=True
)
# Sidebar Inputs for Subject Name, Instructor, Class, and Institution
subject_name = st.sidebar.text_input("Enter Subject Name", "Subject Name")
instructor_name = st.sidebar.text_input("Enter Instructor Name", "Instructor Name")
class_name = st.sidebar.text_input("Enter Class Name", "Class Name")
institution_name = st.sidebar.text_input("Enter Institution Name", "Institution Name")
# Language Option for OCR
ocr_lang = st.sidebar.selectbox("Select OCR Language", ["eng", "spa", "fra", "deu", "ita"])
# Process uploaded file and images
if uploaded_file or uploaded_images:
    # uploaded_file is None when only images were uploaded, so resolve the name
    # once instead of dereferencing it unconditionally.
    current_filename = uploaded_file.name if uploaded_file else None
    # Clear session state when a different syllabus file is uploaded
    if "uploaded_filename" in st.session_state and st.session_state.uploaded_filename != current_filename:
        st.session_state.clear()
        st.success("Previous data cleared. Processing new file...")
    st.session_state.uploaded_filename = current_filename
    # Text is rebuilt from scratch on each run; appending to the stored value
    # would add the image text again on every rerun.
    extracted_parts = []
    # Process syllabus file
    if uploaded_file:
        # Use the file extension: the MIME subtype is "plain" for .txt and a long
        # vendor string for .docx, so it never matches "txt" or "docx".
        file_type = uploaded_file.name.rsplit(".", 1)[-1].lower()
        if file_type in ["pdf", "docx", "txt"]:
            extracted_parts.append(process_content(uploaded_file, file_type, lang=ocr_lang))
        else:
            st.error("Unsupported file type. Please upload PDF, DOCX, or TXT files.")
    # Process images
    if uploaded_images:
        image_text = extract_text_from_images([Image.open(img) for img in uploaded_images], lang=ocr_lang)
        extracted_parts.append(image_text)
    if extracted_parts:
        st.session_state.syllabus_text = "\n".join(extracted_parts)
# Preview of Syllabus
if "syllabus_text" in st.session_state:
    st.markdown("### Preview of Extracted Syllabus Content")
    st.text_area("Extracted Syllabus Content", st.session_state.syllabus_text, height=300)
# Inputs for Question Generation
if "syllabus_text" in st.session_state:
    st.markdown("### Generate Questions")
    question_type = st.selectbox("Select Question Type", ["Multiple Choice", "Short Answer", "Essay"])
    num_questions = st.number_input("Number of Questions", min_value=1, max_value=50, value=10)
    difficulty_levels = {
        "Remember": st.slider("Remember (%)", 0, 100, 20),
        "Understand": st.slider("Understand (%)", 0, 100, 20),
        "Apply": st.slider("Apply (%)", 0, 100, 20),
        "Analyze": st.slider("Analyze (%)", 0, 100, 20),
        "Evaluate": st.slider("Evaluate (%)", 0, 100, 10),
        "Create": st.slider("Create (%)", 0, 100, 10),
    }
    if st.button("Generate Questions"):
        with st.spinner("Generating questions..."):
            questions = generate_questions(
                question_type,
                subject_name,
                instructor_name,
                class_name,
                institution_name,
                st.session_state.syllabus_text,
                num_questions,
                difficulty_levels,
            )
        # generate_questions returns "" on failure; do not report that as success.
        if questions:
            st.session_state.generated_questions = questions
            st.success("Questions generated successfully!")
        else:
            st.error("Question generation failed. Check the application logs and try again.")
# Display Generated Questions
if "generated_questions" in st.session_state:
    st.markdown("### Generated Questions")
    st.text_area("Questions", st.session_state.generated_questions, height=300)
    if st.button("Generate Answers"):
        with st.spinner("Generating answers..."):
            answers = generate_answers(
                st.session_state.generated_questions,
                st.session_state.syllabus_text,
            )
        # generate_answers returns "" on failure; do not report that as success.
        if answers:
            st.session_state.generated_answers = answers
            st.success("Answers generated successfully!")
        else:
            st.error("Answer generation failed. Check the application logs and try again.")
# Display Generated Answers
if "generated_answers" in st.session_state:
    st.markdown("### Generated Answers")
    st.text_area("Answers", st.session_state.generated_answers, height=300)
# Download Options
if "generated_questions" in st.session_state or "generated_answers" in st.session_state:
    st.markdown("### Download Options")
    download_choice = st.radio("Select Download Format", ["DOCX", "PDF", "TXT"])
    content_to_download = ""
    if "generated_questions" in st.session_state:
        content_to_download += "Generated Questions:\n" + st.session_state.generated_questions + "\n\n"
    if "generated_answers" in st.session_state:
        content_to_download += "Generated Answers:\n" + st.session_state.generated_answers
    if st.button("Download"):
        if download_choice == "DOCX":
            buffer = download_as_docx(content_to_download)
            st.download_button(
                label="Download as DOCX",
                data=buffer,
                file_name="exam_content.docx",
                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            )
        elif download_choice == "PDF":
            buffer = download_as_pdf(content_to_download)
            st.download_button(
                label="Download as PDF",
                data=buffer,
                file_name="exam_content.pdf",
                mime="application/pdf",
            )
        elif download_choice == "TXT":
            buffer = BytesIO(content_to_download.encode("utf-8"))
            st.download_button(
                label="Download as TXT",
                data=buffer,
                file_name="exam_content.txt",
                mime="text/plain",
            )