Spaces:

ahm14
/

Advanced_Paper

Sleeping

App Files Files Community

ahm14 committed on Jan 19

Commit

f724766

verified ·

1 Parent(s): 80ec719

Rename final_test3.py to app.py

Browse files

Files changed (1) hide show

final_test3.py → app.py +316 -246

final_test3.py → app.py RENAMED Viewed

@@ -1,246 +1,316 @@
-import streamlit as st
-from langchain_groq import ChatGroq
-from langchain_core.output_parsers import StrOutputParser
-from langchain_core.prompts import ChatPromptTemplate
-from dotenv import load_dotenv
-import pytesseract
-from PIL import Image
-import pdfplumber
-import docx
-from io import BytesIO
-import logging
-# Load environment variables (GROQ_API_KEY is expected in the environment or a .env file)
-load_dotenv()
-# Initialize logging
-logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
-# Initialize LLM. The API key is deliberately not hard-coded: ChatGroq reads
-# GROQ_API_KEY from the environment, which load_dotenv() fills from .env.
-# NOTE(review): the key that used to be committed on this line is public and must be revoked.
-llm = ChatGroq(temperature=0.5, model_name="llama3-8b-8192")
-# OCR Configuration for Pytesseract
-pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract"  # Adjust to your system's path
-# OCR helper: runs Tesseract over every image with a configurable language
-def extract_text_from_images(images, lang="eng"):
-    """Return the OCR text of all images, one stripped result per line.
-    Images whose OCR raises are logged and skipped; the combined text is
-    stripped before it is returned.
-    """
-    recognised = []
-    for image in images:
-        try:
-            page_text = pytesseract.image_to_string(image, lang=lang)
-            recognised.append(page_text.strip())
-        except Exception as e:
-            logging.error(f"Error in OCR: {e}")
-    return "\n".join(recognised).strip()
-# Function to extract text, tables, and embedded images from a PDF
-def extract_pdf_data(pdf_path):
-    """Extract page text, tables and embedded images from a PDF with pdfplumber.
-    Args:
-        pdf_path: Path or binary file-like object passed to ``pdfplumber.open``.
-    Returns:
-        dict with "text" (page texts joined by newlines), "tables" (one list
-        of rows per table) and "images" (PIL images rendered from each
-        embedded image's bounding box). Errors are logged; whatever was
-        collected before a failure is still returned.
-    """
-    data = {"text": "", "tables": [], "images": []}
-    page_texts = []
-    try:
-        with pdfplumber.open(pdf_path) as pdf:
-            for page in pdf.pages:
-                page_texts.append(page.extract_text() or "")
-                data["tables"].extend(page.extract_tables())
-                for image in page.images:
-                    # pdfplumber has no ``pdf.extract_image`` and its image
-                    # entries carry no "object_number"; they describe a
-                    # bounding box, so render that region of the page instead.
-                    try:
-                        bbox = (image["x0"], image["top"], image["x1"], image["bottom"])
-                        rendered = page.crop(bbox).to_image(resolution=150)
-                        data["images"].append(rendered.original)
-                    except Exception as e:
-                        # One unreadable image must not discard the rest of the document.
-                        logging.error(f"Error extracting PDF image: {e}")
-    except Exception as e:
-        logging.error(f"Error processing PDF: {e}")
-    # Newline-separated so the last word of one page is not fused with the next.
-    data["text"] = "\n".join(page_texts)
-    return data
-# Function to extract the non-empty paragraphs of a DOCX file
-def extract_docx_data(docx_file):
-    """Return the stripped, non-blank paragraphs of a DOCX joined by newlines.
-    Any failure while opening or reading the document is logged and yields "".
-    """
-    try:
-        paragraphs = []
-        for para in docx.Document(docx_file).paragraphs:
-            cleaned = para.text.strip()
-            if cleaned:
-                paragraphs.append(cleaned)
-        return "\n".join(paragraphs)
-    except Exception as e:
-        logging.error(f"Error extracting DOCX content: {e}")
-        return ""
-# Function to read an uploaded plain-text file as UTF-8
-def extract_text_file_data(text_file):
-    """Decode the file's bytes as UTF-8 and return them stripped; "" on any error."""
-    try:
-        raw_bytes = text_file.read()
-        decoded = raw_bytes.decode("utf-8")
-        return decoded.strip()
-    except Exception as e:
-        logging.error(f"Error extracting TXT content: {e}")
-        return ""
-# Function to turn an uploaded file (PDF, DOCX, TXT or image) into plain text
-def process_content(file_data, file_type, lang="eng"):
-    """Extract all text from an uploaded file, running OCR on its images once.
-    Args:
-        file_data: Binary file-like object holding the upload.
-        file_type: Lower-case type tag: "pdf", "docx", "txt", "png", "jpg" or "jpeg".
-        lang: Tesseract language code used for OCR.
-    Returns:
-        Extracted text followed by OCR text, newline-separated and stripped;
-        "" for an unsupported file_type.
-    """
-    text = ""
-    images = []
-    if file_type == "pdf":
-        pdf_data = extract_pdf_data(file_data)
-        # Format text and tables only here; the images are OCR'd exactly once
-        # below with the caller's language. Previously they were OCR'd twice
-        # (the first pass always in English), duplicating their text.
-        text = process_pdf_content({**pdf_data, "images": []})
-        images = pdf_data["images"]
-    elif file_type == "docx":
-        text = extract_docx_data(file_data)
-    elif file_type == "txt":
-        text = extract_text_file_data(file_data)
-    elif file_type in ["png", "jpg", "jpeg"]:
-        try:
-            images.append(Image.open(file_data))
-        except Exception as e:
-            # A corrupt upload should degrade to "no text", not crash the app.
-            logging.error(f"Error opening image: {e}")
-    ocr_text = extract_text_from_images(images, lang) if images else ""
-    return (text + "\n" + ocr_text).strip()
-# Function to merge a PDF's text, OCR output and tables into one string
-def process_pdf_content(pdf_data, lang="eng"):
-    """Combine extracted PDF text, OCR text from its images, and its tables.
-    Args:
-        pdf_data: dict with "text", "tables" and "images" keys, as built by
-            extract_pdf_data.
-        lang: Tesseract language code for the image OCR (default "eng").
-    Returns:
-        Page text, OCR text and tables (cells joined by " | ", one row per
-        line), each non-empty section on its own line, stripped.
-    """
-    images = pdf_data["images"]
-    ocr_text = extract_text_from_images(images, lang) if images else ""
-    table_lines = [
-        " | ".join(str(cell) if cell else "" for cell in row)
-        for table in pdf_data["tables"]
-        for row in table
-    ]
-    # Sections are newline-separated; the page text used to be glued directly
-    # onto the OCR text, fusing the last and first words.
-    sections = [pdf_data["text"], ocr_text, "\n".join(table_lines)]
-    return "\n".join(section for section in sections if section).strip()
-# Function to generate questions
-def generate_questions(question_type, subject_name, instructor, class_name, institution, syllabus_context, num_questions, difficulty_level):
-    """Ask the LLM for exam questions derived from the syllabus.
-    Args:
-        question_type: Kind of question to request (inserted into the prompt).
-        subject_name, instructor, class_name, institution: Header details for the prompt.
-        syllabus_context: Extracted syllabus text the questions must come from.
-        num_questions: How many questions to request.
-        difficulty_level: dict mapping Bloom's levels ("Remember" ... "Create")
-            to a weight; missing levels default to 0.
-    Returns:
-        The model's reply as a string, or "" if the call fails (the error is logged).
-    """
-    # Values are supplied as template variables instead of being f-string
-    # interpolated: from_template treats every "{...}" in the template text as
-    # a placeholder, so braces inside the syllabus used to break the prompt.
-    prompt_template = """
-    Based on the following syllabus content, generate {num_questions} {question_type} questions. Ensure the questions are directly derived from the provided syllabus content.
-    Subject: {subject_name}
-    Instructor: {instructor}
-    Class: {class_name}
-    Institution: {institution}
-    Syllabus Content: {syllabus_context}
-    Difficulty Levels:
-    - Remember: {remember}
-    - Understand: {understand}
-    - Apply: {apply}
-    - Analyze: {analyze}
-    - Evaluate: {evaluate}
-    - Create: {create}
-    Format questions as follows:
-    Q1. ________________
-    Q2. ________________
-    ...
-    """
-    chain = (ChatPromptTemplate.from_template(prompt_template) | llm | StrOutputParser())
-    try:
-        return chain.invoke({
-            "num_questions": num_questions,
-            "question_type": question_type,
-            "subject_name": subject_name,
-            "instructor": instructor,
-            "class_name": class_name,
-            "institution": institution,
-            "syllabus_context": syllabus_context,
-            "remember": difficulty_level.get("Remember", 0),
-            "understand": difficulty_level.get("Understand", 0),
-            "apply": difficulty_level.get("Apply", 0),
-            "analyze": difficulty_level.get("Analyze", 0),
-            "evaluate": difficulty_level.get("Evaluate", 0),
-            "create": difficulty_level.get("Create", 0),
-        })
-    except Exception as e:
-        logging.error(f"Error generating {question_type} questions: {e}")
-        return ""
-# Function to generate answers
-def generate_answers(questions, syllabus_context):
-    """Ask the LLM to answer the generated questions from the syllabus only.
-    Args:
-        questions: Question text, as returned by generate_questions.
-        syllabus_context: Extracted syllabus text the answers must be based on.
-    Returns:
-        The model's reply as a string, or "" if the call fails (the error is logged).
-    """
-    # Both inputs are passed as template variables: they are free text and may
-    # contain "{...}", which from_template would otherwise parse as placeholders.
-    prompt = """
-    Based on the provided syllabus content, generate detailed answers for the following questions. The answers must only be based on the syllabus content.
-    Syllabus Content: {syllabus_context}
-    Questions:
-    {questions}
-    Format answers as follows:
-    Answer 1: ________________
-    Answer 2: ________________
-    ...
-    """
-    chain = (ChatPromptTemplate.from_template(prompt) | llm | StrOutputParser())
-    try:
-        return chain.invoke({"syllabus_context": syllabus_context, "questions": questions})
-    except Exception as e:
-        logging.error(f"Error generating answers: {e}")
-        return ""
-# Streamlit app: sidebar collects the syllabus and settings, main area shows results
-st.title("Bloom's Taxonomy Based Exam Paper Developer")
-# Sidebar Clear Data Button
-if st.sidebar.button("Clear All Data"):
-    st.session_state.clear()
-    st.success("All data has been cleared. You can now upload a new syllabus.")
-# Syllabus Upload with Automatic Clearing
-uploaded_file = st.sidebar.file_uploader(
-    "Upload Syllabus (PDF, DOCX, TXT, Image)",
-    type=["pdf", "docx", "txt", "png", "jpg", "jpeg"]
-)
-# Sidebar Inputs for Subject Name, Instructor, Class, and Institution
-subject_name = st.sidebar.text_input("Enter Subject Name", "Subject Name")
-instructor_name = st.sidebar.text_input("Enter Instructor Name", "Instructor Name")
-class_name = st.sidebar.text_input("Enter Class Name", "Class Name")
-institution_name = st.sidebar.text_input("Enter Institution Name", "Institution Name")
-# Language Option for OCR
-ocr_lang = st.sidebar.selectbox("Select OCR Language", ["eng", "spa", "fra", "deu", "ita"])
-if uploaded_file:
-    # Clear session state when a new file is uploaded
-    if "uploaded_filename" in st.session_state and st.session_state.uploaded_filename != uploaded_file.name:
-        st.session_state.clear()
-        st.success("Previous data cleared. Processing new file...")
-    st.session_state.uploaded_filename = uploaded_file.name
-    # Use the filename extension, not the MIME subtype: the subtype is "plain"
-    # for .txt, "jpeg" for .jpg and a long vendor string for .docx, so those
-    # uploads were always rejected as unsupported.
-    file_type = uploaded_file.name.rsplit(".", 1)[-1].lower()
-    # Validate file type
-    if file_type not in ["pdf", "docx", "txt", "png", "jpg", "jpeg"]:
-        st.error("Unsupported file type. Please upload PDF, DOCX, TXT, or image files.")
-    else:
-        syllabus_text = process_content(uploaded_file, file_type, lang=ocr_lang)
-        st.session_state.syllabus_text = syllabus_text
-# Preview of Syllabus (first 1000 characters only)
-if "syllabus_text" in st.session_state:
-    st.subheader("Syllabus Preview:")
-    st.text_area("Extracted Content", st.session_state.syllabus_text[:1000], height=300)
-else:
-    st.warning("Please upload a syllabus to begin.")
-# Question Type Selection
-question_type = st.sidebar.radio("Select Question Type", ("MCQs", "Short Questions", "Long Questions", "Fill in the Blanks", "Case Studies", "Diagram-based"))
-difficulty_levels = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]
-# One 0-5 slider per Bloom's level; the values are forwarded to the prompt as-is
-difficulty = {level: st.sidebar.slider(level, 0, 5, 1) for level in difficulty_levels}
-num_questions = st.sidebar.number_input("Number of Questions", min_value=1, max_value=50, value=10)
-if st.sidebar.button("Generate Questions"):
-    if "syllabus_text" in st.session_state:
-        with st.spinner(f"Generating {question_type}..."):
-            syllabus_context = st.session_state.syllabus_text
-            st.session_state.generated_questions = generate_questions(question_type, subject_name, instructor_name, class_name, institution_name, syllabus_context, num_questions, difficulty)
-        st.text_area(f"Generated {question_type}", value=st.session_state.generated_questions, height=400)
-    else:
-        st.error("Please upload a syllabus before generating questions.")
-if st.sidebar.button("Generate Answers for Questions"):
-    if "generated_questions" in st.session_state:
-        with st.spinner("Generating answers..."):
-            syllabus_context = st.session_state.syllabus_text
-            st.session_state.generated_answers = generate_answers(st.session_state.generated_questions, syllabus_context)
-        st.text_area("Generated Answers", value=st.session_state.generated_answers, height=400)
-    else:
-        st.error("Generate questions first before generating answers.")
-# Download buttons appear once the corresponding text exists in session state
-if "generated_questions" in st.session_state:
-    st.sidebar.download_button(
-        label="Download Questions",
-        data=st.session_state.generated_questions,
-        file_name=f"{subject_name}_questions.txt",
-        mime="text/plain",
-    )
-if "generated_answers" in st.session_state:
-    st.sidebar.download_button(
-        label="Download Answers",
-        data=st.session_state.generated_answers,
-        file_name=f"{subject_name}_answers.txt",
-        mime="text/plain",
-    )
-st.markdown("""
----
-**Advanced Test Paper Generator** - powered by LangChain, Pinecone, and Streamlit.
-""")

+import streamlit as st
+from langchain_groq import ChatGroq
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from dotenv import load_dotenv
+import pytesseract
+from PIL import Image
+import pdfplumber
+import docx
+from io import BytesIO
+import logging
+from docx import Document
+from fpdf import FPDF
+# Load environment variables (GROQ_API_KEY is expected in the environment or a .env file)
+load_dotenv()
+# Initialize logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+# Initialize LLM. The API key is deliberately not hard-coded: ChatGroq reads
+# GROQ_API_KEY from the environment, which load_dotenv() fills from .env.
+# NOTE(review): the key that used to be committed on this line is public and must be revoked.
+llm = ChatGroq(temperature=0.5, model_name="llama3-8b-8192")
+# OCR Configuration for Pytesseract
+pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract"  # Adjust to your system's path
+# OCR helper: runs Tesseract over every image with a configurable language
+def extract_text_from_images(images, lang="eng"):
+    """Return the OCR text of all images, one stripped result per line.
+    Images whose OCR raises are logged and skipped; the combined text is
+    stripped before it is returned.
+    """
+    recognised = []
+    for image in images:
+        try:
+            page_text = pytesseract.image_to_string(image, lang=lang)
+            recognised.append(page_text.strip())
+        except Exception as e:
+            logging.error(f"Error in OCR: {e}")
+    return "\n".join(recognised).strip()
+# Function to extract text, tables, and embedded images from a PDF
+def extract_pdf_data(pdf_path):
+    """Extract page text, tables and embedded images from a PDF with pdfplumber.
+    Args:
+        pdf_path: Path or binary file-like object passed to ``pdfplumber.open``.
+    Returns:
+        dict with "text" (page texts joined by newlines), "tables" (one list
+        of rows per table) and "images" (PIL images rendered from each
+        embedded image's bounding box). Errors are logged; whatever was
+        collected before a failure is still returned.
+    """
+    data = {"text": "", "tables": [], "images": []}
+    page_texts = []
+    try:
+        with pdfplumber.open(pdf_path) as pdf:
+            for page in pdf.pages:
+                page_texts.append(page.extract_text() or "")
+                data["tables"].extend(page.extract_tables())
+                for image in page.images:
+                    # pdfplumber has no ``pdf.extract_image`` and its image
+                    # entries carry no "object_number"; they describe a
+                    # bounding box, so render that region of the page instead.
+                    try:
+                        bbox = (image["x0"], image["top"], image["x1"], image["bottom"])
+                        rendered = page.crop(bbox).to_image(resolution=150)
+                        data["images"].append(rendered.original)
+                    except Exception as e:
+                        # One unreadable image must not discard the rest of the document.
+                        logging.error(f"Error extracting PDF image: {e}")
+    except Exception as e:
+        logging.error(f"Error processing PDF: {e}")
+    # Newline-separated so the last word of one page is not fused with the next.
+    data["text"] = "\n".join(page_texts)
+    return data
+# Function to extract the non-empty paragraphs of a DOCX file
+def extract_docx_data(docx_file):
+    """Return the stripped, non-blank paragraphs of a DOCX joined by newlines.
+    Any failure while opening or reading the document is logged and yields "".
+    """
+    try:
+        paragraphs = []
+        for para in docx.Document(docx_file).paragraphs:
+            cleaned = para.text.strip()
+            if cleaned:
+                paragraphs.append(cleaned)
+        return "\n".join(paragraphs)
+    except Exception as e:
+        logging.error(f"Error extracting DOCX content: {e}")
+        return ""
+# Function to read an uploaded plain-text file as UTF-8
+def extract_text_file_data(text_file):
+    """Decode the file's bytes as UTF-8 and return them stripped; "" on any error."""
+    try:
+        raw_bytes = text_file.read()
+        decoded = raw_bytes.decode("utf-8")
+        return decoded.strip()
+    except Exception as e:
+        logging.error(f"Error extracting TXT content: {e}")
+        return ""
+# Function to turn an uploaded file (PDF, DOCX, TXT or image) into plain text
+def process_content(file_data, file_type, lang="eng"):
+    """Extract all text from an uploaded file, running OCR on its images once.
+    Args:
+        file_data: Binary file-like object holding the upload.
+        file_type: Lower-case type tag: "pdf", "docx", "txt", "png", "jpg" or "jpeg".
+        lang: Tesseract language code used for OCR.
+    Returns:
+        Extracted text followed by OCR text, newline-separated and stripped;
+        "" for an unsupported file_type.
+    """
+    text = ""
+    images = []
+    if file_type == "pdf":
+        pdf_data = extract_pdf_data(file_data)
+        # Format text and tables only here; the images are OCR'd exactly once
+        # below with the caller's language. Previously they were OCR'd twice
+        # (the first pass always in English), duplicating their text.
+        text = process_pdf_content({**pdf_data, "images": []})
+        images = pdf_data["images"]
+    elif file_type == "docx":
+        text = extract_docx_data(file_data)
+    elif file_type == "txt":
+        text = extract_text_file_data(file_data)
+    elif file_type in ["png", "jpg", "jpeg"]:
+        try:
+            images.append(Image.open(file_data))
+        except Exception as e:
+            # A corrupt upload should degrade to "no text", not crash the app.
+            logging.error(f"Error opening image: {e}")
+    ocr_text = extract_text_from_images(images, lang) if images else ""
+    return (text + "\n" + ocr_text).strip()
+# Function to merge a PDF's text, OCR output and tables into one string
+def process_pdf_content(pdf_data, lang="eng"):
+    """Combine extracted PDF text, OCR text from its images, and its tables.
+    Args:
+        pdf_data: dict with "text", "tables" and "images" keys, as built by
+            extract_pdf_data.
+        lang: Tesseract language code for the image OCR (default "eng").
+    Returns:
+        Page text, OCR text and tables (cells joined by " | ", one row per
+        line), each non-empty section on its own line, stripped.
+    """
+    images = pdf_data["images"]
+    ocr_text = extract_text_from_images(images, lang) if images else ""
+    table_lines = [
+        " | ".join(str(cell) if cell else "" for cell in row)
+        for table in pdf_data["tables"]
+        for row in table
+    ]
+    # Sections are newline-separated; the page text used to be glued directly
+    # onto the OCR text, fusing the last and first words.
+    sections = [pdf_data["text"], ocr_text, "\n".join(table_lines)]
+    return "\n".join(section for section in sections if section).strip()
+# Function to generate questions
+def generate_questions(question_type, subject_name, instructor, class_name, institution, syllabus_context, num_questions, difficulty_level):
+    """Ask the LLM for exam questions derived from the syllabus.
+    Args:
+        question_type: Kind of question to request (inserted into the prompt).
+        subject_name, instructor, class_name, institution: Header details for the prompt.
+        syllabus_context: Extracted syllabus text the questions must come from.
+        num_questions: How many questions to request.
+        difficulty_level: dict mapping Bloom's levels ("Remember" ... "Create")
+            to a weight; missing levels default to 0.
+    Returns:
+        The model's reply as a string, or "" if the call fails (the error is logged).
+    """
+    # Values are supplied as template variables instead of being f-string
+    # interpolated: from_template treats every "{...}" in the template text as
+    # a placeholder, so braces inside the syllabus used to break the prompt.
+    prompt_template = """
+    Based on the following syllabus content, generate {num_questions} {question_type} questions. Ensure the questions are directly derived from the provided syllabus content.
+    Subject: {subject_name}
+    Instructor: {instructor}
+    Class: {class_name}
+    Institution: {institution}
+    Syllabus Content: {syllabus_context}
+    Difficulty Levels:
+    - Remember: {remember}
+    - Understand: {understand}
+    - Apply: {apply}
+    - Analyze: {analyze}
+    - Evaluate: {evaluate}
+    - Create: {create}
+    Format questions as follows:
+    Q1. ________________
+    Q2. ________________
+    ...
+    """
+    chain = (ChatPromptTemplate.from_template(prompt_template) | llm | StrOutputParser())
+    try:
+        return chain.invoke({
+            "num_questions": num_questions,
+            "question_type": question_type,
+            "subject_name": subject_name,
+            "instructor": instructor,
+            "class_name": class_name,
+            "institution": institution,
+            "syllabus_context": syllabus_context,
+            "remember": difficulty_level.get("Remember", 0),
+            "understand": difficulty_level.get("Understand", 0),
+            "apply": difficulty_level.get("Apply", 0),
+            "analyze": difficulty_level.get("Analyze", 0),
+            "evaluate": difficulty_level.get("Evaluate", 0),
+            "create": difficulty_level.get("Create", 0),
+        })
+    except Exception as e:
+        logging.error(f"Error generating {question_type} questions: {e}")
+        return ""
+# Function to generate answers
+def generate_answers(questions, syllabus_context):
+    """Ask the LLM to answer the generated questions from the syllabus only.
+    Args:
+        questions: Question text, as returned by generate_questions.
+        syllabus_context: Extracted syllabus text the answers must be based on.
+    Returns:
+        The model's reply as a string, or "" if the call fails (the error is logged).
+    """
+    # Both inputs are passed as template variables: they are free text and may
+    # contain "{...}", which from_template would otherwise parse as placeholders.
+    prompt = """
+    Based on the provided syllabus content, generate detailed answers for the following questions. The answers must only be based on the syllabus content.
+    Syllabus Content: {syllabus_context}
+    Questions:
+    {questions}
+    Format answers as follows:
+    Answer 1: ________________
+    Answer 2: ________________
+    ...
+    """
+    chain = (ChatPromptTemplate.from_template(prompt) | llm | StrOutputParser())
+    try:
+        return chain.invoke({"syllabus_context": syllabus_context, "questions": questions})
+    except Exception as e:
+        logging.error(f"Error generating answers: {e}")
+        return ""
+# Function to build a DOCX download in memory
+def download_as_docx(content, file_name="output.docx"):
+    """Write each line of content as its own paragraph and return the DOCX bytes.
+    The result is a BytesIO rewound to position 0. file_name is accepted but
+    not used by this function.
+    """
+    document = Document()
+    for paragraph_text in content.split("\n"):
+        document.add_paragraph(paragraph_text)
+    stream = BytesIO()
+    document.save(stream)
+    stream.seek(0)
+    return stream
+# Function to build a PDF download in memory
+def download_as_pdf(content, file_name="output.pdf"):
+    """Render content line by line into a PDF and return it as a BytesIO.
+    Args:
+        content: Text to export; split on newlines, long lines are wrapped.
+        file_name: Accepted for symmetry with download_as_docx; not used here.
+    Returns:
+        BytesIO positioned at 0 holding the PDF bytes.
+    """
+    pdf = FPDF()
+    pdf.add_page()
+    pdf.set_font("Arial", size=12)
+    for line in content.split("\n"):
+        # Core fonts such as Arial only cover Latin-1; replace anything else
+        # (e.g. curly quotes from the LLM) instead of crashing the export.
+        safe_line = line.encode("latin-1", "replace").decode("latin-1")
+        if not safe_line.strip():
+            pdf.ln(10)
+            continue
+        # Reset to the left margin, then wrap: cell() let long lines run off the page.
+        pdf.set_x(pdf.l_margin)
+        pdf.multi_cell(0, 10, safe_line)
+    # output(dest="S") returns the document itself: a Latin-1 str in classic
+    # PyFPDF and a bytearray in fpdf2. Passing a BytesIO as the file name only
+    # works in fpdf2, so normalise both return types here.
+    raw = pdf.output(dest="S")
+    pdf_bytes = raw.encode("latin-1") if isinstance(raw, str) else bytes(raw)
+    return BytesIO(pdf_bytes)
+# Streamlit app with enhanced UI and multi-image upload support
+st.title("Bloom's Taxonomy Based Exam Paper Developer")
+st.markdown("""
+### A powerful tool to generate exam questions and answers using AI, based on syllabus content and Bloom's Taxonomy principles.
+""")
+# Sidebar Clear Data Button
+if st.sidebar.button("Clear All Data"):
+    st.session_state.clear()
+    st.success("All data has been cleared. You can now upload a new syllabus.")
+# Upload Syllabus and Multiple Images
+uploaded_file = st.sidebar.file_uploader(
+    "Upload Syllabus (PDF, DOCX, TXT)",
+    type=["pdf", "docx", "txt"]
+)
+uploaded_images = st.sidebar.file_uploader(
+    "Upload Supplementary Images (PNG, JPG, JPEG)",
+    type=["png", "jpg", "jpeg"],
+    accept_multiple_files=True
+)
+# Sidebar Inputs for Subject Name, Instructor, Class, and Institution
+subject_name = st.sidebar.text_input("Enter Subject Name", "Subject Name")
+instructor_name = st.sidebar.text_input("Enter Instructor Name", "Instructor Name")
+class_name = st.sidebar.text_input("Enter Class Name", "Class Name")
+institution_name = st.sidebar.text_input("Enter Institution Name", "Institution Name")
+# Language Option for OCR
+ocr_lang = st.sidebar.selectbox("Select OCR Language", ["eng", "spa", "fra", "deu", "ita"])
+# Process uploaded file and images. Streamlit reruns this script on every
+# widget interaction, so extraction is keyed on an upload signature and only
+# redone when the uploads (or the OCR language) actually change. This also
+# stops image text from being appended again on each rerun.
+if uploaded_file or uploaded_images:
+    upload_signature = (
+        # uploaded_file is None when only images were uploaded, so guard it.
+        (uploaded_file.name, uploaded_file.size) if uploaded_file else None,
+        tuple((img.name, img.size) for img in uploaded_images or []),
+        ocr_lang,
+    )
+    if st.session_state.get("upload_signature") != upload_signature:
+        if "upload_signature" in st.session_state:
+            # Drop everything derived from the previous uploads.
+            for stale_key in ("syllabus_text", "generated_questions", "generated_answers"):
+                st.session_state.pop(stale_key, None)
+            st.success("Previous data cleared. Processing new file...")
+        st.session_state.upload_signature = upload_signature
+        st.session_state.uploaded_filename = uploaded_file.name if uploaded_file else None
+        extracted_parts = []
+        # Process syllabus file
+        if uploaded_file:
+            # Use the filename extension, not the MIME subtype: the subtype is
+            # "plain" for .txt and a long vendor string for .docx, so those
+            # uploads were always rejected as unsupported.
+            file_type = uploaded_file.name.rsplit(".", 1)[-1].lower()
+            if file_type in ["pdf", "docx", "txt"]:
+                extracted_parts.append(process_content(uploaded_file, file_type, lang=ocr_lang))
+            else:
+                st.error("Unsupported file type. Please upload PDF, DOCX, or TXT files.")
+        # Process images
+        if uploaded_images:
+            extracted_parts.append(
+                extract_text_from_images([Image.open(img) for img in uploaded_images], lang=ocr_lang)
+            )
+        if extracted_parts:
+            st.session_state.syllabus_text = "\n".join(extracted_parts).strip()
+# Preview of Syllabus
+if "syllabus_text" in st.session_state:
+    st.markdown("### Preview of Extracted Syllabus Content")
+    st.text_area("Extracted Syllabus Content", st.session_state.syllabus_text, height=300)
+# Inputs for Question Generation
+if "syllabus_text" in st.session_state:
+    st.markdown("### Generate Questions")
+    question_type = st.selectbox("Select Question Type", ["Multiple Choice", "Short Answer", "Essay"])
+    num_questions = st.number_input("Number of Questions", min_value=1, max_value=50, value=10)
+    difficulty_levels = {
+        "Remember": st.slider("Remember (%)", 0, 100, 20),
+        "Understand": st.slider("Understand (%)", 0, 100, 20),
+        "Apply": st.slider("Apply (%)", 0, 100, 20),
+        "Analyze": st.slider("Analyze (%)", 0, 100, 20),
+        "Evaluate": st.slider("Evaluate (%)", 0, 100, 10),
+        "Create": st.slider("Create (%)", 0, 100, 10),
+    }
+    if st.button("Generate Questions"):
+        with st.spinner("Generating questions..."):
+            questions = generate_questions(
+                question_type,
+                subject_name,
+                instructor_name,
+                class_name,
+                institution_name,
+                st.session_state.syllabus_text,
+                num_questions,
+                difficulty_levels,
+            )
+        # generate_questions returns "" when the LLM call fails; do not report that as success.
+        if questions:
+            st.session_state.generated_questions = questions
+            st.success("Questions generated successfully!")
+        else:
+            st.error("Question generation failed. Check the logs and try again.")
+# Display Generated Questions
+if "generated_questions" in st.session_state:
+    st.markdown("### Generated Questions")
+    st.text_area("Questions", st.session_state.generated_questions, height=300)
+    if st.button("Generate Answers"):
+        with st.spinner("Generating answers..."):
+            answers = generate_answers(
+                st.session_state.generated_questions,
+                st.session_state.get("syllabus_text", ""),
+            )
+        # generate_answers returns "" when the LLM call fails.
+        if answers:
+            st.session_state.generated_answers = answers
+            st.success("Answers generated successfully!")
+        else:
+            st.error("Answer generation failed. Check the logs and try again.")
+# Display Generated Answers
+if "generated_answers" in st.session_state:
+    st.markdown("### Generated Answers")
+    st.text_area("Answers", st.session_state.generated_answers, height=300)
+# Download Options
+if "generated_questions" in st.session_state or "generated_answers" in st.session_state:
+    st.markdown("### Download Options")
+    download_choice = st.radio("Select Download Format", ["DOCX", "PDF", "TXT"])
+    content_to_download = ""
+    if "generated_questions" in st.session_state:
+        content_to_download += "Generated Questions:\n" + st.session_state.generated_questions + "\n\n"
+    if "generated_answers" in st.session_state:
+        content_to_download += "Generated Answers:\n" + st.session_state.generated_answers
+    # Render the download button directly: nesting it under a second
+    # st.button forced two clicks and the button vanished on the next rerun.
+    if download_choice == "DOCX":
+        st.download_button(
+            label="Download as DOCX",
+            data=download_as_docx(content_to_download),
+            file_name="exam_content.docx",
+            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        )
+    elif download_choice == "PDF":
+        st.download_button(
+            label="Download as PDF",
+            data=download_as_pdf(content_to_download),
+            file_name="exam_content.pdf",
+            mime="application/pdf",
+        )
+    elif download_choice == "TXT":
+        st.download_button(
+            label="Download as TXT",
+            data=BytesIO(content_to_download.encode("utf-8")),
+            file_name="exam_content.txt",
+            mime="text/plain",
+        )