Dobin Yim committed
Commit 9046e9c · Parent: 74aa61e

Final App V0

Files changed (7)
  1. .env +2 -0
  2. Dockerfile +11 -0
  3. Excel Review.pdf +0 -0
  4. chainlit.md +14 -0
  5. final.py +333 -0
  6. requirements.txt +24 -0
  7. uploads/fall23.zip +3 -0
.env ADDED
@@ -0,0 +1,2 @@
+ OPENAI_API_KEY=sk-proj-Ag0GaxKAAre2MFgXdFUWT3BlbkFJEAR66bo4a45j4Sa5DwML
+ PYTHONPATH=.
Dockerfile ADDED
@@ -0,0 +1,11 @@
+ FROM python:3.11
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+ WORKDIR $HOME/app
+ COPY --chown=user . $HOME/app
+ COPY ./requirements.txt $HOME/app/requirements.txt
+ RUN pip install -r requirements.txt
+ COPY . .
+ CMD ["chainlit", "run", "final.py", "--port", "7860"]
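
Build-and-run sketch for this image (the tag name is illustrative): `docker build -t final-app .`, then `docker run -p 7860:7860 final-app`; the CMD then serves the Chainlit app on port 7860.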
Excel Review.pdf ADDED
Binary file (46.6 kB).
chainlit.md ADDED
@@ -0,0 +1,14 @@
+ # Final Project 🚀🤖
+
+ Hi there, Developer! 👋 We're excited to have you on board. Chainlit is a powerful tool designed to help you prototype, debug, and share applications built on top of LLMs.
+
+ ## Useful Links 🔗
+
+ - **Documentation:** Get started with our comprehensive [Chainlit Documentation](https://docs.chainlit.io) 📚
+ - **Discord Community:** Join our friendly [Chainlit Discord](https://discord.gg/k73SQ3FyUh) to ask questions, share your projects, and connect with other developers! 💬
+
+ We can't wait to see what you create with Chainlit! Happy coding! 💻😊
+
+ ## Welcome screen
+
+ To modify the welcome screen, edit the `chainlit.md` file at the root of your project. If you do not want a welcome screen, just leave this file empty.
final.py ADDED
@@ -0,0 +1,333 @@
+ # -*- coding: utf-8 -*-
+ """AIE3final.py
+
+ Automated Grading System for AIE3 Final Project
+ """
+
+ # Import necessary libraries
+ import logging
+ import sys
+ import os
+ import re
+ import zipfile
+ import json
+ import asyncio
+ from typing import List, Tuple
+ from dotenv import load_dotenv
+ from langchain_community.document_loaders import PyMuPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.schema import Document
+ from langchain_core.messages import AIMessage
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
+ from docx import Document as DocxDocument
+ import openai
+ import numpy as np
+ from sklearn.metrics.pairwise import cosine_similarity
+ import chainlit as cl
+
+ # Load environment variables
+ load_dotenv()
+ OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
+ openai.api_key = OPENAI_API_KEY
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Define constants
+ REFERENCE_DOCUMENT_PATH = './Excel Review.pdf'
+ UPLOAD_FOLDER = './uploads'
+
+ # Ensure the upload folder exists
+ os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+
+ def unzip_file(file_path: str, output_dir: str):
+     """Extract a zip archive, skipping macOS metadata entries."""
+     with zipfile.ZipFile(file_path, 'r') as zip_ref:
+         for member in zip_ref.namelist():
+             if not member.startswith('__MACOSX/'):
+                 zip_ref.extract(member, output_dir)
+
+ def read_pdf(file_path: str) -> List[Document]:
+     loader = PyMuPDFLoader(file_path)
+     return loader.load()
+
+ def read_docx(file_path: str) -> Document:
+     doc = DocxDocument(file_path)
+     text = "\n".join([p.text for p in doc.paragraphs])
+     return Document(page_content=text, metadata={"source": file_path})
+
+ def read_files_from_directory(directory: str) -> List[Document]:
+     documents = []
+     for root, _, files in os.walk(directory):
+         for file in files:
+             file_path = os.path.join(root, file)
+             if os.path.basename(file_path).startswith('~$'):
+                 continue  # Skip Office temporary files
+             if file_path.endswith('.docx'):
+                 documents.append(read_docx(file_path))
+             elif file_path.endswith('.pdf'):
+                 documents.extend(read_pdf(file_path))
+     return documents
+
+ def extract_json(message: AIMessage) -> List[dict]:
+     """Pull ```json fenced blocks out of a model reply and parse each one."""
+     text = message.content
+     pattern = r"```json(.*?)```"
+     matches = re.findall(pattern, text, re.DOTALL)
+     try:
+         return [json.loads(match.strip()) for match in matches]
+     except Exception:
+         raise ValueError(f"Failed to parse: {message}")
+
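+ # A minimal sketch of the reply shape extract_json expects (illustrative
+ # values): the model wraps its JSON in a ```json fence.
+ #   reply = AIMessage(content='```json\n{"Question #1": "42"}\n```')
+ #   extract_json(reply)  # -> [{"Question #1": "42"}]
+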
+ qa_chat_model = ChatOpenAI(
+     model="gpt-4o-mini",
+     temperature=0
+ )
+
+ # Note: the prompts below are f-strings, so their doubled braces render to
+ # single braces ({source}), which ChatPromptTemplate then treats as variables.
+ ref_prompt = f"""
+ You are given a reference document. The document contains a mix of instructions, guides, questions, and answers.
+ Your task is to go through the reference document and extract questions and answers from the document step-by-step.
+ Use the keyword 'Question #' to identify the start of each question.
+ Retain the following words until 'Answer:' as the question.
+ Use the keyword 'Answer:' to identify the start of each answer.
+ Retain the following words until the next 'Question:' as the answer, until the end of the document.
+ Remove any whitespace such as carriage returns.
+ Return the question-answer pairs as key-value pairs of Dict type.
+ ---
+
+ Reference Document Content:
+ {{source}}
+
+ Please extract the question-answer pairs and return them as JSON.
+ """
+
+ ref_prompt_template = ChatPromptTemplate.from_template(ref_prompt)
+ ref_generation_chain = ref_prompt_template | qa_chat_model
+
+ student_prompt = f"""
+ You are given a student assignment document. The document may contain a mix of instructions, guides, questions, and answers.
+ Your task is to go through the student document and extract answers to questions from the document step-by-step.
+ Use the reference document as a guide.
+ Use the keyword 'Question #' to identify each question.
+ Then, for each question, search the student document for the answer.
+ If you do not see any answer in the student document, return 'No answer found'.
+ Do not make up any answer.
+ Remove any whitespace such as carriage returns.
+ Return the original questions and the student answers as key-value pairs of Dict type.
+ ---
+
+ Reference Content:
+ {{source}}
+
+ Student Content:
+ {{student}}
+
+ Please extract the question-answer pairs and return them as JSON.
+ """
+
+ student_prompt_template = ChatPromptTemplate.from_template(student_prompt)
+ student_response_chain = student_prompt_template | qa_chat_model
+
+ def split_documents(documents: List[Document]) -> Tuple[List[Document], int]:
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=500,
+         chunk_overlap=100,
+         length_function=len,
+         is_separator_regex=False
+     )
+     split_docs = text_splitter.split_documents(documents)
+     # Character count used as a rough proxy for token count
+     total_tokens = sum(len(doc.page_content) for doc in split_docs)
+     return split_docs, total_tokens
+
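+ # len() counts characters, so the totals above are only a rough token proxy.
+ # A closer estimate could use tiktoken (not listed in requirements.txt); a
+ # minimal sketch, assuming the cl100k_base encoding:
+ #   import tiktoken
+ #   enc = tiktoken.get_encoding("cl100k_base")
+ #   total_tokens = sum(len(enc.encode(doc.page_content)) for doc in split_docs)
+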
+ def generate_embeddings(docs: List[Document]) -> Tuple[List[List[float]], int]:
+     embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")
+     embeddings = embeddings_model.embed_documents([doc.page_content for doc in docs])
+     # Character count used as a rough proxy for token count
+     total_tokens = sum(len(doc.page_content) for doc in docs)
+     return embeddings, total_tokens
+
+ def prepare_files():
+     unzip_file('./uploads/fall23_small.zip', './temp')
+     documents = read_files_from_directory('./temp/fall23_small')
+     reference_document = read_pdf(REFERENCE_DOCUMENT_PATH)
+     return documents, reference_document
+
+ def process_student(documents, reference):
+     # Only the first student document is processed; the reference dict's keys
+     # (the 'Question #' / 'Answer #' labels) are passed as the guide.
+     test_doc = documents[0]
+     student_result = student_response_chain.invoke({"source": reference.keys(), "student": test_doc})
+     student_gen_tokens = student_result.usage_metadata["total_tokens"]
+     student_result = dict(extract_json(student_result)[0])
+     return student_result, student_gen_tokens
+
+ def process_reference(reference_document):
+     result = ref_generation_chain.invoke({"source": reference_document})
+     ref_gen_tokens = result.usage_metadata["total_tokens"]
+     reference = dict(extract_json(result)[0])
+
+     # Pair each 'Question #N' key with its matching 'Answer #N' value
+     answers = {}
+     for key in reference:
+         if key.startswith('Question'):
+             question_number = key.split('#')[1]
+             answer_key = f'Answer #{question_number}'
+             answers[key] = reference[answer_key]
+
+     return reference, answers, ref_gen_tokens
+
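+ # Shape sketch of the dicts this pairing assumes (hypothetical values): the
+ # model returns alternating 'Question #N' / 'Answer #N' keys.
+ #   reference = {"Question #1": "...", "Answer #1": "...", "Question #2": "..."}
+ #   answers   = {"Question #1": "..."}  # question mapped to instructor answer
+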
+ def split_docs(answers, student_result):
+     split_reference_docs, ref_tokens = {}, 0
+     split_student_docs, student_tokens = {}, 0
+     for key, value in answers.items():
+         chunks, tokens = split_documents([Document(page_content=value)])
+         split_reference_docs[key] = chunks
+         ref_tokens += tokens
+
+     for key, value in student_result.items():
+         chunks, tokens = split_documents([Document(page_content=value)])
+         split_student_docs[key] = chunks
+         student_tokens += tokens
+
+     reference_embeddings = {key: generate_embeddings(value)[0] for key, value in split_reference_docs.items()}
+     student_embeddings = {key: generate_embeddings(value)[0] for key, value in split_student_docs.items()}
+
+     return reference_embeddings, student_embeddings, ref_tokens, student_tokens
+
+ def compute_cosine_similarity(reference_embeddings: dict, student_embeddings: dict) -> float:
+     similarity_results = {}
+     for key in reference_embeddings.keys():
+         if key not in student_embeddings:
+             similarity_results[key] = 0
+             continue
+         # Flatten each answer's chunk embeddings into one long vector
+         reference_vector = np.array(reference_embeddings[key]).reshape(1, -1)
+         student_vector = np.array(student_embeddings[key]).reshape(1, -1)
+         # If the two answers were split into different numbers of chunks,
+         # compare only the overlapping prefix of the flattened vectors
+         if reference_vector.shape[1] != student_vector.shape[1]:
+             min_dim = min(reference_vector.shape[1], student_vector.shape[1])
+             reference_vector = reference_vector[:, :min_dim]
+             student_vector = student_vector[:, :min_dim]
+         similarity = cosine_similarity(reference_vector, student_vector)[0][0]
+         similarity_results[key] = similarity
+
+     total_similarity = sum(similarity_results.values())
+     num_questions = len(similarity_results)
+     average_similarity = total_similarity / num_questions if num_questions else 0
+
+     return average_similarity
+
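+ # A worked example of the similarity math (illustrative vectors, not real
+ # embeddings): parallel vectors score 1.0, orthogonal vectors 0.0.
+ #   a = np.array([[1.0, 0.0]])
+ #   b = np.array([[0.0, 1.0]])
+ #   cosine_similarity(a, a)[0][0]  # -> 1.0
+ #   cosine_similarity(a, b)[0][0]  # -> 0.0
+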
+ def llm_similarity(answers, student_result):
+     score_prompt = f"""
+ You are given two dictionaries representing instructor solutions and student answers.
+ Your task is to go through each question and grade the correctness of the student answer.
+ Use the keyword 'Question #' to identify each question.
+ Then, for each question, compare the student answer against the instructor answer.
+ If the instructor answer has numerical values, check to make sure the student answer has the same number,
+ whether it is expressed in numbers or text.
+ If you do not see any answer in the student answer, assign score 0 for that answer.
+ If the student answer matches the instructor answer, assign a full score of 1.
+ If the student answer is only partially correct, assign a partial score of 0.5.
+ Otherwise, assign a score of 0.
+ Return the original question and the student score pairs as key-value pairs of Dict type.
+ ---
+
+ Reference Content:
+ {{source}}
+
+ Student Content:
+ {{student}}
+
+ Please extract the question-score pairs and return them as JSON.
+ """
+
+     score_prompt_template = ChatPromptTemplate.from_template(score_prompt)
+     student_score_chain = score_prompt_template | qa_chat_model
+
+     student_score = student_score_chain.invoke({"source": answers, "student": student_result})
+     llm_score_tokens = student_score.usage_metadata["total_tokens"]
+     student_score = dict(extract_json(student_score)[0])
+
+     total_score = sum(student_score.values())
+     num_questions = len(student_score)
+     average_score = total_score / num_questions if num_questions else 0
+
+     return average_score, llm_score_tokens
+
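+ # A worked example of the rubric's averaging (hypothetical scores): three
+ # questions scored 1, 0.5, and 0 average to (1 + 0.5 + 0) / 3 = 0.5.
+ #   student_score = {"Question #1": 1, "Question #2": 0.5, "Question #3": 0}
+ #   sum(student_score.values()) / len(student_score)  # -> 0.5
+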
+ def process_data() -> Tuple[float, float, int, int, int]:
+     documents, reference_document = prepare_files()
+     reference, answers, ref_gen_tokens = process_reference(reference_document)
+     student_result, student_gen_tokens = process_student(documents, reference)
+     reference_embeddings, student_embeddings, ref_tokens, student_tokens = split_docs(answers, student_result)
+     student_total_tokens = student_gen_tokens + student_tokens
+     ref_total_tokens = ref_gen_tokens + ref_tokens
+
+     average_similarity = compute_cosine_similarity(reference_embeddings, student_embeddings)
+     average_score, llm_score_tokens = llm_similarity(answers, student_result)
+     llm_total_tokens = ref_gen_tokens + student_gen_tokens + llm_score_tokens
+
+     return average_similarity, average_score, ref_total_tokens, student_total_tokens, llm_total_tokens
+
+ async def process_grading():
+     average_similarity, average_score, ref_total_tokens, student_total_tokens, llm_total_tokens = process_data()
+
+     await cl.Message(content=f"Total tokens used for reference documents: {ref_total_tokens}").send()
+     await cl.Message(content=f"Total tokens used for student documents: {student_total_tokens}").send()
+     await cl.Message(content=f"Total tokens used by LLM: {llm_total_tokens}").send()
+     await cl.Message(content=f"Average cosine similarity: {average_similarity}").send()
+     await cl.Message(content=f"Average LLM score: {average_score}").send()
+
+ @cl.on_chat_start
+ async def start_chat():
+     await cl.Message(content="Do you want to proceed with the grading? (yes/no)").send()
+
+ # Define a global flag to track the processing state
+ user_wants_to_continue = False
+
+ @cl.on_message
+ async def on_message(message: cl.Message):
+     global user_wants_to_continue
+
+     if message.content.lower() == 'yes' and not user_wants_to_continue:
+         # Start processing
+         processing_message = cl.Message(content="Processing files...")
+         await processing_message.send()  # Send the message immediately
+         await asyncio.sleep(0.5)  # Short delay to ensure the message is displayed
+         await process_grading()
+
+         # Ask the user if they want to continue after processing is done
+         user_wants_to_continue = True
+         await cl.Message(content="Do you want to continue? (yes/no)").send()
+
+     elif user_wants_to_continue:
+         if message.content.lower() == 'yes':
+             user_wants_to_continue = False  # Reset the flag
+             await cl.Message(content="Restarting the app...").send()
+             await asyncio.sleep(1)  # Give time for the message to be sent
+             python = sys.executable
+             os.execl(python, python, *sys.argv)  # Restart the app
+
+         elif message.content.lower() == 'no':
+             user_wants_to_continue = False  # Reset the flag
+             await cl.Message(content="Okay, thank you for using the grading app. Restarting...").send()
+             await asyncio.sleep(1)  # Give time for the message to be sent
+             python = sys.executable
+             os.execl(python, python, *sys.argv)  # Restart the app
+
+         else:
+             await cl.Message(content="Invalid response. Please type 'yes' or 'no'.").send()
+
+     elif message.content.lower() == 'no':
+         await cl.Message(content="Okay, thank you for using the grading app. Restarting...").send()
+         await asyncio.sleep(1)  # Give time for the message to be sent
+         python = sys.executable
+         os.execl(python, python, *sys.argv)  # Restart the app
+
+     else:
+         await cl.Message(content="Please type 'yes' to start processing or 'no' to exit.").send()
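
The app can also be run outside Docker with the same entry point as the Dockerfile's CMD (assuming dependencies are installed via `pip install -r requirements.txt`): `chainlit run final.py --port 7860`.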
requirements.txt ADDED
@@ -0,0 +1,24 @@
+ chainlit
+ langchain
+ langchain-core
+ langgraph
+ langchain-community
+ langchain_huggingface
+ langchain_openai
+ langchain-text-splitters
+ peft
+ bitsandbytes
+ accelerate
+ qdrant-client
+ python-dotenv
+ pymupdf
+ huggingface_hub
+ pandas
+ sentence-transformers
+ python-docx
+ docx2pdf
+ transformers
+ torch
+ scikit-learn  # needed by final.py's sklearn import
uploads/fall23.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f83797e21c0153a03ef19ff14315ffff3de8730560610bd7b96e2eb066518930
+ size 2092284