rohan13 committed
Commit 10da927 · Parent(s): 7728560

Changes for an automated grader integrated with a chatbot

Files changed (5)
  1. app_new.py +232 -0
  2. grader.py +257 -0
  3. ingest.py +174 -0
  4. requirements.in +11 -0
  5. utils.py +253 -0
app_new.py ADDED
@@ -0,0 +1,232 @@
+ import asyncio
+ import glob
+ import os
+ import time
+
+ import gradio as gr
+ from dotenv import load_dotenv
+ from langchain.chat_models import ChatOpenAI
+ from langchain.embeddings import OpenAIEmbeddings
+
+ from grader import Grader
+ from ingest import ingest_canvas_discussions
+ from utils import GraderQA
+
+ load_dotenv()
+
+ pickle_file = "vector_stores/canvas-discussions.pkl"
+ index_file = "vector_stores/canvas-discussions.index"
+
+ grading_model = 'gpt-4'
+ qa_model = 'gpt-3.5-turbo-16k'
+
+ llm = ChatOpenAI(model_name=qa_model, temperature=0, verbose=True)
+ embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
+
+ grader = None
+ grader_qa = None
+
+
+ def add_text(history, text):
+     print("Question asked: " + text)
+     response = run_model(text)
+     history = history + [(text, response)]
+     print(history)
+     return history, ""
+
+
+ def run_model(text):
+     global grader, grader_qa
+     start_time = time.time()
+     print("start time:" + str(start_time))
+     if not grader_qa and not grader:
+         # Rebuild the grader and QA chain from cached files if they all exist
+         if (os.path.isfile(pickle_file) and os.path.getsize(pickle_file) > 0
+                 and os.path.isfile(index_file)
+                 and os.path.isfile('docs/discussion_entries.json')
+                 and os.path.isfile('docs/rubric_data.json')):
+             grader = Grader(qa_model)
+             grader_qa = GraderQA(grader, embeddings)
+     elif not grader_qa:
+         grader.llm.model_name = qa_model
+         grader_qa = GraderQA(grader, embeddings)
+     response = grader_qa.chain(text)
+     sources = []
+     for document in response['source_documents']:
+         sources.append(str(document.metadata))
+     print(sources)
+
+     source = ','.join(set(sources))
+     response = response['answer'] + '\nSources: ' + source
+     end_time = time.time()
+     # If the response contains the string `SOURCES:`, add a \n before it
+     # if "SOURCES:" in response:
+     #     response = response.replace("SOURCES:", "\nSOURCES:")
+     response = response + "\n\n" + "Time taken: " + str(end_time - start_time)
+     print(response)
+     print("Time taken: " + str(end_time - start_time))
+     return response
+
+
+ def set_model(history):
+     history = get_first_message(history)
+     return history
+
+
+ def ingest(url, canvas_api_key, openai_api_key, history):
+     global grader
+     text = f"Download data from {url} and ingest it to grade discussions"
+     ingest_canvas_discussions(url, canvas_api_key)
+     grader = Grader(grading_model)
+     response = "Ingested canvas data successfully"
+     history = history + [(text, response)]
+     return get_grading_status(history)
+
+
+ def start_grading(url, canvas_api_key, openai_api_key, history):
+     global grader, grader_qa
+     text = f"Start grading discussions from {url}"
+     if not url or not canvas_api_key or not openai_api_key:
+         response = "Please enter all the fields to initiate grading"
+     elif grader:
+         # Create a new event loop for this worker thread
+         loop = asyncio.new_event_loop()
+         asyncio.set_event_loop(loop)
+         try:
+             # Use the event loop to run the async grading chain
+             loop.run_until_complete(grader.run_chain())
+             grader_qa = GraderQA(grader, embeddings)
+             response = "Grading done"
+         finally:
+             # Close the loop after use
+             loop.close()
+     else:
+         response = "Please ingest data before grading"
+     history = history + [(text, response)]
+     return history
+
+
+ def start_downloading():
+     # grader.download()
+     return "Downloaded"
+
+
+ def get_first_message(history):
+     global grader_qa
+     history = [(None,
+                 'Get feedback on your Canvas discussions. Add your discussion URL and get your discussions graded instantly.')]
+     history = get_grading_status(history)
+     return history
+
+
+ def get_grading_status(history):
+     global grader, grader_qa
+     # Check if grading is complete
+     if os.path.isdir('output') and len(glob.glob("docs/*.json")) > 0 and len(glob.glob("docs/*.html")) > 0:
+         if not grader:
+             grader = Grader(qa_model)
+             grader_qa = GraderQA(grader, embeddings)
+         elif not grader_qa:
+             grader_qa = GraderQA(grader, embeddings)
+         history = history + [(None, 'Grading is already complete. You can now ask questions')]
+         enable_fields(False, False, False, False, False, True, True, True)
+     # Check if data is ingested
+     elif len(glob.glob("docs/*.json")) > 0 and len(glob.glob("docs/*.html")) > 0:
+         if not grader_qa:
+             grader = Grader(qa_model)
+         history = history + [(None, 'Canvas data is already ingested. You can grade discussions now')]
+         enable_fields(False, False, False, False, True, True, False, False)
+     else:
+         history = history + [(None, 'Please ingest data and start grading')]
+         enable_fields(True, True, True, True, True, True, False, False)
+     return history
+
+
+ # Handle enabling/disabling of fields
+ def enable_fields(url_status, canvas_api_key_status, openai_api_key_status, submit_status, grade_status,
+                   download_status, chatbot_txt_status, chatbot_btn_status):
+     url.interactive = url_status
+     canvas_api_key.interactive = canvas_api_key_status
+     openai_api_key.interactive = openai_api_key_status
+     submit.interactive = submit_status
+     grade.interactive = grade_status
+     download.interactive = download_status
+     txt.interactive = chatbot_txt_status
+     ask.interactive = chatbot_btn_status
+     if not chatbot_txt_status:
+         txt.placeholder = "Please grade discussions first"
+     else:
+         txt.placeholder = "Ask a question"
+     if not url_status:
+         url.placeholder = "Data already ingested"
+     if not canvas_api_key_status:
+         canvas_api_key.placeholder = "Data already ingested"
+     if not openai_api_key_status:
+         openai_api_key.placeholder = "Data already ingested"
+
+
+ def bot(history):
+     return history
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown("<h2><center>Canvas Discussion Grading With Feedback</center></h2>")
+
+     with gr.Row():
+         url = gr.Textbox(
+             label="Canvas Discussion URL",
+             placeholder="Enter your Canvas Discussion URL"
+         )
+
+         canvas_api_key = gr.Textbox(
+             label="Canvas API Key",
+             placeholder="Enter your Canvas API Key", type="password"
+         )
+
+         openai_api_key = gr.Textbox(
+             label="OpenAI API Key",
+             placeholder="Enter your OpenAI API Key", type="password"
+         )
+
+     with gr.Row():
+         submit = gr.Button(value="Submit", variant="secondary")
+         grade = gr.Button(value="Grade", variant="secondary")
+         download = gr.Button(value="Download", variant="secondary")
+         reset = gr.Button(value="Reset", variant="secondary")
+
+     chatbot = gr.Chatbot([], label="Chat with grading results", elem_id="chatbot", height=400)
+
+     with gr.Row():
+         with gr.Column(scale=3):
+             txt = gr.Textbox(
+                 label="Ask questions about how students did on the discussion",
+                 placeholder="Enter text and press enter", lines=1
+             )
+         ask = gr.Button(value="Ask", variant="secondary", scale=1)
+
+     chatbot.value = get_first_message([])
+     submit.click(ingest, inputs=[url, canvas_api_key, openai_api_key, chatbot], outputs=[chatbot],
+                  postprocess=False).then(
+         bot, chatbot, chatbot
+     )
+
+     grade.click(start_grading, inputs=[url, canvas_api_key, openai_api_key, chatbot], outputs=[chatbot],
+                 postprocess=False).then(
+         bot, chatbot, chatbot
+     )
+
+     download.click(start_downloading, inputs=[], outputs=[chatbot], postprocess=False).then(
+         bot, chatbot, chatbot
+     )
+
+     txt.submit(add_text, [chatbot, txt], [chatbot, txt], postprocess=False).then(
+         bot, chatbot, chatbot
+     )
+
+     ask.click(add_text, inputs=[chatbot, txt], outputs=[chatbot, txt], postprocess=False).then(
+         bot, chatbot, chatbot
+     )
+
+     set_model(chatbot)
+
+ if __name__ == "__main__":
+     demo.queue(concurrency_count=5)
+     demo.launch(debug=True)
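Note on the event-loop handling in `start_grading`: Gradio callbacks run in worker threads with no running event loop, so the code builds and closes one by hand. A minimal sketch of the equivalent using `asyncio.run`, assuming `Grader.run_chain` stays a coroutine:

    import asyncio
    from grader import Grader

    def grade_sync(model: str) -> bool:
        # asyncio.run creates a fresh event loop, runs the coroutine,
        # and closes the loop, matching the lifecycle start_grading manages by hand.
        grader = Grader(model)
        return asyncio.run(grader.run_chain())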
grader.py ADDED
@@ -0,0 +1,257 @@
+ import asyncio
+ import csv
+ import glob
+ import json
+ import os
+ import shutil
+ from datetime import datetime
+ from typing import Optional
+
+ from langchain import PromptTemplate
+ from langchain.chains import LLMChain
+ from langchain.chat_models import ChatOpenAI
+ from langchain.document_loaders import DirectoryLoader, UnstructuredHTMLLoader
+ from langchain.output_parsers import PydanticOutputParser
+ from pathvalidate import sanitize_filename
+ from pydantic import BaseModel, Field
+ from tqdm import tqdm
+
+
+ class Grader:
+     def __init__(self, model):
+         print("Setting up environment for grading")
+         os.environ["LANGCHAIN_TRACING"] = "true"
+         self.title = None
+         self.model = model
+         self.rubric_file = 'docs/rubric_data.json'
+         self.discussions_file_path = "docs/discussion_entries.json"
+         self.fieldnames = ['student_name', 'total_score', 'student_feedback', 'grader_comments', 'summary']
+         self.docs = self.get_html_files()
+         self.llm = ChatOpenAI(temperature=0, model_name=model)
+         self.parser: PydanticOutputParser = self.create_parser()
+         self.rubric_text = self.create_rubric_text()
+         self.prompt = self.create_prompt()
+         self.splitter = None
+         self.tokens = self.get_num_tokens()
+         self.llm_chain = self.create_llm_chain(model)
+         self.csv = self.get_csv_file_name()
+         self.outputs = []
+         self.completed = 0
+         self.lock = asyncio.Lock()
+
+     class ToolArgsSchema(BaseModel):
+         student_name: Optional[str] = Field(description="The name of the student")
+         total_score: int = Field(description="The grade of the student's answer")
+         student_feedback: Optional[str] = Field(
+             description="The developmental feedback from the grader's point of view to the student, for example: 'Great work, ...', 'Although your submission is relevant to the question, it doesn't answer the question entirely...'. Give customized feedback based on the student's answer")
+         grader_comments: Optional[str] = Field(
+             description="The grade breakdown based on the rubric, added as the grader's one-line customized comments explaining how the grade was calculated for that particular student's answer")
+         summary: Optional[str] = Field(
+             description="The overall summary of the student's answer outlining key points from the student's answer based on the rubric, which can be used as a portion of a vector store to answer summary-based questions about all the discussions")
+
+         class Config:
+             schema_extra = {
+                 "required": ["student_name", "total_score", "student_feedback", "grader_comments", "summary"]
+             }
+
+     def create_parser(self):
+         return PydanticOutputParser(pydantic_object=self.ToolArgsSchema)
+
+     def create_rubric_text(self):
+         with open(self.rubric_file, 'r') as file:
+             rubric = json.load(file)
+         rubric_text = []
+         self.title = None  # Initialize title
+         for r in rubric:
+             if 'description' in r and 'ratings' in r:
+                 rubric_text.append(f"description:{r['description']}\n" + "\n".join(
+                     [f"points:{rating['points']} points: {rating['description']}" for rating in r['ratings']]))
+             elif 'points_possible' in r:
+                 print("added points_possible")
+             elif 'title' in r:  # Check if a title exists in the rubric
+                 self.title = r['title']  # Save the title for later use
+                 rubric_text.append(f"title:{self.title}")
+             elif 'instruction' in r:
+                 rubric_text.append(f"instruction:{r['instruction']}")
+
+         rubric_text = "\n".join(rubric_text)
+         return rubric_text
+
+     def create_map_prompt(self):
+         map_template_string = f"""I am an expert, concise Canvas Discussion Summarizer! I am here to concisely summarize the following sections of a student's long Canvas discussion responses on the basis of the instructions and rubric provided.
+ The aim is to capture the important and key points based on the instructions and rubric and create a short summary, so that grading can be done on all the summarized sections of a student's Canvas discussion response.
+ --------------------
+ Following are the Canvas instructions and rubric:
+ {self.rubric_text}
+ --------------------
+ I will summarize this extracted part of a long Canvas discussion:
+ {{input_documents}}
+ """
+         return PromptTemplate(template=map_template_string, input_variables=["input_documents"])
+
+     def create_reduce_prompt(self):
+         reduce_template_string = f"""I am a Canvas Discussion Grader! I am here to grade the following summarized sections of a student's Canvas discussion responses on the basis of the instructions and rubric provided.
+ --------------------
+ To grade the student discussion, I will follow the rubric below. I will not deviate from the grading scheme.
+ {self.rubric_text}
+ --------------------
+ I will be able to identify each student by name, along with their key interests and the key features of their responses pertinent to the discussion instructions and rubric.
+ I will be able to summarize the entire discussion in a concise manner, including key points from each student's answer.
+ --------------------
+ I will grade the following summarized Canvas discussion: {{input_documents}}
+ --------------------
+ My grading results will ALWAYS be in the following format:
+ Format instructions: {{format_instructions}}
+ """
+         return PromptTemplate(
+             template=reduce_template_string,
+             input_variables=["input_documents"],
+             output_parser=self.parser,
+             partial_variables={"format_instructions": self.parser.get_format_instructions()}
+         )
+
+     def create_map_llm_chain(self):
+         print("Ready to grade!")
+         map_llm_chain = LLMChain(
+             llm=self.llm,
+             prompt=self.map_prompt,
+             verbose=True,
+         )
+         return map_llm_chain
+
+     def create_reduce_llm_chain(self):
+         reduce_llm_chain = LLMChain(
+             llm=self.llm,
+             prompt=self.reduce_prompt,
+             verbose=True,
+         )
+         return reduce_llm_chain
+
+     async def process_file(self, file, pbar):
+         if self.model == 'gpt-4':
+             await asyncio.sleep(10)  # Add a 10-second delay before each gpt-4 request to avoid rate limits
+         result = await self.llm_chain.arun(file)
+         self.parser.parse(result)  # Fail fast if the output does not match the schema
+         async with self.lock:
+             self.completed += 1
+             pbar.update(1)
+         return result
+
+     async def run_chain(self):
+         print("Grading Started! Now sit back and get a coffee \u2615")
+         total = len(self.docs)
+         pbar = tqdm(total=total)
+         # If the model is gpt-4, the batch size is 2; otherwise it is 5
+         batch_size = 2 if self.model == 'gpt-4' else 5
+         batches = [self.docs[i:i + batch_size] for i in range(0, len(self.docs), batch_size)]
+         for batch in batches:
+             tasks = [self.process_file(file, pbar) for file in batch]
+             results = await asyncio.gather(*tasks)
+             for result in results:
+                 output = self.parser.parse(result)
+                 self.outputs.append(output)
+             if self.model == 'gpt-4':
+                 await asyncio.sleep(3)  # Add a delay between batches
+         pbar.close()
+         self.save_csv()
+         return True
+
+     def create_csv(self):
+         # Remove existing CSVs in the output folder
+         if os.path.exists('output'):
+             shutil.rmtree('output')
+
+         os.mkdir('output')
+         now = datetime.now()  # current date and time
+         date_time = now.strftime("%m-%d-%Y_%H-%M-%S")
+         if self.title:  # If a title exists, use it in the filename
+             file_name = f"{self.title}-{self.llm.model_name}-{date_time}.csv"
+         else:  # Otherwise fall back to 'output' in the filename
+             file_name = f"output-{self.llm.model_name}-{date_time}.csv"
+
+         # Sanitize the entire filename
+         sanitized_file_name = sanitize_filename(file_name)
+         sanitized_file_name = os.path.join('output', sanitized_file_name)
+
+         with open(sanitized_file_name, 'w', newline='') as csvfile:
+             writer = csv.DictWriter(csvfile, fieldnames=self.fieldnames)
+             writer.writeheader()
+         return sanitized_file_name
+
+     def save_csv(self):
+         # Use the filename created in the create_csv method
+         self.csv = self.create_csv()
+         with open(self.csv, 'a', newline='') as csvfile:
+             writer = csv.DictWriter(csvfile, fieldnames=self.fieldnames)
+             rows = [output.dict() for output in self.outputs]  # Convert each output to a dictionary
+             writer.writerows(rows)  # Write all rows to the CSV
+         print(f"Saved grades for {len(self.outputs)} students in {self.csv}")
+         return True
+
+     def get_html_files(self):
+         loader = DirectoryLoader('docs', glob="**/*.html", loader_cls=UnstructuredHTMLLoader, recursive=True)
+         document_list = loader.load()
+         for document in document_list:
+             document.metadata["name"] = document.metadata["source"].split("/")[-1].split(".")[0]
+         return document_list
+
+     def create_prompt(self):
+         prompt_template = f"""I am a Canvas Discussion Grader! I am here to grade the following Canvas discussion on the basis of the instructions and rubric provided.
+ To grade the student discussion, I will follow the rubric below. I will not deviate from the grading scheme.
+ {self.rubric_text}
+
+ I will be able to identify each student by name, along with their key interests and the key features of their responses pertinent to the discussion instructions and rubric.
+ I will be able to summarize the entire discussion in a concise manner, including key points from each student's answer.
+ I will grade the following Canvas discussion: {{input_documents}}
+
+ My grading results will ALWAYS be in the following format:
+ Format instructions: {{format_instructions}}
+ """
+         return PromptTemplate(template=prompt_template, input_variables=["input_documents"], output_parser=self.parser,
+                               partial_variables={"format_instructions": self.parser.get_format_instructions()})
+
+     def create_llm_chain(self, model):
+         print("Ready to grade!")
+         return LLMChain(
+             llm=self.llm,
+             prompt=self.prompt,
+         )
+
+     def get_num_tokens(self):
+         total_tokens = 0
+         for doc in self.docs:
+             summary_prompt = self.prompt.format(input_documents=doc)
+             num_tokens = self.llm.get_num_tokens(summary_prompt)
+             total_tokens += num_tokens
+         return total_tokens
+
+     def get_csv_file_name(self):
+         output_dir = 'output'
+         if os.path.exists(output_dir):
+             csv_files = glob.glob(os.path.join(output_dir, '*.csv'))
+             if csv_files:
+                 return csv_files[0]  # Return the first CSV file found
+         return None
+
+
+ def run(model):
+     grader = Grader(model)
+     asyncio.run(grader.run_chain())
+     print("Grading successful")
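For reference, a minimal way to run the grader outside the Gradio app, assuming `ingest.py` has already populated `docs/` and `OPENAI_API_KEY` is set in the environment:

    from grader import run

    # Grades every HTML file under docs/ against docs/rubric_data.json
    # and writes the results CSV into output/.
    run('gpt-3.5-turbo-16k')  # 'gpt-4' also works, with smaller, delayed batches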
ingest.py ADDED
@@ -0,0 +1,174 @@
+ import json
+ import os
+ import re
+ import shutil
+ from typing import List
+
+ import requests
+ from bs4 import BeautifulSoup
+
+ rubric = None
+ message = None
+ rubric_file = 'docs/rubric_data.json'
+ discussion_entries_file = 'docs/discussion_entries.json'
+
+
+ class DiscussionEntry:
+     def __init__(self, id: int, parent_id: int, name: str, message: str, replies: List):
+         self.id = id
+         self.parent_id = parent_id
+         self.name = name
+         self.message = message
+         self.replies = replies
+
+     def to_json(self):
+         return {
+             'id': self.id,
+             'parent_id': self.parent_id,
+             'name': self.name,
+             'message': self.message,
+             'replies': [reply.to_json() for reply in self.replies]
+         }
+
+     def dump_json(self, filename):
+         with open(filename, 'w') as f:
+             json.dump(self.to_json(), f)
+
+
+ def extract_entries(entries, participants):
+     result = []
+     for entry in entries:
+         if 'message' in entry and 'deleted' not in entry:
+             id = entry['id']
+             parent_id = entry['parent_id']
+             user_id = entry['user_id']
+             name = next((p['display_name'] for p in participants if p['id'] == user_id), None)
+             message = entry['message']
+             replies = []
+             if 'replies' in entry:
+                 replies = extract_entries(entry['replies'], participants)
+             result.append(DiscussionEntry(id, parent_id, name, message, replies))
+     return result
+
+
+ def save_messages(entries, group_id=None):
+     for entry in entries:
+         filename = f'docs/{entry.name}.html'
+         if group_id is not None:
+             filename = f'docs/group_{group_id}_{entry.name}.html'
+
+         with open(filename, 'a+') as f:
+             if entry.parent_id is None:
+                 f.write(f'<h1><b>Student Post: {entry.name}</b></h1>')
+                 f.write(entry.message)
+                 f.write('<hr>')
+             else:
+                 f.write(f'<h2><b>Reply to: {entry.parent_id}</b></h2>')
+                 f.write(entry.message)
+                 f.write('<hr>')
+
+         save_messages(entry.replies, group_id)
+
+
+ def extract_group_discussions(group_topic_children, headers):
+     group_entries = []
+     for group_topic in group_topic_children:
+         group_id = group_topic['group_id']
+         topic_id = group_topic['id']
+         group_discussion_url = f'{base_url}/api/v1/groups/{group_id}/discussion_topics/{topic_id}/view'
+         group_discussion_response = requests.get(group_discussion_url, headers=headers)
+         if group_discussion_response.ok:
+             group_discussion_data = group_discussion_response.json()
+             entries = extract_entries(group_discussion_data['view'], group_discussion_data['participants'])
+             # Dump JSON data for the group-based discussion
+             with open(discussion_entries_file, 'w') as f:
+                 json.dump([entry.to_json() for entry in entries], f)
+             group_entries.append({
+                 'group_id': group_id,
+                 'entries': entries
+             })
+     return group_entries
+
+
+ def extract_individual_discussion(discussion_url, headers):
+     individual_entries = []
+     discussion_response = requests.get(discussion_url, headers=headers)
+     if discussion_response.ok:
+         discussion_data = discussion_response.json()
+         entries = extract_entries(discussion_data['view'], discussion_data['participants'])
+         # Dump JSON data for the individual discussion
+         with open(discussion_entries_file, 'w') as f:
+             json.dump([entry.to_json() for entry in entries], f)
+         individual_entries.extend(entries)
+     return individual_entries
+
+
+ def ingest_canvas_discussions(input_url, access_token):
+     global base_url, rubric, message
+     match = re.match(r'https://canvas.illinois.edu/courses/(\d+)/discussion_topics/(\d+)', input_url)
+     if match:
+         course_id, discussion_topic_id = match.groups()
+     else:
+         raise ValueError("Invalid URL")
+     base_url = 'https://canvas.illinois.edu'
+     headers = {
+         'Authorization': f'Bearer {access_token}'
+     }
+     discussion_url = f'{base_url}/api/v1/courses/{course_id}/discussion_topics/{discussion_topic_id}/view'
+     instruction_url = f'{base_url}/api/v1/courses/{course_id}/discussion_topics/{discussion_topic_id}'
+     instruction_response = requests.get(instruction_url, headers=headers)
+     if instruction_response.ok:
+         instruction_data = instruction_response.json()
+         print(instruction_data)
+         rubric = []
+
+         # Extract the title if it exists
+         if 'title' in instruction_data:
+             title = instruction_data['title']
+             rubric = [{'title': title}]
+
+         if 'description' in instruction_data['assignment']:
+             message_html = instruction_data['assignment']['description']
+             soup = BeautifulSoup(message_html, 'html.parser')
+             message = soup.get_text()
+             rubric.append({'instruction': message})
+
+         if 'rubric' in instruction_data['assignment']:
+             rubric.extend(instruction_data['assignment']['rubric'])
+
+         if 'points_possible' in instruction_data['assignment']:
+             points_possible = instruction_data['assignment']['points_possible']
+             rubric.append({'points_possible': points_possible})
+
+         # Recreate the docs folder so each run starts clean
+         if os.path.exists('docs'):
+             shutil.rmtree('docs')
+         os.makedirs('docs')
+         with open(rubric_file, 'w') as f:
+             json.dump(rubric, f)
+
+         print("Extracted instructions and rubric")
+
+         # Check if the discussion is an individual discussion with associated group-based discussions
+         if 'group_topic_children' in instruction_data:
+             # Extract and save group-based discussions
+             group_entries = extract_group_discussions(instruction_data['group_topic_children'], headers)
+             print("Extracted group discussion entries")
+             for group_entry in group_entries:
+                 save_messages(group_entry['entries'], group_entry['group_id'])
+         else:
+             # Extract and save a standalone individual or group-based discussion
+             individual_entries = extract_individual_discussion(discussion_url, headers)
+             print("Extracted individual discussion entries")
+             save_messages(individual_entries)
+     else:
+         print(f'Error: {instruction_response.text}')
+
+
+ def create_vector_store():
+     return None
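A minimal sketch of calling the ingester directly; the course and topic ids below are placeholders, and the URL must match the hard-coded canvas.illinois.edu pattern:

    from ingest import ingest_canvas_discussions

    url = 'https://canvas.illinois.edu/courses/12345/discussion_topics/67890'  # hypothetical ids
    ingest_canvas_discussions(url, '<canvas-api-token>')
    # On success, docs/ is recreated with rubric_data.json,
    # discussion_entries.json and one HTML file per student.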
requirements.in ADDED
@@ -0,0 +1,11 @@
+ lanarky
+ langchain
+ openai
+ tiktoken
+ faiss-cpu
+ gradio
+ fastapi
+ uvicorn[standard]
+ bs4
+ pathvalidate
+ unstructured
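These are unpinned, top-level dependencies. Assuming a pip-tools workflow (no compiled lock file is part of this commit), a pinned `requirements.txt` would typically be generated with `pip-compile requirements.in` and installed with `pip install -r requirements.txt`.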
utils.py ADDED
@@ -0,0 +1,253 @@
+ import os
+
+ from langchain import FAISS
+ from langchain.chains import ConversationalRetrievalChain
+ from langchain.document_loaders import CSVLoader
+ from langchain.memory import ConversationSummaryBufferMemory
+ from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+
+ def search_index_from_docs(source_chunks, embeddings):
+     search_index = FAISS.from_documents(source_chunks, embeddings)
+     return search_index
+
+
+ def get_chat_history(inputs) -> str:
+     res = []
+     for human, ai in inputs:
+         res.append(f"Human:{human}\nAI:{ai}")
+     return "\n".join(res)
+
+
+ class GraderQA:
+     def __init__(self, grader, embeddings):
+         self.grader = grader
+         self.llm = self.grader.llm
+         self.index_file = "vector_stores/canvas-discussions.faiss"
+         self.pickle_file = "vector_stores/canvas-discussions.pkl"
+         self.rubric_text = grader.rubric_text
+         self.search_index = self.get_search_index(embeddings)
+         self.prompt = self.create_prompt()
+         self.chain = self.create_chain(embeddings)
+         self.tokens = None
+         self.question = None
+
+     def get_search_index(self, embeddings):
+         if os.path.isfile(self.pickle_file) and os.path.isfile(self.index_file) and os.path.getsize(
+                 self.pickle_file) > 0:
+             # Load the index from disk
+             search_index = self.load_index(embeddings)
+         else:
+             search_index = self.create_index(embeddings)
+             print("Created index")
+         return search_index
+
+     def load_index(self, embeddings):
+         # Load index
+         db = FAISS.load_local(
+             folder_path="vector_stores/",
+             index_name="canvas-discussions", embeddings=embeddings,
+         )
+         print("Loaded index")
+         return db
+
+     def create_index(self, embeddings):
+         source_chunks = self.create_chunk_documents()
+         search_index = search_index_from_docs(source_chunks, embeddings)
+         search_index.save_local(folder_path="vector_stores/", index_name="canvas-discussions")
+         return search_index
+
+     def create_chunk_documents(self):
+         sources = self.fetch_data_for_embeddings()
+
+         splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
+
+         source_chunks = splitter.split_documents(sources)
+
+         print("chunks: " + str(len(source_chunks)))
+         print("sources: " + str(len(sources)))
+
+         return source_chunks
+
+     def fetch_data_for_embeddings(self):
+         document_list = self.get_csv_files()
+         print("document list: " + str(len(document_list)))
+         return document_list
+
+     def get_csv_files(self):
+         loader = CSVLoader(file_path=self.grader.csv, source_column="student_name")
+         document_list = loader.load()
+         return document_list
+
+     def create_chain(self, embeddings):
+         if not self.search_index:
+             self.search_index = self.load_index(embeddings)
+         chain = ConversationalRetrievalChain.from_llm(self.llm,
+                                                       self.search_index.as_retriever(search_type='mmr',
+                                                                                      search_kwargs={'lambda_mult': 1,
+                                                                                                     'fetch_k': 50,
+                                                                                                     'k': 30}),
+                                                       return_source_documents=True,
+                                                       verbose=True,
+                                                       memory=ConversationSummaryBufferMemory(memory_key='chat_history',
+                                                                                              llm=self.llm,
+                                                                                              max_token_limit=40,
+                                                                                              return_messages=True,
+                                                                                              output_key='answer'),
+                                                       get_chat_history=get_chat_history,
+                                                       combine_docs_chain_kwargs={"prompt": self.prompt})
+         return chain
+
+     def create_prompt(self):
+         system_template = f"""You are the Canvas Discussions Grading + Feedback QA Bot. Have a conversation with a human, answering their questions as best you can.
+ You are a grading assistant who graded the Canvas discussions to produce the grading results and feedback below. Use the following pieces of the grading results, feedback, and context to answer the user's question.
+ ----------------
+ {self.rubric_text}
+ ----------------
+ {{context}}"""
+         messages = [
+             SystemMessagePromptTemplate.from_template(system_template),
+             HumanMessagePromptTemplate.from_template("{question}"),
+         ]
+         return ChatPromptTemplate.from_messages(messages)
+
+     def get_tokens(self):
+         total_tokens = 0
+         # Count tokens for the QA prompt over the grader's source documents
+         for doc in self.grader.docs:
+             chat_prompt = self.prompt.format(context=doc, question=self.question)
+             num_tokens = self.llm.get_num_tokens(chat_prompt)
+             total_tokens += num_tokens
+         return total_tokens
+
+     def run_qa_chain(self, question):
+         self.question = question
+         self.get_tokens()
+         answer = self.chain(question)
+         return answer
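A minimal sketch of the end-to-end QA flow built on these classes, assuming `docs/` and a graded CSV in `output/` from a previous run are still present:

    from langchain.embeddings import OpenAIEmbeddings
    from grader import Grader
    from utils import GraderQA

    grader = Grader('gpt-3.5-turbo-16k')  # picks up the existing output/*.csv
    grader_qa = GraderQA(grader, OpenAIEmbeddings(model='text-embedding-ada-002'))
    # ConversationalRetrievalChain returns a dict with 'answer' and 'source_documents'
    result = grader_qa.run_qa_chain('Which students lost points on the rubric?')
    print(result['answer'])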