rohan13 committed
Commit 10da927 · Parent(s): 7728560

Changes for an automated grader integrated with a chatbot

Files changed (5)
  1. app_new.py +232 -0
  2. grader.py +257 -0
  3. ingest.py +174 -0
  4. requirements.in +11 -0
  5. utils.py +253 -0
app_new.py ADDED
@@ -0,0 +1,232 @@
+ import asyncio
+ import glob
+ import os
+ import time
+
+ import gradio as gr
+ from dotenv import load_dotenv
+ from langchain.chat_models import ChatOpenAI
+ from langchain.embeddings import OpenAIEmbeddings
+
+ from grader import Grader
+ from ingest import ingest_canvas_discussions
+ from utils import GraderQA
+
+ load_dotenv()
+
+ pickle_file = "vector_stores/canvas-discussions.pkl"
+ index_file = "vector_stores/canvas-discussions.index"
+
+ grading_model = 'gpt-4'
+ qa_model = 'gpt-3.5-turbo-16k'
+
+ llm = ChatOpenAI(model_name=qa_model, temperature=0, verbose=True)
+ embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
+
+ grader = None
+ grader_qa = None
+
+
+ def add_text(history, text):
+     print("Question asked: " + text)
+     response = run_model(text)
+     history = history + [(text, response)]
+     print(history)
+     return history, ""
+
+
+ def run_model(text):
+     global grader, grader_qa
+     start_time = time.time()
+     print("start time:" + str(start_time))
+     if not grader_qa and not grader:
+         # Rebuild the grader and QA chain from cached files if they all exist
+         if (os.path.isfile(pickle_file) and os.path.getsize(pickle_file) > 0
+                 and os.path.isfile(index_file)
+                 and os.path.isfile('docs/discussion_entries.json')
+                 and os.path.isfile('docs/rubric_data.json')):
+             grader = Grader(qa_model)
+             grader_qa = GraderQA(grader, embeddings)
+     elif not grader_qa:
+         grader.llm.model_name = qa_model
+         grader_qa = GraderQA(grader, embeddings)
+     response = grader_qa.chain(text)
+     sources = []
+     for document in response['source_documents']:
+         sources.append(str(document.metadata))
+     print(sources)
+
+     source = ','.join(set(sources))
+     response = response['answer'] + '\nSources: ' + source
+     end_time = time.time()
+     # If the response contains the string `SOURCES:`, add a \n before it
+     # if "SOURCES:" in response:
+     #     response = response.replace("SOURCES:", "\nSOURCES:")
+     response = response + "\n\n" + "Time taken: " + str(end_time - start_time)
+     print(response)
+     print("Time taken: " + str(end_time - start_time))
+     return response
+
+
+ def set_model(history):
+     history = get_first_message(history)
+     return history
+
+
+ def ingest(url, canvas_api_key, openai_api_key, history):
+     global grader
+     text = f"Download data from {url} and ingest it to grade discussions"
+     ingest_canvas_discussions(url, canvas_api_key)
+     grader = Grader(grading_model)
+     response = "Ingested canvas data successfully"
+     history = history + [(text, response)]
+     return get_grading_status(history)
+
+
+ def start_grading(url, canvas_api_key, openai_api_key, history):
+     global grader, grader_qa
+     text = f"Start grading discussions from {url}"
+     if not url or not canvas_api_key or not openai_api_key:
+         response = "Please enter all the fields to initiate grading"
+     elif grader:
+         # Create a new event loop for this worker thread
+         loop = asyncio.new_event_loop()
+         asyncio.set_event_loop(loop)
+         try:
+             # Use the event loop to run the async grading chain
+             loop.run_until_complete(grader.run_chain())
+             grader_qa = GraderQA(grader, embeddings)
+             response = "Grading done"
+         finally:
+             # Close the loop after use
+             loop.close()
+     else:
+         response = "Please ingest data before grading"
+     history = history + [(text, response)]
+     return history
+
+
+ def start_downloading():
+     # grader.download()
+     return "Downloaded"
+
+
+ def get_first_message(history):
+     global grader_qa
+     history = [(None,
+                 'Get feedback on your Canvas discussions. Add your discussion URL and get your discussions graded instantly.')]
+     history = get_grading_status(history)
+     return history
+
+
+ def get_grading_status(history):
+     global grader, grader_qa
+     # Check if grading is complete
+     if os.path.isdir('output') and len(glob.glob("docs/*.json")) > 0 and len(glob.glob("docs/*.html")) > 0:
+         if not grader:
+             grader = Grader(qa_model)
+             grader_qa = GraderQA(grader, embeddings)
+         elif not grader_qa:
+             grader_qa = GraderQA(grader, embeddings)
+         history = history + [(None, 'Grading is already complete. You can now ask questions')]
+         enable_fields(False, False, False, False, False, True, True, True)
+     # Check if data is ingested
+     elif len(glob.glob("docs/*.json")) > 0 and len(glob.glob("docs/*.html")) > 0:
+         if not grader_qa:
+             grader = Grader(qa_model)
+         history = history + [(None, 'Canvas data is already ingested. You can grade discussions now')]
+         enable_fields(False, False, False, False, True, True, False, False)
+     else:
+         history = history + [(None, 'Please ingest data and start grading')]
+         enable_fields(True, True, True, True, True, True, False, False)
+     return history
+
+
+ # Handle enabling/disabling of fields
+ def enable_fields(url_status, canvas_api_key_status, openai_api_key_status, submit_status, grade_status,
+                   download_status, chatbot_txt_status, chatbot_btn_status):
+     url.interactive = url_status
+     canvas_api_key.interactive = canvas_api_key_status
+     openai_api_key.interactive = openai_api_key_status
+     submit.interactive = submit_status
+     grade.interactive = grade_status
+     download.interactive = download_status
+     txt.interactive = chatbot_txt_status
+     ask.interactive = chatbot_btn_status
+     if not chatbot_txt_status:
+         txt.placeholder = "Please grade discussions first"
+     else:
+         txt.placeholder = "Ask a question"
+     if not url_status:
+         url.placeholder = "Data already ingested"
+     if not canvas_api_key_status:
+         canvas_api_key.placeholder = "Data already ingested"
+     if not openai_api_key_status:
+         openai_api_key.placeholder = "Data already ingested"
+
+
+ def bot(history):
+     return history
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown("<h2><center>Canvas Discussion Grading With Feedback</center></h2>")
+
+     with gr.Row():
+         url = gr.Textbox(
+             label="Canvas Discussion URL",
+             placeholder="Enter your Canvas Discussion URL"
+         )
+
+         canvas_api_key = gr.Textbox(
+             label="Canvas API Key",
+             placeholder="Enter your Canvas API Key", type="password"
+         )
+
+         openai_api_key = gr.Textbox(
+             label="OpenAI API Key",
+             placeholder="Enter your OpenAI API Key", type="password"
+         )
+
+     with gr.Row():
+         submit = gr.Button(value="Submit", variant="secondary")
+         grade = gr.Button(value="Grade", variant="secondary")
+         download = gr.Button(value="Download", variant="secondary")
+         reset = gr.Button(value="Reset", variant="secondary")
+
+     chatbot = gr.Chatbot([], label="Chat with grading results", elem_id="chatbot", height=400)
+
+     with gr.Row():
+         with gr.Column(scale=3):
+             txt = gr.Textbox(
+                 label="Ask questions about how students did on the discussion",
+                 placeholder="Enter text and press enter", lines=1
+             )
+         ask = gr.Button(value="Ask", variant="secondary", scale=1)
+
+     chatbot.value = get_first_message([])
+     submit.click(ingest, inputs=[url, canvas_api_key, openai_api_key, chatbot], outputs=[chatbot],
+                  postprocess=False).then(
+         bot, chatbot, chatbot
+     )
+
+     grade.click(start_grading, inputs=[url, canvas_api_key, openai_api_key, chatbot], outputs=[chatbot],
+                 postprocess=False).then(
+         bot, chatbot, chatbot
+     )
+
+     download.click(start_downloading, inputs=[], outputs=[chatbot], postprocess=False).then(
+         bot, chatbot, chatbot
+     )
+
+     txt.submit(add_text, [chatbot, txt], [chatbot, txt], postprocess=False).then(
+         bot, chatbot, chatbot
+     )
+
+     ask.click(add_text, inputs=[chatbot, txt], outputs=[chatbot, txt], postprocess=False).then(
+         bot, chatbot, chatbot
+     )
+
+     set_model(chatbot)
+
+ if __name__ == "__main__":
+     demo.queue(concurrency_count=5)
+     demo.launch(debug=True)
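Note on the event-loop handling in `start_grading`: Gradio callbacks run in worker threads with no running event loop, so the code builds and closes one by hand. A minimal sketch of the equivalent using `asyncio.run`, assuming `Grader.run_chain` stays a coroutine:

    import asyncio
    from grader import Grader

    def grade_sync(model: str) -> bool:
        # asyncio.run creates a fresh event loop, runs the coroutine,
        # and closes the loop, matching the lifecycle start_grading manages by hand.
        grader = Grader(model)
        return asyncio.run(grader.run_chain())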
grader.py ADDED
@@ -0,0 +1,257 @@
+ import asyncio
+ import csv
+ import glob
+ import json
+ import os
+ import shutil
+ from datetime import datetime
+ from typing import Optional
+
+ from langchain import PromptTemplate
+ from langchain.chains import LLMChain
+ from langchain.chat_models import ChatOpenAI
+ from langchain.document_loaders import DirectoryLoader, UnstructuredHTMLLoader
+ from langchain.output_parsers import PydanticOutputParser
+ from pathvalidate import sanitize_filename
+ from pydantic import BaseModel, Field
+ from tqdm import tqdm
+
+
+ class Grader:
+     def __init__(self, model):
+         print("Setting up environment for grading")
+         os.environ["LANGCHAIN_TRACING"] = "true"
+         self.title = None
+         self.model = model
+         self.rubric_file = 'docs/rubric_data.json'
+         self.discussions_file_path = "docs/discussion_entries.json"
+         self.fieldnames = ['student_name', 'total_score', 'student_feedback', 'grader_comments', 'summary']
+         self.docs = self.get_html_files()
+         self.llm = ChatOpenAI(temperature=0, model_name=model)
+         self.parser: PydanticOutputParser = self.create_parser()
+         self.rubric_text = self.create_rubric_text()
+         self.prompt = self.create_prompt()
+         self.splitter = None
+         self.tokens = self.get_num_tokens()
+         self.llm_chain = self.create_llm_chain(model)
+         self.csv = self.get_csv_file_name()
+         self.outputs = []
+         self.completed = 0
+         self.lock = asyncio.Lock()
+
+     class ToolArgsSchema(BaseModel):
+         student_name: Optional[str] = Field(description="The name of the student")
+         total_score: int = Field(description="The grade of the student's answer")
+         student_feedback: Optional[str] = Field(
+             description="The developmental feedback from the grader's point of view to the student, for example: 'Great work, ...', 'Although your submission is relevant to the question, it doesn't answer the question entirely...'. Give customized feedback based on the student's answer")
+         grader_comments: Optional[str] = Field(
+             description="The grade breakdown based on the rubric, added as the grader's one-line customized comments explaining how the grade was calculated for that particular student's answer")
+         summary: Optional[str] = Field(
+             description="The overall summary of the student's answer outlining key points from the student's answer based on the rubric, which can be used as a portion of a vector store to answer summary-based questions about all the discussions")
+
+         class Config:
+             schema_extra = {
+                 "required": ["student_name", "total_score", "student_feedback", "grader_comments", "summary"]
+             }
+
+     def create_parser(self):
+         return PydanticOutputParser(pydantic_object=self.ToolArgsSchema)
+
+     def create_rubric_text(self):
+         with open(self.rubric_file, 'r') as file:
+             rubric = json.load(file)
+         rubric_text = []
+         self.title = None  # Initialize title
+         for r in rubric:
+             if 'description' in r and 'ratings' in r:
+                 rubric_text.append(f"description:{r['description']}\n" + "\n".join(
+                     [f"points:{rating['points']} points: {rating['description']}" for rating in r['ratings']]))
+             elif 'points_possible' in r:
+                 print("added points_possible")
+             elif 'title' in r:  # Check if a title exists in the rubric
+                 self.title = r['title']  # Save the title for later use
+                 rubric_text.append(f"title:{self.title}")
+             elif 'instruction' in r:
+                 rubric_text.append(f"instruction:{r['instruction']}")
+
+         rubric_text = "\n".join(rubric_text)
+         return rubric_text
+
+     def create_map_prompt(self):
+         map_template_string = f"""I am an expert, concise Canvas Discussion Summarizer! I am here to concisely summarize the following sections of a student's long Canvas discussion responses on the basis of the instructions and rubric provided.
+ The aim is to capture the important and key points based on the instructions and rubric and create a short summary, so that grading can be done on all the summarized sections of a student's Canvas discussion response.
+ --------------------
+ Following are the Canvas instructions and rubric:
+ {self.rubric_text}
+ --------------------
+ I will summarize this extracted part of a long Canvas discussion:
+ {{input_documents}}
+ """
+         return PromptTemplate(template=map_template_string, input_variables=["input_documents"])
+
+     def create_reduce_prompt(self):
+         reduce_template_string = f"""I am a Canvas Discussion Grader! I am here to grade the following summarized sections of a student's Canvas discussion responses on the basis of the instructions and rubric provided.
+ --------------------
+ To grade the student discussion, I will follow the rubric below. I will not deviate from the grading scheme.
+ {self.rubric_text}
+ --------------------
+ I will be able to identify each student by name, along with their key interests and the key features of their responses pertinent to the discussion instructions and rubric.
+ I will be able to summarize the entire discussion in a concise manner, including key points from each student's answer.
+ --------------------
+ I will grade the following summarized Canvas discussion: {{input_documents}}
+ --------------------
+ My grading results will ALWAYS be in the following format:
+ Format instructions: {{format_instructions}}
+ """
+         return PromptTemplate(
+             template=reduce_template_string,
+             input_variables=["input_documents"],
+             output_parser=self.parser,
+             partial_variables={"format_instructions": self.parser.get_format_instructions()}
+         )
+
+     def create_map_llm_chain(self):
+         print("Ready to grade!")
+         map_llm_chain = LLMChain(
+             llm=self.llm,
+             prompt=self.map_prompt,
+             verbose=True,
+         )
+         return map_llm_chain
+
+     def create_reduce_llm_chain(self):
+         reduce_llm_chain = LLMChain(
+             llm=self.llm,
+             prompt=self.reduce_prompt,
+             verbose=True,
+         )
+         return reduce_llm_chain
+
+     async def process_file(self, file, pbar):
+         if self.model == 'gpt-4':
+             await asyncio.sleep(10)  # Add a 10-second delay before each gpt-4 request to avoid rate limits
+         result = await self.llm_chain.arun(file)
+         self.parser.parse(result)  # Fail fast if the output does not match the schema
+         async with self.lock:
+             self.completed += 1
+             pbar.update(1)
+         return result
+
+     async def run_chain(self):
+         print("Grading Started! Now sit back and get a coffee \u2615")
+         total = len(self.docs)
+         pbar = tqdm(total=total)
+         # If the model is gpt-4, the batch size is 2; otherwise it is 5
+         batch_size = 2 if self.model == 'gpt-4' else 5
+         batches = [self.docs[i:i + batch_size] for i in range(0, len(self.docs), batch_size)]
+         for batch in batches:
+             tasks = [self.process_file(file, pbar) for file in batch]
+             results = await asyncio.gather(*tasks)
+             for result in results:
+                 output = self.parser.parse(result)
+                 self.outputs.append(output)
+             if self.model == 'gpt-4':
+                 await asyncio.sleep(3)  # Add a delay between batches
+         pbar.close()
+         self.save_csv()
+         return True
+
+     def create_csv(self):
+         # Remove existing CSVs in the output folder
+         if os.path.exists('output'):
+             shutil.rmtree('output')
+
+         os.mkdir('output')
+         now = datetime.now()  # current date and time
+         date_time = now.strftime("%m-%d-%Y_%H-%M-%S")
+         if self.title:  # If a title exists, use it in the filename
+             file_name = f"{self.title}-{self.llm.model_name}-{date_time}.csv"
+         else:  # Otherwise fall back to 'output' in the filename
+             file_name = f"output-{self.llm.model_name}-{date_time}.csv"
+
+         # Sanitize the entire filename
+         sanitized_file_name = sanitize_filename(file_name)
+         sanitized_file_name = os.path.join('output', sanitized_file_name)
+
+         with open(sanitized_file_name, 'w', newline='') as csvfile:
+             writer = csv.DictWriter(csvfile, fieldnames=self.fieldnames)
+             writer.writeheader()
+         return sanitized_file_name
+
+     def save_csv(self):
+         # Use the filename created in the create_csv method
+         self.csv = self.create_csv()
+         with open(self.csv, 'a', newline='') as csvfile:
+             writer = csv.DictWriter(csvfile, fieldnames=self.fieldnames)
+             rows = [output.dict() for output in self.outputs]  # Convert each output to a dictionary
+             writer.writerows(rows)  # Write all rows to the CSV
+         print(f"Saved grades for {len(self.outputs)} students in {self.csv}")
+         return True
+
+     def get_html_files(self):
+         loader = DirectoryLoader('docs', glob="**/*.html", loader_cls=UnstructuredHTMLLoader, recursive=True)
+         document_list = loader.load()
+         for document in document_list:
+             document.metadata["name"] = document.metadata["source"].split("/")[-1].split(".")[0]
+         return document_list
+
+     def create_prompt(self):
+         prompt_template = f"""I am a Canvas Discussion Grader! I am here to grade the following Canvas discussion on the basis of the instructions and rubric provided.
+ To grade the student discussion, I will follow the rubric below. I will not deviate from the grading scheme.
+ {self.rubric_text}
+
+ I will be able to identify each student by name, along with their key interests and the key features of their responses pertinent to the discussion instructions and rubric.
+ I will be able to summarize the entire discussion in a concise manner, including key points from each student's answer.
+ I will grade the following Canvas discussion: {{input_documents}}
+
+ My grading results will ALWAYS be in the following format:
+ Format instructions: {{format_instructions}}
+ """
+         return PromptTemplate(template=prompt_template, input_variables=["input_documents"], output_parser=self.parser,
+                               partial_variables={"format_instructions": self.parser.get_format_instructions()})
+
+     def create_llm_chain(self, model):
+         print("Ready to grade!")
+         return LLMChain(
+             llm=self.llm,
+             prompt=self.prompt,
+         )
+
+     def get_num_tokens(self):
+         total_tokens = 0
+         for doc in self.docs:
+             summary_prompt = self.prompt.format(input_documents=doc)
+             num_tokens = self.llm.get_num_tokens(summary_prompt)
+             total_tokens += num_tokens
+         return total_tokens
+
+     def get_csv_file_name(self):
+         output_dir = 'output'
+         if os.path.exists(output_dir):
+             csv_files = glob.glob(os.path.join(output_dir, '*.csv'))
+             if csv_files:
+                 return csv_files[0]  # Return the first CSV file found
+         return None
+
+
+ def run(model):
+     grader = Grader(model)
+     asyncio.run(grader.run_chain())
+     print("Grading successful")
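For reference, a minimal way to run the grader outside the Gradio app, assuming `ingest.py` has already populated `docs/` and `OPENAI_API_KEY` is set in the environment:

    from grader import run

    # Grades every HTML file under docs/ against docs/rubric_data.json
    # and writes the results CSV into output/.
    run('gpt-3.5-turbo-16k')  # 'gpt-4' also works, with smaller, delayed batches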
ingest.py ADDED
@@ -0,0 +1,174 @@
+ import json
+ import os
+ import re
+ import shutil
+ from typing import List
+
+ import requests
+ from bs4 import BeautifulSoup
+
+ rubric = None
+ message = None
+ rubric_file = 'docs/rubric_data.json'
+ discussion_entries_file = 'docs/discussion_entries.json'
+
+
+ class DiscussionEntry:
+     def __init__(self, id: int, parent_id: int, name: str, message: str, replies: List):
+         self.id = id
+         self.parent_id = parent_id
+         self.name = name
+         self.message = message
+         self.replies = replies
+
+     def to_json(self):
+         return {
+             'id': self.id,
+             'parent_id': self.parent_id,
+             'name': self.name,
+             'message': self.message,
+             'replies': [reply.to_json() for reply in self.replies]
+         }
+
+     def dump_json(self, filename):
+         with open(filename, 'w') as f:
+             json.dump(self.to_json(), f)
+
+
+ def extract_entries(entries, participants):
+     result = []
+     for entry in entries:
+         if 'message' in entry and 'deleted' not in entry:
+             id = entry['id']
+             parent_id = entry['parent_id']
+             user_id = entry['user_id']
+             name = next((p['display_name'] for p in participants if p['id'] == user_id), None)
+             message = entry['message']
+             replies = []
+             if 'replies' in entry:
+                 replies = extract_entries(entry['replies'], participants)
+             result.append(DiscussionEntry(id, parent_id, name, message, replies))
+     return result
+
+
+ def save_messages(entries, group_id=None):
+     for entry in entries:
+         filename = f'docs/{entry.name}.html'
+         if group_id is not None:
+             filename = f'docs/group_{group_id}_{entry.name}.html'
+
+         with open(filename, 'a+') as f:
+             if entry.parent_id is None:
+                 f.write(f'<h1><b>Student Post: {entry.name}</b></h1>')
+                 f.write(entry.message)
+                 f.write('<hr>')
+             else:
+                 f.write(f'<h2><b>Reply to: {entry.parent_id}</b></h2>')
+                 f.write(entry.message)
+                 f.write('<hr>')
+
+         save_messages(entry.replies, group_id)
+
+
+ def extract_group_discussions(group_topic_children, headers):
+     group_entries = []
+     for group_topic in group_topic_children:
+         group_id = group_topic['group_id']
+         topic_id = group_topic['id']
+         group_discussion_url = f'{base_url}/api/v1/groups/{group_id}/discussion_topics/{topic_id}/view'
+         group_discussion_response = requests.get(group_discussion_url, headers=headers)
+         if group_discussion_response.ok:
+             group_discussion_data = group_discussion_response.json()
+             entries = extract_entries(group_discussion_data['view'], group_discussion_data['participants'])
+             # Dump JSON data for the group-based discussion
+             with open(discussion_entries_file, 'w') as f:
+                 json.dump([entry.to_json() for entry in entries], f)
+             group_entries.append({
+                 'group_id': group_id,
+                 'entries': entries
+             })
+     return group_entries
+
+
+ def extract_individual_discussion(discussion_url, headers):
+     individual_entries = []
+     discussion_response = requests.get(discussion_url, headers=headers)
+     if discussion_response.ok:
+         discussion_data = discussion_response.json()
+         entries = extract_entries(discussion_data['view'], discussion_data['participants'])
+         # Dump JSON data for the individual discussion
+         with open(discussion_entries_file, 'w') as f:
+             json.dump([entry.to_json() for entry in entries], f)
+         individual_entries.extend(entries)
+     return individual_entries
+
+
+ def ingest_canvas_discussions(input_url, access_token):
+     global base_url, rubric, message
+     match = re.match(r'https://canvas.illinois.edu/courses/(\d+)/discussion_topics/(\d+)', input_url)
+     if match:
+         course_id, discussion_topic_id = match.groups()
+     else:
+         raise ValueError("Invalid URL")
+     base_url = 'https://canvas.illinois.edu'
+     headers = {
+         'Authorization': f'Bearer {access_token}'
+     }
+     discussion_url = f'{base_url}/api/v1/courses/{course_id}/discussion_topics/{discussion_topic_id}/view'
+     instruction_url = f'{base_url}/api/v1/courses/{course_id}/discussion_topics/{discussion_topic_id}'
+     instruction_response = requests.get(instruction_url, headers=headers)
+     if instruction_response.ok:
+         instruction_data = instruction_response.json()
+         print(instruction_data)
+         rubric = []
+
+         # Extract the title if it exists
+         if 'title' in instruction_data:
+             title = instruction_data['title']
+             rubric = [{'title': title}]
+
+         if 'description' in instruction_data['assignment']:
+             message_html = instruction_data['assignment']['description']
+             soup = BeautifulSoup(message_html, 'html.parser')
+             message = soup.get_text()
+             rubric.append({'instruction': message})
+
+         if 'rubric' in instruction_data['assignment']:
+             rubric.extend(instruction_data['assignment']['rubric'])
+
+         if 'points_possible' in instruction_data['assignment']:
+             points_possible = instruction_data['assignment']['points_possible']
+             rubric.append({'points_possible': points_possible})
+
+         # Recreate the docs folder so each run starts clean
+         if os.path.exists('docs'):
+             shutil.rmtree('docs')
+         os.makedirs('docs')
+         with open(rubric_file, 'w') as f:
+             json.dump(rubric, f)
+
+         print("Extracted instructions and rubric")
+
+         # Check if the discussion is an individual discussion with associated group-based discussions
+         if 'group_topic_children' in instruction_data:
+             # Extract and save group-based discussions
+             group_entries = extract_group_discussions(instruction_data['group_topic_children'], headers)
+             print("Extracted group discussion entries")
+             for group_entry in group_entries:
+                 save_messages(group_entry['entries'], group_entry['group_id'])
+         else:
+             # Extract and save a standalone individual or group-based discussion
+             individual_entries = extract_individual_discussion(discussion_url, headers)
+             print("Extracted individual discussion entries")
+             save_messages(individual_entries)
+     else:
+         print(f'Error: {instruction_response.text}')
+
+
+ def create_vector_store():
+     return None
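A minimal sketch of calling the ingester directly; the course and topic ids below are placeholders, and the URL must match the hard-coded canvas.illinois.edu pattern:

    from ingest import ingest_canvas_discussions

    url = 'https://canvas.illinois.edu/courses/12345/discussion_topics/67890'  # hypothetical ids
    ingest_canvas_discussions(url, '<canvas-api-token>')
    # On success, docs/ is recreated with rubric_data.json,
    # discussion_entries.json and one HTML file per student.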
requirements.in ADDED
@@ -0,0 +1,11 @@
+ lanarky
+ langchain
+ openai
+ tiktoken
+ faiss-cpu
+ gradio
+ fastapi
+ uvicorn[standard]
+ bs4
+ pathvalidate
+ unstructured
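These are unpinned, top-level dependencies. Assuming a pip-tools workflow (no compiled lock file is part of this commit), a pinned `requirements.txt` would typically be generated with `pip-compile requirements.in` and installed with `pip install -r requirements.txt`.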
utils.py ADDED
@@ -0,0 +1,253 @@
+ import os
+
+ from langchain import FAISS
+ from langchain.chains import ConversationalRetrievalChain
+ from langchain.document_loaders import CSVLoader
+ from langchain.memory import ConversationSummaryBufferMemory
+ from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+
+ def search_index_from_docs(source_chunks, embeddings):
+     search_index = FAISS.from_documents(source_chunks, embeddings)
+     return search_index
+
+
+ def get_chat_history(inputs) -> str:
+     res = []
+     for human, ai in inputs:
+         res.append(f"Human:{human}\nAI:{ai}")
+     return "\n".join(res)
+
+
+ class GraderQA:
+     def __init__(self, grader, embeddings):
+         self.grader = grader
+         self.llm = self.grader.llm
+         self.index_file = "vector_stores/canvas-discussions.faiss"
+         self.pickle_file = "vector_stores/canvas-discussions.pkl"
+         self.rubric_text = grader.rubric_text
+         self.search_index = self.get_search_index(embeddings)
+         self.prompt = self.create_prompt()
+         self.chain = self.create_chain(embeddings)
+         self.tokens = None
+         self.question = None
+
+     def get_search_index(self, embeddings):
+         if os.path.isfile(self.pickle_file) and os.path.isfile(self.index_file) and os.path.getsize(
+                 self.pickle_file) > 0:
+             # Load the index from disk
+             search_index = self.load_index(embeddings)
+         else:
+             search_index = self.create_index(embeddings)
+             print("Created index")
+         return search_index
+
+     def load_index(self, embeddings):
+         # Load index
+         db = FAISS.load_local(
+             folder_path="vector_stores/",
+             index_name="canvas-discussions", embeddings=embeddings,
+         )
+         print("Loaded index")
+         return db
+
+     def create_index(self, embeddings):
+         source_chunks = self.create_chunk_documents()
+         search_index = search_index_from_docs(source_chunks, embeddings)
+         search_index.save_local(folder_path="vector_stores/", index_name="canvas-discussions")
+         return search_index
+
+     def create_chunk_documents(self):
+         sources = self.fetch_data_for_embeddings()
+
+         splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
+
+         source_chunks = splitter.split_documents(sources)
+
+         print("chunks: " + str(len(source_chunks)))
+         print("sources: " + str(len(sources)))
+
+         return source_chunks
+
+     def fetch_data_for_embeddings(self):
+         document_list = self.get_csv_files()
+         print("document list: " + str(len(document_list)))
+         return document_list
+
+     def get_csv_files(self):
+         loader = CSVLoader(file_path=self.grader.csv, source_column="student_name")
+         document_list = loader.load()
+         return document_list
+
+     def create_chain(self, embeddings):
+         if not self.search_index:
+             self.search_index = self.load_index(embeddings)
+         chain = ConversationalRetrievalChain.from_llm(self.llm,
+                                                       self.search_index.as_retriever(search_type='mmr',
+                                                                                      search_kwargs={'lambda_mult': 1,
+                                                                                                     'fetch_k': 50,
+                                                                                                     'k': 30}),
+                                                       return_source_documents=True,
+                                                       verbose=True,
+                                                       memory=ConversationSummaryBufferMemory(memory_key='chat_history',
+                                                                                              llm=self.llm,
+                                                                                              max_token_limit=40,
+                                                                                              return_messages=True,
+                                                                                              output_key='answer'),
+                                                       get_chat_history=get_chat_history,
+                                                       combine_docs_chain_kwargs={"prompt": self.prompt})
+         return chain
+
+     def create_prompt(self):
+         system_template = f"""You are the Canvas Discussions Grading + Feedback QA Bot. Have a conversation with a human, answering their questions as best you can.
+ You are a grading assistant who graded the Canvas discussions to produce the grading results and feedback below. Use the following pieces of the grading results, feedback, and context to answer the user's question.
+ ----------------
+ {self.rubric_text}
+ ----------------
+ {{context}}"""
+         messages = [
+             SystemMessagePromptTemplate.from_template(system_template),
+             HumanMessagePromptTemplate.from_template("{question}"),
+         ]
+         return ChatPromptTemplate.from_messages(messages)
+
+     def get_tokens(self):
+         total_tokens = 0
+         # Count tokens for the QA prompt over the grader's source documents
+         for doc in self.grader.docs:
+             chat_prompt = self.prompt.format(context=doc, question=self.question)
+             num_tokens = self.llm.get_num_tokens(chat_prompt)
+             total_tokens += num_tokens
+         return total_tokens
+
+     def run_qa_chain(self, question):
+         self.question = question
+         self.get_tokens()
+         answer = self.chain(question)
+         return answer
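A minimal sketch of the end-to-end QA flow built on these classes, assuming `docs/` and a graded CSV in `output/` from a previous run are still present:

    from langchain.embeddings import OpenAIEmbeddings
    from grader import Grader
    from utils import GraderQA

    grader = Grader('gpt-3.5-turbo-16k')  # picks up the existing output/*.csv
    grader_qa = GraderQA(grader, OpenAIEmbeddings(model='text-embedding-ada-002'))
    # ConversationalRetrievalChain returns a dict with 'answer' and 'source_documents'
    result = grader_qa.run_qa_chain('Which students lost points on the rubric?')
    print(result['answer'])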