rohan13 committed on
Commit
cc93546
·
1 Parent(s): ab0e4d5

Grading changes

Files changed (12)
  1. .gitattributes +1 -0
  2. app.py +96 -0
  3. custom_faiss.py +125 -0
  4. discussion.py +50 -0
  5. discussion_.py +110 -0
  6. discussion_1.py +39 -0
  7. main.py +11 -0
  8. models/openai_vs.index +3 -0
  9. models/openai_vs.pkl +0 -0
  10. requirements.txt +13 -0
  11. schema.py +30 -0
  12. utils.py +515 -0
.gitattributes ADDED
@@ -0,0 +1 @@
+ *.index filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,96 @@
+ import gradio as gr
+ from main import index, run
+ from gtts import gTTS
+ import os, time
+
+ from transformers import pipeline
+
+ p = pipeline("automatic-speech-recognition")
+
+ """Use text to call chat method from main.py"""
+
+ def add_text(history, text):
+     print("Question asked: " + text)
+     response = run_model(text)
+     history = history + [(text, response)]
+     print(history)
+     return history, ""
+
+
+ def run_model(text):
+     start_time = time.time()
+     print("start time: " + str(start_time))
+     response = run(text)
+     end_time = time.time()
+     # If the response contains the string `SOURCES:`, add a newline before it
+     if "SOURCES:" in response:
+         response = response.replace("SOURCES:", "\nSOURCES:")
+     # response = response + "\n\n" + "Time taken: " + str(end_time - start_time)
+     print(response)
+     print("Time taken: " + str(end_time - start_time))
+     return response
+
+
+ def get_output(history, audio):
+     txt = p(audio)["text"]
+     # history.append(( (audio, ) , txt))
+     audio_path = 'response.wav'
+     response = run_model(txt)
+     # Remove all text from SOURCES: to the end of the string
+     trimmed_response = response.split("SOURCES:")[0]
+     myobj = gTTS(text=trimmed_response, lang='en', slow=False)
+     myobj.save(audio_path)
+     # split audio by / and keep the last element
+     # audio = audio.split("/")[-1]
+     # audio = audio + ".wav"
+     history.append(((audio,), (audio_path,)))
+     print(history)
+     return history
+
+ def set_model(history):
+     history = get_first_message(history)
+     index()
+     return history
+
+
+ def get_first_message(history):
+     history = [(None,
+                 'Get your Canvas discussion graded. Add your discussion URL and get your discussions graded instantly.')]
+     return history
+
+
+ def bot(history):
+     return history
+
+ with gr.Blocks() as demo:
+     chatbot = gr.Chatbot(get_first_message([]), elem_id="chatbot").style(height=600)
+
+     with gr.Row():
+         with gr.Column(scale=0.75):
+             txt = gr.Textbox(
+                 label="8 Nouns Grading Bot",
+                 placeholder="Enter text and press enter, or upload an image", lines=1
+             ).style(container=False)
+
+         with gr.Column(scale=0.25):
+             audio = gr.Audio(source="microphone", type="filepath").style(container=False)
+
+     txt.submit(add_text, [chatbot, txt], [chatbot, txt], postprocess=False).then(
+         bot, chatbot, chatbot
+     )
+
+     audio.change(fn=get_output, inputs=[chatbot, audio], outputs=[chatbot]).then(
+         bot, chatbot, chatbot
+     )
+
+     audio.change(lambda: None, None, audio)
+
+     set_model(chatbot)
+
+ if __name__ == "__main__":
+     demo.queue(concurrency_count=5)
+     demo.launch(debug=True)
custom_faiss.py ADDED
@@ -0,0 +1,125 @@
+ from langchain.vectorstores import FAISS
+ import math
+ import os
+ import pickle
+ import uuid
+ from pathlib import Path
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
+
+ import numpy as np
+
+ from langchain.docstore.base import AddableMixin, Docstore
+ from langchain.docstore.document import Document
+ from langchain.docstore.in_memory import InMemoryDocstore
+ from langchain.embeddings.base import Embeddings
+ from langchain.vectorstores.base import VectorStore
+ from langchain.vectorstores.utils import maximal_marginal_relevance
+
+
+ class MyFAISS(FAISS):
+
+     def max_marginal_relevance_search_by_vector(
+         self,
+         embedding: List[float],
+         k: int = 4,
+         fetch_k: int = 20,
+         lambda_mult: float = 0.5,
+         filter: Optional[Dict[str, Any]] = None,
+         **kwargs: Any,
+     ) -> List[Document]:
+         """Return docs selected using the maximal marginal relevance.
+
+         Maximal marginal relevance optimizes for similarity to query AND diversity
+         among selected documents.
+
+         Args:
+             embedding: Embedding to look up documents similar to.
+             k: Number of Documents to return. Defaults to 4.
+             fetch_k: Number of Documents to fetch before filtering to
+                 pass to the MMR algorithm.
+             lambda_mult: Number between 0 and 1 that determines the degree
+                 of diversity among the results, with 0 corresponding
+                 to maximum diversity and 1 to minimum diversity.
+                 Defaults to 0.5.
+         Returns:
+             List of Documents selected by maximal marginal relevance.
+         """
+         _, indices = self.index.search(
+             np.array([embedding], dtype=np.float32),
+             fetch_k if filter is None else fetch_k * 2,
+         )
+         if filter is not None:
+             filtered_indices = []
+             for i in indices[0]:
+                 if i == -1:
+                     # This happens when not enough docs are returned.
+                     continue
+                 _id = self.index_to_docstore_id[i]
+                 doc = self.docstore.search(_id)
+                 if not isinstance(doc, Document):
+                     raise ValueError(f"Could not find document for id {_id}, got {doc}")
+                 print("metadata: " + str(doc.metadata))
+                 print("filter: " + str(filter))
+                 # Keep the doc if any single word of any filter value appears
+                 # in the corresponding metadata field.
+                 if any(filter_word in doc.metadata.get(key, '')
+                        for key, value in filter.items()
+                        for filter_word in value.split()):
+                     filtered_indices.append(i)
+             indices = np.array([filtered_indices])
+         # -1 happens when not enough docs are returned.
+         embeddings = [self.index.reconstruct(int(i)) for i in indices[0] if i != -1]
+         mmr_selected = maximal_marginal_relevance(
+             np.array([embedding], dtype=np.float32),
+             embeddings,
+             k=k,
+             lambda_mult=lambda_mult,
+         )
+         selected_indices = [indices[0][i] for i in mmr_selected]
+         docs = []
+         for i in selected_indices:
+             if i == -1:
+                 # This happens when not enough docs are returned.
+                 continue
+             _id = self.index_to_docstore_id[i]
+             doc = self.docstore.search(_id)
+             if not isinstance(doc, Document):
+                 raise ValueError(f"Could not find document for id {_id}, got {doc}")
+             docs.append(doc)
+         return docs
+
+     def max_marginal_relevance_search(
+         self,
+         query: str,
+         k: int = 4,
+         fetch_k: int = 20,
+         lambda_mult: float = 0.5,
+         filter: Optional[Dict[str, Any]] = None,
+         **kwargs: Any,
+     ) -> List[Document]:
+         """Return docs selected using the maximal marginal relevance.
+
+         Maximal marginal relevance optimizes for similarity to query AND diversity
+         among selected documents.
+
+         Args:
+             query: Text to look up documents similar to.
+             k: Number of Documents to return. Defaults to 4.
+             fetch_k: Number of Documents to fetch before filtering (if needed) to
+                 pass to the MMR algorithm.
+             lambda_mult: Number between 0 and 1 that determines the degree
+                 of diversity among the results, with 0 corresponding
+                 to maximum diversity and 1 to minimum diversity.
+                 Defaults to 0.5.
+         Returns:
+             List of Documents selected by maximal marginal relevance.
+         """
+         print("MMR search")
+         embedding = self.embedding_function(query)
+         docs = self.max_marginal_relevance_search_by_vector(
+             embedding,
+             k,
+             fetch_k,
+             lambda_mult=lambda_mult,
+             filter=filter,
+             **kwargs,
+         )
+         return docs
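
The `filter` hook above is what `utils.py` exercises through `as_retriever(search_type='mmr', search_kwargs={'filter': ...})`. A minimal sketch of driving it directly, with invented documents and metadata (assumes the langchain/faiss versions pinned in requirements.txt and an OPENAI_API_KEY in the environment):

# Hedged usage sketch for MyFAISS; document contents and names are invented.
from custom_faiss import MyFAISS
from langchain.docstore.document import Document
from langchain.embeddings import OpenAIEmbeddings

docs = [
    Document(page_content="My 8 nouns: robot, guitar, ...", metadata={"name": "Alice Smith"}),
    Document(page_content="My 8 nouns: paint, brush, ...", metadata={"name": "Bob Jones"}),
]
vs = MyFAISS.from_documents(docs, OpenAIEmbeddings(model='text-embedding-ada-002'))
# Every word of each filter value is matched against doc.metadata[key],
# so "Alice Smith" keeps docs whose name contains "Alice" or "Smith".
hits = vs.max_marginal_relevance_search("what are Alice's interests?", k=1,
                                        filter={"name": "Alice Smith"})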
discussion.py ADDED
@@ -0,0 +1,50 @@
+ import requests
+ import json
+ import time
+
+ # Replace these variables with your own information
+ access_token = 'YOUR_ACCESS_TOKEN'
+ course_id = '36263'
+ discussion_topic_id = '421517'
+ base_url = 'https://canvas.illinois.edu'
+
+ headers = {
+     'Authorization': f'Bearer {access_token}'
+ }
+
+ # Create a content export
+ export_url = f'{base_url}/api/v1/courses/{course_id}/content_exports'
+ export_params = {
+     'export_type': 'common_cartridge',
+     'skip_notifications': True,
+     # Rails-style nested params, as the Canvas API expects
+     'select[discussion_topics][]': [discussion_topic_id]
+ }
+
+ export_response = requests.post(export_url, headers=headers, params=export_params)
+
+ if export_response.ok:
+     export_data = export_response.json()
+     # The content export response carries a progress_url to poll
+     progress_url = export_data['progress_url']
+
+     # Check the progress of the content export
+     progress_response = requests.get(progress_url, headers=headers)
+
+     if progress_response.ok:
+         progress_data = progress_response.json()
+         while progress_data['workflow_state'] not in ['completed', 'failed']:
+             time.sleep(2)  # avoid hammering the API while polling
+             progress_response = requests.get(progress_url, headers=headers)
+             progress_data = progress_response.json()
+
+         if progress_data['workflow_state'] == 'completed':
+             # Download the exported content
+             download_url = progress_data['url']
+             download_response = requests.get(download_url)
+
+             if download_response.ok:
+                 # Save the exported content to a file
+                 with open('discussion_topic_export.imscc', 'wb') as f:
+                     f.write(download_response.content)
+ else:
+     print(f'Error: {export_response.text}')
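
The script polls the progress record in a simple fixed-interval loop; for long-running exports, a bounded exponential backoff with a timeout is gentler on the API. A hypothetical helper along those lines (not part of this commit):

import time
import requests

def wait_for_progress(progress_url, headers, max_wait=300):
    # Hypothetical: poll a Canvas Progress record until it settles or we give up.
    delay, waited = 2, 0
    while waited < max_wait:
        data = requests.get(progress_url, headers=headers).json()
        if data['workflow_state'] in ('completed', 'failed'):
            return data
        time.sleep(delay)
        waited += delay
        delay = min(delay * 2, 30)  # back off, capped at 30s between polls
    raise TimeoutError('Content export did not finish in time')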
discussion_.py ADDED
@@ -0,0 +1,110 @@
+ import requests
+ import json
+ import os
+ from typing import List
+
+ class DiscussionEntry:
+     def __init__(self, id: int, parent_id: int, name: str, message: str, replies: List):
+         self.id = id
+         self.parent_id = parent_id
+         self.name = name
+         self.message = message
+         self.replies = replies
+
+     def to_json(self):
+         return {
+             'id': self.id,
+             'parent_id': self.parent_id,
+             'name': self.name,
+             'message': self.message,
+             'replies': [reply.to_json() for reply in self.replies]
+         }
+
+ def extract_entries(entries, participants):
+     result = []
+     for entry in entries:
+         if 'message' in entry and 'deleted' not in entry:
+             id = entry['id']
+             parent_id = entry['parent_id']
+             user_id = entry['user_id']
+             name = next((p['display_name'] for p in participants if p['id'] == user_id), None)
+             message = entry['message']
+             replies = []
+             if 'replies' in entry:
+                 replies = extract_entries(entry['replies'], participants)
+             result.append(DiscussionEntry(id, parent_id, name, message, replies))
+     return result
+
+ def save_messages(entries):
+     for entry in entries:
+         # Save the message as an HTML file
+         filename = f'docs/{entry.name}.html'
+
+         # Open file in write/append mode
+         with open(filename, 'a+') as f:
+             if entry.parent_id is None:
+                 f.write(f'<p><b>Student Post: {entry.name}</b></p>')
+                 f.write(entry.message)
+                 f.write('<hr>')
+             else:
+                 f.write(f'<p><b>Reply to: {entry.parent_id}</b></p>')
+                 f.write(entry.message)
+                 f.write('<hr>')
+
+     # Save the messages of the replies
+     for entry in entries:
+         save_messages(entry.replies)
+
+ # Replace these variables with your own information
+ access_token = ''
+ course_id = '36263'
+ discussion_topic_id = '421517'
+ base_url = 'https://canvas.illinois.edu'
+
+ headers = {
+     'Authorization': f'Bearer {access_token}'
+ }
+
+ # Retrieve the full discussion topic data
+ discussion_url = f'{base_url}/api/v1/courses/{course_id}/discussion_topics/{discussion_topic_id}/view'
+ discussion_response = requests.get(discussion_url, headers=headers)
+
+ if discussion_response.ok:
+     discussion_data = discussion_response.json()
+     with open('discussion_data.json', 'w') as f:
+         json.dump(discussion_data, f)
+
+     # Extract the desired fields from the replies and responses
+     entries = extract_entries(discussion_data['view'], discussion_data['participants'])
+
+     # Save the extracted data to a file
+     with open('discussion_entries.json', 'w') as f:
+         json.dump([entry.to_json() for entry in entries], f)
+
+     # Create the docs/ directory if it does not exist
+     os.makedirs('docs', exist_ok=True)
+
+     # Save the messages as HTML files under the docs/ directory
+     save_messages(entries)
+
+     # Extract the rubric and save it to a file
+     if 'rubric' in discussion_data:
+         rubric = discussion_data['rubric']
+         with open('rubric.json', 'w') as f:
+             json.dump(rubric, f)
+ else:
+     print(f'Error: {discussion_response.text}')
+
+ rubric_url = f'{base_url}/api/v1/courses/{course_id}/discussion_topics/{discussion_topic_id}'
+ rubric_response = requests.get(rubric_url, headers=headers)
+
+ if rubric_response.ok:
+     rubric_data = rubric_response.json()
+     # print(rubric_data)
+     if 'rubric' in rubric_data['assignment']:
+         rubric = rubric_data['assignment']['rubric']
+         with open('rubric_data.json', 'w') as f:
+             json.dump(rubric, f)
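
For reference, `DiscussionEntry.to_json` nests replies under their parent, so `discussion_entries.json` ends up with records shaped like this (ids and names invented for illustration):

# Hypothetical shape of one record in discussion_entries.json:
example_entry = {
    'id': 101,
    'parent_id': None,
    'name': 'Alice Smith',
    'message': '<p>My 8 nouns are ...</p>',
    'replies': [
        {'id': 102, 'parent_id': 101, 'name': 'Bob Jones',
         'message': '<p>Great list!</p>', 'replies': []},
    ],
}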
discussion_1.py ADDED
@@ -0,0 +1,39 @@
+ import requests
+ import json
+
+ # Replace these variables with your own information
+ access_token = ''
+ course_id = '36263'
+ discussion_topic_id = '421517'
+ base_url = 'https://canvas.illinois.edu'
+
+ headers = {
+     'Authorization': f'Bearer {access_token}'
+ }
+
+ # Retrieve the full discussion topic data
+ discussion_url = f'{base_url}/api/v1/courses/{course_id}/discussion_topics/{discussion_topic_id}/view'
+ discussion_response = requests.get(discussion_url, headers=headers)
+
+ if discussion_response.ok:
+     discussion_data = discussion_response.json()
+
+     with open('discussion_data.json', 'w') as f:
+         json.dump(discussion_data, f)
+
+     # Extract the replies and responses
+     discussions = []
+     replies = []
+     for entry in discussion_data['view']:
+         discussions.append(entry)  # append the entry itself, not its keys
+         if 'replies' in entry:
+             replies.extend(entry['replies'])
+
+     with open('discussions.json', 'w') as f:
+         json.dump(discussions, f)
+
+     # Save the replies and responses to a file
+     with open('discussion_replies.json', 'w') as f:
+         json.dump(replies, f)
+ else:
+     print(f'Error: {discussion_response.text}')
main.py ADDED
@@ -0,0 +1,11 @@
+ from utils import get_search_index, generate_answer, set_model_and_embeddings, get_question_type
+
+ def index():
+     set_model_and_embeddings()
+     get_search_index()
+     return True
+
+ def run(question):
+     index()
+     # return generate_answer(question)
+     return get_question_type(question)
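
A minimal driver for these two entry points might look as follows (the question is invented; assumes OPENAI_API_KEY is set and the docs/ folder produced by discussion_.py exists):

# Hedged smoke test for main.py, not part of the commit.
from main import index, run

index()  # builds or loads the FAISS index under models/
print(run("How did Alice Smith do on the 8 nouns discussion?"))  # hypothetical question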
models/openai_vs.index ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:76c918e13944c7b3a409671e5d5ec4f94a4b260b82bd197b2bc3a39c433e1f9d
+ size 196653
models/openai_vs.pkl ADDED
Binary file (277 kB)
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ langchain
+ openai
+ faiss-cpu==1.7.3
+ unstructured==0.5.8
+ ffmpeg-python
+ transformers
+ gtts
+ torch
+ tiktoken
+ huggingface-hub
+ google-generativeai
+ gradio
+ jq
schema.py ADDED
@@ -0,0 +1,30 @@
+ import json
+
+ from pydantic import BaseModel
+ from typing import List, Optional
+
+ class ForumUser(BaseModel):
+     id: int
+     anonymous_id: str
+     display_name: str
+     avatar_image_url: str
+     html_url: str
+     pronouns: Optional[str]
+
+ class ForumPost(BaseModel):
+     id: int
+     user_id: int
+     parent_id: Optional[int]
+     created_at: str
+     updated_at: str
+     rating_count: Optional[int]
+     rating_sum: Optional[int]
+     user_name: str
+     message: str
+     user: ForumUser
+     read_state: str
+     forced_read_state: bool
+
+ def get_data_from_json(file_path):
+     with open(file_path, "r") as f:
+         json_data = json.load(f)
+     data = [ForumPost(**item) for item in json_data]
+     return data
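
A short usage sketch (the file name is hypothetical; the JSON must carry exactly the ForumPost fields):

# Hedged example: parse a dump of Canvas posts into typed ForumPost models.
from schema import get_data_from_json

posts = get_data_from_json("discussion_posts.json")  # hypothetical file
print(posts[0].user.display_name, posts[0].message[:80])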
utils.py ADDED
@@ -0,0 +1,515 @@
+ import os
+ import pickle
+ import re
+ from typing import Any, Callable, List, Optional, Tuple, Union
+
+ import faiss
+ import langchain
+ from langchain import HuggingFaceHub, PromptTemplate
+ from langchain.agents import (AgentExecutor, AgentOutputParser, AgentType, BaseMultiActionAgent,
+                               LLMSingleActionAgent, Tool, initialize_agent)
+ from langchain.cache import InMemoryCache
+ from langchain.chains import ConversationalRetrievalChain, LLMChain, create_tagging_chain
+ from langchain.chat_models import ChatGooglePalm, ChatOpenAI
+ from langchain.document_loaders import DirectoryLoader, JSONLoader, TextLoader, UnstructuredHTMLLoader
+ from langchain.embeddings import HuggingFaceHubEmbeddings, OpenAIEmbeddings
+ from langchain.llms import OpenAI
+ from langchain.memory import ConversationBufferWindowMemory
+ from langchain.output_parsers import PydanticOutputParser
+ from langchain.prompts.chat import (
+     ChatPromptTemplate,
+     HumanMessagePromptTemplate,
+     StringPromptTemplate,
+     SystemMessagePromptTemplate,
+ )
+ from langchain.schema import AgentAction, AgentFinish
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain.tools import StructuredTool
+ from langchain.tools.json.tool import JsonSpec
+ from custom_faiss import MyFAISS
+ from pydantic import BaseModel, Field
+
+ class ToolArgsSchema(BaseModel):
+     student_name: Optional[str] = Field(description="The name of the student")
+     question: str = Field(description="The question being asked")
+     question_type: str = Field(description="The type of question being asked")
+     interest: Optional[str] = Field(description="The interest of the student")
+
+     class Config:
+         schema_extra = {
+             "required": ["question", "question_type"]
+         }
+
+
+ langchain.llm_cache = InMemoryCache()
+
+ model_name = "GPT-4"
+
+ pickle_file = "_vs.pkl"
+ index_file = "_vs.index"
+ models_folder = "models/"
+ os.environ["LANGCHAIN_TRACING"] = "true"
+ discussions_file_path = "discussion_entries.json"
+
+ llm = OpenAI(model_name="gpt-3.5-turbo-16k", temperature=0, verbose=True)
+
+ embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
+
+ chat_history = []
+
+ memory = ConversationBufferWindowMemory(memory_key="chat_history", k=10)
+
+ vectorstore_index = None
+
+ agent_prompt = """
+ I am the LLM AI canvas discussion grading assistant.
+ I can answer two types of questions: grade-based questions and interest-based questions.
+ Grade-based questions are about the grades of a certain student or a group of students, based on the rubric below, for the canvas discussion on the topic 8 nouns.
+ Interest-based questions are about the interests or skills of a certain student or a group of students based on their discussion posts.
+ You have access to the following tools:
+
+ {tools}
+
+ Use the following format:
+
+ Question: the input question you must answer
+ Thought: you should always think about what type of question it is
+ Action: the action to take, should be one of [{tool_names}]
+ Action Input: the input to the action
+ Observation: the result of the action
+ ... (this Thought/Action/Action Input/Observation can repeat N times)
+ Thought: I now know the final answer
+ Final Answer: the final answer to the original input question
+
+ Begin!
+
+ Question: {input}
+ {agent_scratchpad}
+ """
+
+ # Set up a prompt template
+ class CustomPromptTemplate(StringPromptTemplate):
+     # The template to use
+     template: str
+     # The list of tools available
+     tools_getter: Callable
+
+     def format(self, **kwargs) -> str:
+         # Get the intermediate steps (AgentAction, Observation tuples)
+         # and format them in a particular way
+         intermediate_steps = kwargs.pop("intermediate_steps")
+         thoughts = ""
+         for action, observation in intermediate_steps:
+             thoughts += action.log
+             thoughts += f"\nObservation: {observation}\nThought: "
+         # Set the agent_scratchpad variable to that value
+         kwargs["agent_scratchpad"] = thoughts
+         tools = self.tools_getter(kwargs["input"])
+         # Create a tools variable from the list of tools provided
+         kwargs["tools"] = "\n".join(
+             [f"{tool.name}: {tool.description}" for tool in tools]
+         )
+         # Create a list of tool names for the tools provided
+         kwargs["tool_names"] = ", ".join([tool.name for tool in tools])
+         return self.template.format(**kwargs)
+
+ class CustomOutputParser(AgentOutputParser):
+
+     def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
+         print("llm_output")
+         print(llm_output)
+         # Check if agent should finish
+         if "Final Answer:" in llm_output:
+             return AgentFinish(
+                 # Return values is generally always a dictionary with a single `output` key
+                 # It is not recommended to try anything else at the moment :)
+                 return_values={"output": llm_output.split("Final Answer:")[-1].strip()},
+                 log=llm_output,
+             )
+         # Parse out the action and action input
+         regex = r"Action\s*\d*\s*:(.*?)\nAction\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)"
+         match = re.search(regex, llm_output, re.DOTALL)
+         if not match:
+             raise ValueError(f"Could not parse LLM output: `{llm_output}`")
+         action = match.group(1).strip()
+         action_input = match.group(2)
+         # Return the action and action input
+         return AgentAction(tool=action, tool_input=action_input.strip(" ").strip('"'), log=llm_output)
+
+ system_template = """
+ I am the LLM AI canvas discussion grading assistant.
+ I can answer two types of questions: grade-based questions and interest-based questions.
+ Grade-based questions are about the grades of a certain student or a group of students, based on the rubric below, for the canvas discussion on the topic 8 nouns.
+ Interest-based questions are about the interests or skills of a certain student or a group of students based on their discussion posts.
+ To grade student discussions, I will follow the rubric below.
+
+ Student Post
+
+ 3 points: Post includes 8 nouns and text describing how these nouns relate to the student.
+ 2 points: Student's post includes 8 nouns but does not offer how those nouns relate to the student.
+ 1 point: Student's post has significant missing details.
+ 0 points: The student does not provide an initial post, or otherwise does not follow assignment instructions.
+
+ Response to Others
+
+ 3 points: Student responds to at least 3 other student discussion threads AND responds to questions asked of them. Student posts insightful comments that prompt on-target discussion. These posts also avoid throwaway comments such as "I agree", "Me too", "Good idea".
+ 2 points: Student was notably lacking in one criterion.
+ 1 point: Student was notably lacking in two criteria.
+ 0 points: The student does not interact in the threads of other students.
+
+ I will be able to identify each student by name, and I will be able to share their likings, interests, and other characteristics. I will also be able to filter out students based on their interests.
+
+ I will not deviate from the grading scheme. I will grade each discussion entry and reply carefully, and I will share the grades of all individuals by name on the basis of the rubric, with a final score.
+
+ The discussions and their replies are in the following format:
+ Student Post: Student Name
+ Reply to: Another Student Discussion ID
+
+ The following are the relevant discussions to grade or to answer the interest-based questions
+ ----------------
+ Discussions:
+ {context}"""
+
+ messages = [
+     SystemMessagePromptTemplate.from_template(system_template),
+     HumanMessagePromptTemplate.from_template("{question}"),
+ ]
+ CHAT_PROMPT = ChatPromptTemplate.from_messages(messages)
+
+
+ def set_model_and_embeddings():
+     global chat_history
+     # set_model(model)
+     # set_embeddings(model)
+     chat_history = []
+
+
+ def set_embeddings(model):
+     global embeddings
+     if model == "GPT-3.5" or model == "GPT-4":
+         print("Loading OpenAI embeddings")
+         embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
+     elif model == "Flan UL2" or model == "Flan T5":
+         print("Loading Hugging Face embeddings")
+         embeddings = HuggingFaceHubEmbeddings(repo_id="sentence-transformers/all-MiniLM-L6-v2")
+
+
+ def get_search_index():
+     global vectorstore_index, model_name
+     if os.path.isfile(get_file_path(model_name, pickle_file)) and os.path.isfile(
+             get_file_path(model_name, index_file)) and os.path.getsize(get_file_path(model_name, pickle_file)) > 0:
+         # Load index from pickle file
+         with open(get_file_path(model_name, pickle_file), "rb") as f:
+             # search_index = Chroma(persist_directory=models_folder, embedding_function=embeddings)
+             search_index = pickle.load(f)
+         print("Loaded index")
+     else:
+         search_index = create_index(model_name)
+         print("Created index")
+
+     vectorstore_index = search_index
+     return search_index
+
+
+ def create_index(model):
+     source_chunks = create_chunk_documents()
+     search_index = search_index_from_docs(source_chunks)
+     # search_index.persist()
+     faiss.write_index(search_index.index, get_file_path(model, index_file))
+     # Save index to pickle file
+     with open(get_file_path(model, pickle_file), "wb") as f:
+         pickle.dump(search_index, f)
+     return search_index
+
+
+ def get_file_path(model, file):
+     # If model is GPT-3.5 or GPT-4, return models_folder + "openai" + file;
+     # else return models_folder + "hf" + file
+     if model == "GPT-3.5" or model == "GPT-4":
+         return models_folder + "openai" + file
+     else:
+         return models_folder + "hf" + file
+
+
+ def search_index_from_docs(source_chunks):
+     # print("source chunks: " + str(len(source_chunks)))
+     # print("embeddings: " + str(embeddings))
+     search_index = MyFAISS.from_documents(source_chunks, embeddings)
+     return search_index
+
+
+ def get_html_files():
+     loader = DirectoryLoader('docs', glob="**/*.html", loader_cls=UnstructuredHTMLLoader, recursive=True)
+     document_list = loader.load()
+     for document in document_list:
+         document.metadata["name"] = document.metadata["source"].split("/")[-1].split(".")[0]
+     return document_list
+
+
+ def metadata_func(record: dict, metadata: dict) -> dict:
+     metadata["name"] = record.get("name")
+     return metadata
+
+
+ def get_json_file():
+     global discussions_file_path
+     loader = JSONLoader(
+         file_path=discussions_file_path,
+         jq_schema='.[]', metadata_func=metadata_func, content_key="message")
+     return loader.load()
+
+
+ def fetch_data_for_embeddings():
+     # document_list = get_text_files()
+     document_list = get_html_files()
+     # document_list = get_json_file()
+     print("document list: " + str(len(document_list)))
+     return document_list
+
+
+ def get_text_files():
+     loader = DirectoryLoader('docs', glob="**/*.txt", loader_cls=TextLoader, recursive=True)
+     document_list = loader.load()
+     return document_list
+
+
+ def create_chunk_documents():
+     sources = fetch_data_for_embeddings()
+
+     splitter = CharacterTextSplitter(separator=" ", chunk_size=800, chunk_overlap=0)
+
+     source_chunks = splitter.split_documents(sources)
+
+     print("chunks: " + str(len(source_chunks)))
+
+     return source_chunks
+
+
+ def get_qa_chain(vectorstore_index, question, metadata):
+     global llm, model_name
+     print(llm)
+     filter_dict = {"name": metadata.student_name}
+     # embeddings_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.76)
+     # compression_retriever = ContextualCompressionRetriever(base_compressor=embeddings_filter, base_retriever=gpt_3_5_index.as_retriever())
+     retriever = get_retriever(filter_dict, vectorstore_index, metadata)
+
+     print(retriever.get_relevant_documents(question))
+
+     chain = ConversationalRetrievalChain.from_llm(llm, retriever, return_source_documents=True,
+                                                   verbose=True, get_chat_history=get_chat_history,
+                                                   combine_docs_chain_kwargs={"prompt": CHAT_PROMPT})
+     return chain
+
+
+ def get_retriever(filter_dict, vectorstore_index, metadata):
+     if metadata.question_type == "grade-based":
+         retriever = vectorstore_index.as_retriever(search_type='mmr',
+                                                    search_kwargs={'lambda_mult': 1, 'fetch_k': 20, 'k': 10,
+                                                                   'filter': filter_dict})
+     else:
+         retriever = vectorstore_index.as_retriever(search_type='mmr',
+                                                    search_kwargs={'lambda_mult': 1, 'fetch_k': 20, 'k': 10})
+
+     return retriever
+
+
+ def get_chat_history(inputs) -> str:
+     res = []
+     for human, ai in inputs:
+         res.append(f"Human:{human}\nAI:{ai}")
+     return "\n".join(res)
+
+
+ def generate_answer(question, metadata: ToolArgsSchema) -> str:
+     global chat_history, vectorstore_index
+     chain = get_qa_chain(vectorstore_index, question, metadata)
+
+     result = chain(
+         {"question": question, "chat_history": chat_history})
+     chat_history.extend([(question, result["answer"])])
+     sources = []
+     print(result)
+
+     for document in result['source_documents']:
+         source = document.metadata['source']
+         sources.append(source.split('/')[-1].split('.')[0])
+     print(sources)
+
+     source = ',\n'.join(set(sources))
+     # return result['answer'] + '\nSOURCES: ' + source
+     return result['answer']
+
+
+ def get_question_type(question):
+     parser = PydanticOutputParser(pydantic_object=ToolArgsSchema)
+     prompt_template = """I can answer two types of questions: grade-based questions and interest-based questions.
+     Grade-based questions are about the grades of a certain student or a group of students, based on the rubric below, for the canvas discussion on the topic 8 nouns.
+     Interest-based questions are about the interests or skills of a certain student or a group of students based on their discussion posts.
+     Question: {question}
+     Find the following information about the question asked. Return an empty Optional if the information is not available:
+     Format instructions: {format_instructions}"""
+
+     llm = OpenAI(temperature=0)
+     prompt = PromptTemplate(template=prompt_template, input_variables=["question"], output_parser=parser,
+                             partial_variables={"format_instructions": parser.get_format_instructions()})
+     llm_chain = LLMChain(
+         llm=llm,
+         prompt=prompt,
+     )
+     output = llm_chain.run(question)
+     output = parser.parse(output)
+     output = generate_answer(question, output)
+     return output
+
+
+ # class FakeAgent(BaseMultiActionAgent):
+ #     """Fake Custom Agent."""
+ #
+ #     @property
+ #     def input_keys(self):
+ #         return ["input"]
+ #
+ #     def plan(
+ #         self, intermediate_steps: List[Tuple[AgentAction, str]], **kwargs: Any
+ #     ) -> Union[List[AgentAction], AgentFinish]:
+ #         print("input keys")
+ #         print(self.input_keys)
+ #         print("intermediate steps")
+ #         print(intermediate_steps)
+ #         print("kwargs")
+ #         print(kwargs)
+ #
+ #         """Given input, decide what to do.
+ #
+ #         Args:
+ #             intermediate_steps: Steps the LLM has taken to date,
+ #                 along with observations
+ #             **kwargs: User inputs.
+ #
+ #         Returns:
+ #             Action specifying what tool to use.
+ #         """
+ #         if len(intermediate_steps) == 0:
+ #             first_action = AgentAction(tool="question type", tool_input=kwargs["input"], log="")
+ #             print("first action")
+ #             print(first_action)
+ #             second_action = AgentAction(tool="Grade", tool_input=kwargs["input"], log="")
+ #             print("second action")
+ #             print(second_action)
+ #             return [
+ #                 first_action,
+ #                 second_action,
+ #             ]
+ #         else:
+ #             return AgentFinish(return_values={"output": "bar"}, log="")
+ #
+ #     async def aplan(
+ #         self, intermediate_steps: List[Tuple[AgentAction, str]], **kwargs: Any
+ #     ) -> Union[List[AgentAction], AgentFinish]:
+ #         """Given input, decide what to do.
+ #
+ #         Args:
+ #             intermediate_steps: Steps the LLM has taken to date,
+ #                 along with observations
+ #             **kwargs: User inputs.
+ #
+ #         Returns:
+ #             Action specifying what tool to use.
+ #         """
+ #         if len(intermediate_steps) == 0:
+ #             return [
+ #                 AgentAction(tool="question type", tool_input=kwargs["input"], log=""),
+ #                 AgentAction(tool="Grade",
+ #                             tool_input={
+ #                                 "student_name": kwargs["student_name"],
+ #                                 "question": kwargs["question"],
+ #                                 "question_type": kwargs["question_type"],
+ #                                 "interest": kwargs["interest"]
+ #                             }, log=""),
+ #             ]
+ #         else:
+ #             return AgentFinish(return_values={"output": "bar"}, log="")
+ #
+ #
+ # schema = {
+ #     "properties": {
+ #         "student_name": {"type": "string", "description": "The name of the student"},
+ #         "question": {"type": "string", "description": "The question being asked"},
+ #         "question type": {"type": "string",
+ #                           "enum": ["student grades", "student specific", "interest specific"],
+ #                           "description": "The type of question being asked"},
+ #         "interest": {"type": "string", "description": "The interest of the student"},
+ #     },
+ #     "required": ["question", "question type"]
+ # }
+
+
+ # def get_tagging_chain(question) -> str:
+ #     global schema
+ #     chain = create_tagging_chain(schema, llm)
+ #     first_answer = chain.run(question)
+ #     print("first answer:")
+ #     print(first_answer)
+ #     return first_answer
+ #
+ #
+ # def get_grading_agent():
+ #     tools = [
+ #         Tool(
+ #             name="question type",
+ #             func=get_tagging_chain,
+ #             description="Useful when you need to understand the type of the input."
+ #         ),
+ #         StructuredTool(
+ #             name="Grade",
+ #             func=generate_answer,
+ #             description="Useful when you need to answer questions about students, grades, interests, etc. from the context of canvas discussion posts. If the question is student specific, the student name is required.",
+ #             args_schema=ToolArgsSchema
+ #         )
+ #     ]
+ #     # agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)
+ #
+ #     agent = FakeAgent(output_parser=CustomOutputParser())
+ #     # prompt = CustomPromptTemplate(template=agent_prompt, tools=tools, input_variables=["input", "intermediate_steps"])
+ #     # output_parser = CustomOutputParser()
+ #     # tool_names = [tool.name for tool in tools]
+ #     # llm_chain = LLMChain(llm=llm, prompt=prompt)
+ #     # agent = LLMSingleActionAgent(
+ #     #     llm_chain=llm_chain,
+ #     #     output_parser=output_parser,
+ #     #     stop=["\nObservation:"],
+ #     #     allowed_tools=tool_names,
+ #     # )
+ #     agent_executor = AgentExecutor.from_agent_and_tools(
+ #         agent=agent, tools=tools, verbose=True
+ #     )
+ #
+ #     # return initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=True)
+ #     return agent_executor
+ #
+ #
+ # def grade_answer(question) -> str:
+ #     global chat_history, vectorstore_index
+ #     agent = get_grading_agent()
+ #     return agent.run(question)
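
To see what `get_question_type` hands to `generate_answer`, the PydanticOutputParser round-trip can be exercised in isolation (the raw JSON below stands in for an LLM response; importing utils requires an OPENAI_API_KEY, since module import constructs the OpenAI client):

# Hedged illustration of the ToolArgsSchema parsing step.
from langchain.output_parsers import PydanticOutputParser
from utils import ToolArgsSchema

parser = PydanticOutputParser(pydantic_object=ToolArgsSchema)
raw = '{"student_name": "Alice Smith", "question": "What is her grade?", "question_type": "grade-based", "interest": null}'
args = parser.parse(raw)
print(args.student_name, args.question_type)  # -> Alice Smith grade-based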