Spaces · Aiswarya Sankar committed · Commit 2613437 · Parent(s): d2e86b9

Update the way the model is chosen
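For readers skimming the diff: the core of this change is that index_repo now derives the repository from UI input instead of a hard-coded URL. A minimal, free-standing sketch of that selection logic, simplified from the diff below (assumptions: the mapping is abbreviated, the Gradio wiring and Response plumbing are elided, and resolve_repo is a hypothetical name used only for illustration):

# Sketch of the new selection rule: a non-empty textbox overrides the
# dropdown; dropdown labels resolve through a fixed name-to-URL mapping.
REPO_MAPPING = {
    "Langchain": "https://github.com/langchain-ai/langchain.git",
    "MemeAI": "https://github.com/aiswaryasankar/memeAI.git",
}

def resolve_repo(textbox: str, dropdown: list) -> str:
    # The app passes a CheckboxGroup value, i.e. a list of selected
    # labels, so the first selection is used when the textbox is empty.
    if textbox != "":
        return textbox
    return REPO_MAPPING[dropdown[0]]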
app.py CHANGED
@@ -15,8 +15,10 @@ import os
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.vectorstores import DeepLake
 import random
+import time
+import together

-os.environ['OPENAI_API_KEY']='sk-
+os.environ['OPENAI_API_KEY']='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'
 os.environ['ACTIVELOOP_TOKEN']='eyJhbGciOiJIUzUxMiIsImlhdCI6MTY4MTU5NTgyOCwiZXhwIjoxNzEzMjE4MTU5fQ.eyJpZCI6ImFpc3dhcnlhcyJ9.eoiMFZsS20zzMXXupFbowUlLdgIgf_MA1ck_DByzREeoQvNm8GPhKEfqea2y1Qak-ud2jo9dhSTBTfRe1ztezw'


@@ -25,15 +27,13 @@ from langchain.document_loaders import TextLoader
 from langchain.text_splitter import CharacterTextSplitter

 import subprocess
-repo_name = "https://github.com/aiswaryasankar/memeAI.git"
+# repo_name = "https://github.com/aiswaryasankar/memeAI.git"

 from langchain.callbacks.base import BaseCallbackHandler
 from langchain.schema import LLMResult
 from typing import Any, Union

-
-
-job_done = object() # signals the processing is done
+job_done = object()

 class StreamingGradioCallbackHandler(BaseCallbackHandler):
     def __init__(self, q: SimpleQueue):

@@ -80,8 +80,15 @@ class GithubResponse(BaseModel):
     repo: str


-
+# global repoName
+global ticket_titles
+global tickets
+global ticket_choices
+tickets = []
+
+repoName = "https://github.com/aiswaryasankar/memeAI.git"

+embeddings = OpenAIEmbeddings(disallowed_special=())

 def git_clone(repo_url):
     subprocess.run(["git", "clone", repo_url])

@@ -91,46 +98,89 @@ def git_clone(repo_url):
     return dirpath


-def index_repo(repo: str) -> Response:
+def index_repo(textbox: str, dropdown: str) -> Response:
+
+    mapping = {
+        "Langchain" : "https://github.com/langchain-ai/langchain.git",
+        "Weaviate": "https://github.com/weaviate/weaviate.git",
+        "Llama2": "https://github.com/facebookresearch/llama.git",
+        "OpenAssistant": "https://github.com/LAION-AI/Open-Assistant.git",
+        "MemeAI": "https://github.com/aiswaryasankar/memeAI.git",
+        "GenerativeAgents": "https://github.com/joonspk-research/generative_agents.git"
+    }
+
+    # print(textbox)
+    # print(dropdown[0])
+
+    if textbox != "":
+        repo = textbox
+    else:
+        repo = mapping[dropdown[0]]
+    # repoName = gr.State(repo)
+
+    print("Repo name after setting the value: " + str(repoName))
     pathName = git_clone(repo)
     root_dir = './' + pathName

-
-    for dirpath, dirnames, filenames in os.walk(root_dir):
-        for file in filenames:
-            try:
-                loader = TextLoader(os.path.join(dirpath, file), encoding='utf-8')
-                docs.extend(loader.load_and_split())
-            except Exception as e:
-                print("Exception: " + str(e) + "| File: " + os.path.join(dirpath, file))
-                pass
-
+    print("Repo name after setting the value: " + str(repoName))
     activeloop_username = "aiswaryas"
     dataset_path = f"hub://{activeloop_username}/" + pathName
-    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
-    texts = text_splitter.split_documents(docs)
-
-    print(texts)
-    for text in texts:
-        print(text)

     try:
         db = DeepLake(dataset_path=dataset_path,
-
-
+                      embedding_function=embeddings,
+                      token=os.environ['ACTIVELOOP_TOKEN'],
+                      read_only=True,
+                      num_workers=10)
         # NOTE: read_only=False because we want to ingest documents
         # NOTE: This will raise a `deeplake.util.exceptions.LockedException` if dataset is already locked
         # NOTE: change it to read_only=True when querying the dataset

-        #
-
-
-
-
-
-
-
+        # If it is empty, then hydrate otherwise leave it alone
+        print(db)
+        if db is None:
+            print("Dataset doesn't exist, fetching data")
+            try:
+                docs = []
+                for dirpath, dirnames, filenames in os.walk(root_dir):
+                    for file in filenames:
+                        try:
+                            loader = TextLoader(os.path.join(dirpath, file), encoding='utf-8')
+                            docs.extend(loader.load_and_split())
+                        except Exception as e:
+                            print("Exception: " + str(e) + "| File: " + os.path.join(dirpath, file))
+                            pass
+
+                activeloop_username = "aiswaryas"
+                dataset_path = f"hub://{activeloop_username}/" + pathName
+                text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+                texts = text_splitter.split_documents(docs)
+
+                db = DeepLake(dataset_path=dataset_path,
+                              embedding_function=embeddings,
+                              token=os.environ['ACTIVELOOP_TOKEN'],
+                              read_only=False)
+                # Do this in chunks to avoid hitting the ratelimit immediately
+                for i in range(0, len(texts), 500):
+                    db.add_documents(texts[i:i+500])
+                    time.sleep(.1)
+
+            except Exception as e:
+                return Response(
+                    result= "Failed to index github repo",
+                    repo="",
+                    error=str(e),
+                    stdout="",
+                )
+            # print("Dataset not empty. Deleting existing dataset...")
+            # db.ds.delete()
+            # print("Done.")
+            # # Reinitialize
+            # db = DeepLake(dataset_path=dataset_path,
+            #     embedding_function=embeddings,
+            #     token=os.environ['ACTIVELOOP_TOKEN'], read_only=False)
+        else:
+            print("Dataset already exists")

     except Exception as e:
         return Response(

@@ -140,29 +190,35 @@ def index_repo(repo: str) -> Response:
             stdout="",
         )

-
-
-
-
-
-            repo="",
-            error=str(e),
-            stdout="",
-        )
-
-    db.ds._unlock()
-
-    return
+    global ticket_choices, ticket_titles, tickets
+    print("REPO name in bug triage: " + str(repoName))
+    repo = "/".join(repoName[:-4].split("/")[-2:])
+    tickets = fetchGithubIssues(repo, 10)
+    print("tickets: " + str(tickets))
+
+    # Create the dropdown
+    ticket_choices = {ticket["title"]: ticket for ticket in tickets}
+    ticket_titles = [ticket["title"] for ticket in tickets]
+
+    print("Repo name before return: " + str(repoName))
+
+    return {
+        success_response: "SUCCESS",
+        # repoName: repoName,
+        # repoTextBox: repoName,
+        # ingestedRepos: ingestedRepos,
+        launch_product: gr.update(visible=True)
+    }


 def answer_questions(question: str, github: str, **kwargs) -> Response:

-    global
-
+    global repoName
+    print("Repo name")
+    github = repoName[:-4]
+    print(github)
     try:
-        embeddings = OpenAIEmbeddings(openai_api_key="sk-
+        embeddings = OpenAIEmbeddings(openai_api_key="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")
         pathName = github.split('/')[-1]
         dataset_path = "hub://aiswaryas/" + pathName

@@ -185,7 +241,7 @@ def answer_questions(question: str, github: str, **kwargs) -> Response:
             callback_manager=CallbackManager(
                 [StreamingGradioCallbackHandler(q)]
             ),
-            openai_api_key="sk-
+            openai_api_key="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",
         )
         qa = ConversationalRetrievalChain.from_llm(model,retriever=retriever)
         chat_history = []

@@ -207,6 +263,7 @@ def answer_questions(question: str, github: str, **kwargs) -> Response:
             stdout="",
         )

+
 def fetchGithubIssues(repo: str, num_issues:int, **kwargs) -> Response:
     """
     This endpoint should get a list of all the github issues that are open for this repository

@@ -230,12 +287,12 @@ def fetchGithubIssues(repo: str, num_issues:int, **kwargs) -> Response:

         batch.extend(issues.json())
         for issue in issues.json():
-
-
-
-
-
-
+            issues_data.append({
+                "issue_url": issue["url"],
+                "title": issue["title"],
+                "body": issue["body"],
+                "comments_url": issue["comments_url"],
+            })

         print(issues_data)
         return issues_data

@@ -265,97 +322,96 @@ def generateFolderNamesForRepo(repo):

     return dirs[0]

+
 def generateDocumentationPerFolder(dir, github):

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    db = DeepLake(dataset_path=dataset_path, read_only=True, embedding_function=embeddings)
-
-    # print("finished indexing repo")
-    retriever = db.as_retriever()
-    retriever.search_kwargs['distance_metric'] = 'cos'
-    retriever.search_kwargs['fetch_k'] = 100
-    retriever.search_kwargs['maximal_marginal_relevance'] = True
-    retriever.search_kwargs['k'] = 20
-
-    # streaming_handler = kwargs.get('streaming_handler')
-    model = ChatOpenAI(
-        model_name='gpt-4',
-        temperature=0.0,
-        verbose=True,
-        streaming=True, # Pass `streaming=True` to make sure the client receives the data.
-        openai_api_key="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",
-    )
-    qa = ConversationalRetrievalChain.from_llm(model,retriever=retriever)
-    chat_history = []
-
-
-    return str(e)
-
-    #
-
-
-
-
+    if dir == "overview":
+        prompt= """
+          Summarize the structure of the {} repository. Make a list of all endpoints and their behavior. Explain
+          how this module is used in the scope of the larger project. Format the response as code documentation with an
+          Overview, Architecture and Implementation Details. Within implementation details, list out each function and provide
+          an overview of that function.
+        """.format(github)
+    else:
+        prompt= """
+          Summarize how {} is implemented in the {} repository. Make a list of all functions and their behavior. Explain
+          how this module is used in the scope of the larger project. Format the response as code documentation with an
+          Overview, Architecture and Implementation Details. Within implementation details, list out each function and provide
+          an overview of that function.
+        """.format(dir, github)
+
+    print(prompt)
+    try:
+        embeddings = OpenAIEmbeddings(openai_api_key="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")
+        pathName = github.split('/')[-1]
+        print("PATH NAME: " + str(pathName))
+        dataset_path = "hub://aiswaryas/" + pathName
+
+        db = DeepLake(dataset_path=dataset_path, read_only=True, embedding_function=embeddings)
+
+        # print("finished indexing repo")
+        retriever = db.as_retriever()
+        retriever.search_kwargs['distance_metric'] = 'cos'
+        retriever.search_kwargs['fetch_k'] = 100
+        retriever.search_kwargs['maximal_marginal_relevance'] = True
+        retriever.search_kwargs['k'] = 20
+
+        # streaming_handler = kwargs.get('streaming_handler')
+        model = ChatOpenAI(
+            model_name='gpt-4',
+            temperature=0.0,
+            verbose=True,
+            streaming=True, # Pass `streaming=True` to make sure the client receives the data.
+            openai_api_key="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",
+        )
+        qa = ConversationalRetrievalChain.from_llm(model,retriever=retriever)
+        chat_history = []
+        return qa({"question": prompt, "chat_history": chat_history})["answer"]
+
+    except Exception as e:
+        print (str(e))
+        return "Failed to generate documentation"
+
+    # history[-1][1] = ""
+    # for char in qa({"question": prompt, "chat_history": chat_history}):
+    #     history[-1][1] += char
+    #     time.sleep(0.01)
+    #     yield history


-def generateArchitectureDiagram(folder) -> Response:
-    """
-    This endpoint should generate a Mermaid diagram for the given input files. It will return the
-    """


 def solveGithubIssue(ticket, history) -> Response:
     """
     This endpoint takes in a github issue and then queries the db for the question against the codebase.
     """
+    global repoName
     print(history)
-    global
-    github =
+    global ticket_choices
+    github = repoName[:-4]
+
     repoFolder = github.split("/")[-1]
     body = ticket_choices[ticket]["body"]
     title = ticket_choices[ticket]["title"]
     question = """
-
-
-
-
-
+        Given the code in the {} repo, propose a solution for this ticket {} that includes a
+        high level implementation, narrowing down the root cause of the issue and psuedocode if
+        applicable on how to resolve the issue. If multiple changes are required to address the
+        problem, list out each of the steps and a brief explanation for each one.
+    """.format(repoFolder, body)

     q_display = """
-
-
+        How would I approach solving this ticket: {}. Here is a summary of the issue: {}
+    """.format(title, body)

     print(question)

     try:
-        embeddings = OpenAIEmbeddings(openai_api_key="sk-
+        embeddings = OpenAIEmbeddings(openai_api_key="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")
         pathName = github.split('/')[-1]
         dataset_path = "hub://aiswaryas/" + pathName

-        db = DeepLake(dataset_path=dataset_path, read_only=True,
+        db = DeepLake(dataset_path=dataset_path, read_only=True, embedding=embeddings)

         # print("finished indexing repo")
         retriever = db.as_retriever()

@@ -373,34 +429,35 @@ def solveGithubIssue(ticket, history) -> Response:
             callback_manager=CallbackManager(
                 [StreamingGradioCallbackHandler(q)]
             ),
-            openai_api_key="sk-
+            openai_api_key="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",
         )
-        qa = ConversationalRetrievalChain.from_llm(model,retriever=retriever)
+        qa = ConversationalRetrievalChain.from_llm(model,retriever=retriever,max_tokens_limit=8000)

     except Exception as e:
         return [[str(e), None]]

     history = [[q_display, ""]]
     history[-1][1] = ""
-
+    chat_history = []
+    for char in qa({"question": question, "chat_history": chat_history})["answer"]:
         history[-1][1] += char
         time.sleep(0.01)
         yield history

-    # return [[qa({"question": question, "chat_history": chat_history})["answer"], None]]
-

 def user(message, history):
     return "", history + [[message, None]]


 def bot(history, **kwargs):
-
+
     user_message = history[-1][0]
-
-
+
+    global repoName
+    print("Repo name in the bot: " + str(repoName))
+    github = repoName[:-4]
     try:
-        embeddings = OpenAIEmbeddings(openai_api_key="sk-
+        embeddings = OpenAIEmbeddings(openai_api_key="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")
         pathName = github.split('/')[-1]
         dataset_path = "hub://aiswaryas/" + pathName

@@ -422,7 +479,7 @@ def bot(history, **kwargs):
             callback_manager=CallbackManager(
                 [StreamingGradioCallbackHandler(q)]
             ),
-            openai_api_key="sk-
+            openai_api_key="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",
         )
         qa = ConversationalRetrievalChain.from_llm(model,retriever=retriever)
         chat_history = []

@@ -433,185 +490,203 @@ def bot(history, **kwargs):

     history[-1][1] = ""
     for char in qa({"question": user_message, "chat_history": chat_history})["answer"]:
-
-
+        history[-1][1] += char
+        yield history


 with gr.Blocks() as demo:

+    # repoName = gr.State(value="https://github.com/sourcegraph/cody.git")

-    gr.Markdown("""
-    # Entelligence AI
-
-    """)
+    gr.Markdown("""
+    <h1 align="center"> Entelligence AI </h1>
+    <p style="text-align: center; font-size:36">Enabling your product team to ship product 10x faster.</p>
+    """)

     repoTextBox = gr.Textbox(label="Github Repository")
-    repo_name = "https://github.com/aiswaryasankar/memeAI.git"
-    # def update_state(value):
-    #     repo_name.value = value
-    #     return value

-
-
+    gr.Markdown("""Choose from any of the following repositories""")
+    ingestedRepos = gr.CheckboxGroup(choices=['Langchain', 'Weaviate', 'OpenAssistant', 'GenerativeAgents','Llama2', "MemeAI"], label="Github Repository", value="MemeAI")
+
     success_response = gr.Textbox(label="")
     ingest_btn = gr.Button("Index repo")
-    ingest_btn.click(fn=index_repo, inputs=repoTextBox, outputs=success_response, api_name="index_repo")

-
-
-    ticket_titles = [ticket["title"] for ticket in tickets]
-
-
-            bot, chatbot, chatbot
-        )
-        index += 1
-        clear.click(lambda: None, None, chatbot, queue=False)
-
-
-    """)
-
-    # docs = generateDocumentationPerFolder("overview", repo_name)
-    # markdown = gr.Markdown(value=docs)
-
-
-    print(dirNames)
-    buttons = [gr.Button(folder_name, onclick=button_click_callback) for folder_name in dirNames]
-
+    with gr.Column(visible=False) as launch_product:
+
+        # Toggle visibility of the chat, bugs, docs, model windows
+        with gr.Tab("Code Chat"):
+            chatbot = gr.Chatbot()
+            msg = gr.Textbox()
+            clear = gr.Button("Clear")
+
+            msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
+                bot, chatbot, chatbot
+            )
+            clear.click(lambda: None, None, chatbot, queue=False)
+
+        index = 0
+        with gr.Tab("Bug Triage"):
+
+            # Display the titles in the dropdown
+            def create_ticket_dropdown(tickets):
+                return gr.Dropdown.update(
+                    choices=ticket_titles, value=ticket_titles[0]
+                ), gr.update(visible=True)
+
+            # global ticket_choices, ticket_titles, tickets
+            print("REPO name in bug triage: " + str(repoName))
+            repo = "/".join(repoName[:-4].split("/")[-2:])
+            tickets = fetchGithubIssues(repo, 10)
+            print("tickets: " + str(tickets))
+
+            # Create the dropdown
+            ticket_choices = {ticket["title"]: ticket for ticket in tickets}
+            ticket_titles = [ticket["title"] for ticket in tickets]
+
+            # Here you want to first call the getGithubIssues function
+            # repo = gr.Interface.get_session_state("repo")
+            # print("REPO name in bug triage: " + str(repoName))
+            # repo = "/".join(repoName[:-4].split("/")[-2:])
+            # tickets = fetchGithubIssues(repo, 10)
+            # print("tickets: " + str(tickets))
+
+            # # Create the dropdown
+            # global ticket_choices
+            print("tickets in bug triage: " + str(tickets))
+            ticket_choices = {ticket["title"]: ticket for ticket in tickets}
+            ticket_titles = [ticket["title"] for ticket in tickets]
+
+            ticketDropdown = gr.Dropdown(choices=ticket_titles, title="Github Issues")
+
+            # Extract the ticket title, body for the selected ticket
+            chatbot = gr.Chatbot()
+            msg = gr.Textbox()
+            clear = gr.Button("Clear")
+
+            if index == 0:
+                msg.submit(solveGithubIssue, [ticketDropdown, chatbot], [msg, chatbot], queue=False).then(
+                    bot, chatbot, chatbot
+                )
+                ticketDropdown.change(solveGithubIssue, inputs=[ticketDropdown, chatbot], outputs=[chatbot])
+            else:
+                msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
+                    bot, chatbot, chatbot
+                )
+            clear.click(lambda: None, None, chatbot, queue=False)
+
+
+        with gr.Tab("AI Code Documentation"):
+
+            # global repoName
+            # First parse through the folder structure and store that as a list of clickable buttons
+            gr.Markdown("""
+            ## AI Generated Code Documentation
+
+            Code documentation comes in 3 flavors - internal engineering, external API documentation and product documentation. Each offers different layers of abstraction over the code base.
+            """)
+
+            # docs = generateDocumentationPerFolder("overview", repo_name)
+            markdown = gr.Markdown()
+
+            def button_click_callback(markdown):
+                print("IN BUTTON CLICK CALLBACK")
+                docs = generateDocumentationPerFolder("overview", repoName[:-4])
+                markdown.update(docs)
+
+            # Generate the left column buttons and their names and wrap each one in a function
+            with gr.Row():
+                with gr.Column(scale=.5, min_width=300):
+                    dirNames = generateFolderNamesForRepo(repoName[:-4])
+                    print(dirNames)
+                    buttons = [gr.Button(folder_name, onclick=button_click_callback) for folder_name in dirNames]
+                    for btn, folder_name in zip(buttons, dirNames):
+                        btn.click(button_click_callback, [markdown], [markdown] )
+
+                # Generate the overall documentation for the main bubble at the same time
+                print("REPO NAME IN DOCS: " + str(repoName[:-4]))
+                with gr.Column(scale=2, min_width=300):
+                    docs = generateDocumentationPerFolder("overview", repoName[:-4])
+                    markdown.update(docs)
+
+            # For each folder, generate a diagram and 2-3 prompts that dive deeper into explaining content
+
+            # Render all the content in the UI
+
+            #

-
-
-
-
-    Finetuning code generation models directly on your enterprise code base has shown up to 10% increase in model suggestion acceptance rate.
-    """)
-
-    # Choose existing code base or input a new code base for finetuning -
-    with gr.Row():
-        gr.Markdown("""
-        If you'd like to use the current code base, click this toggle otherwise input the entire code base below.
-        """)
-        existing_repo = gr.Checkbox(value=True, label="Use existing repository")
-        gr.Textbox(label="Input repository", visible=False)
-
-    # Allow option to remove generated files etc
-    gr.Markdown("""
-    Finetuned model performance is highly dependent on training data quality. We have currently found that excluding the following file types improves performance. If you'd like to include them, please toggle them.
-    """)
-    file_types = gr.CheckboxGroup(choices=['.bin', '.gen', '.git', '.gz','.jpg', '.lz', '.midi', '.mpq','.png', '.tz'], label="Removed file types")
-
-    # Based on data above, we should show a field for estimated fine tuning cost
-    # Then we should show the chart for loss
-    def wandb_report(url):
-        iframe = f'<iframe src={url} style="border:none;height:1024px;width:100%">'
-        return gr.HTML(iframe)
-
-    submit_btn = gr.Button("Start Training")
-    with gr.Column(visible=False) as start_training:
-        # Include the epoch loss table
-        epoch_loss = gr.Dataframe(
-            headers=["Step", "Training Loss", "Validation Loss"],
-            datatype=["number", "number", "number"],
-            row_count=5,
-            col_count=(3, "fixed"),
-            value=[[500, 1.868200, 1.548535], [1000, 1.450100, 1.518277], [1500, 1.659000, 1.486497],
-                   [2000, 1.364900, 1.452842], [2500, 1.406300, 1.405151], [3000, 1.276000, 1.346159]]
-        )
-
-    # After you start training you should see the Wandb report
-    report_url = 'https://wandb.ai/aiswaryasankar/aiswarya-santacoder-finetuning/reports/Aiswarya-Santacoder-Finetuning--Vmlldzo0ODM3MDA4'
-    report = wandb_report(report_url)
-
-    # Include a playground to compare different models on given tasks
-    # Link to the generated huggingface spaces model if you opt into it
-    # Toggle to select model for the remaining functionality
-
-    def startTraining(): # existing_repo, file_types
-        start_training= gr.update(visible=True)
-        # return {
-        #     report: report,
-        #     epoch_loss: epoch_loss,
-        #     start_training: gr.update(visible=True),
-        # }
-
-    submit_btn.click(
-        startTraining,
-        # inputs=[existing_repo, file_types],
-        # outputs=[start_training], # report, epoch_loss,
-    )
-
-demo.launch(debug=True)
+        with gr.Tab("Custom Model Finetuning"):
+            # First provide a summary of offering
+            gr.Markdown("""
+            # Enterprise Custom Model Finetuning
+
+            Finetuning code generation models directly on your enterprise code base has shown up to 10% increase in model suggestion acceptance rate.
+            """)
+
+            # Choose base model - radio with model size
+            gr.Radio(choices=["Santacoder (1.1B parameter model)", "Incoder (6B parameter model)", "Codegen (16B parameter model)", "Starcoder (15.5B parameter model)"] , value="Starcoder (15.5B parameter model)")
+
+            # Choose existing code base or input a new code base for finetuning -
+            with gr.Row():
+                gr.Markdown("""
+                If you'd like to use the current code base, click this toggle otherwise input the entire code base below.
+                """)
+                existing_repo = gr.Checkbox(value=True, label="Use existing repository")
+                gr.Textbox(label="Input repository", visible=False)
+
+            # Allow option to remove generated files etc etc
+            gr.Markdown("""
+            Finetuned model performance is highly dependent on training data quality. We have currently found that excluding the following file types improves performance. If you'd like to include them, please toggle them.
+            """)
+            file_types = gr.CheckboxGroup(choices=['.bin', '.gen', '.git', '.gz','.jpg', '.lz', '.midi', '.mpq','.png', '.tz'], label="Removed file types")
+
+            # Based on data above, we should show a field for estimated fine tuning cost
+            # Then we should show the chart for loss
+            def wandb_report(url):
+                iframe = f'<iframe src={url} style="border:none;height:1024px;width:100%">'
+                return gr.HTML(iframe)
+
+            submit_btn = gr.Button("Start Training")
+            with gr.Column(visible=False) as start_training:
+                # Include the epoch loss table
+                epoch_loss = gr.Dataframe(
+                    headers=["Step", "Training Loss", "Validation Loss"],
+                    datatype=["number", "number", "number"],
+                    row_count=5,
+                    col_count=(3, "fixed"),
+                    value=[[500, 1.868200, 1.548535], [1000, 1.450100, 1.518277], [1500, 1.659000, 1.486497],
+                           [2000, 1.364900, 1.452842], [2500, 1.406300, 1.405151], [3000, 1.276000, 1.346159]]
+                )
+
+            # After you start training you should see the Wandb report
+            report_url = 'https://wandb.ai/aiswaryasankar/aiswarya-santacoder-finetuning/reports/Aiswarya-Santacoder-Finetuning--Vmlldzo0ODM3MDA4'
+            report = wandb_report(report_url)
+
+            # Include a playground to compare different models on given tasks
+            # Link to the generated huggingface spaces model if you opt into it
+            # Toggle to select model for the remaining functionality
+
+            def startTraining(existing_repo, file_types):
+                return {
+                    start_training: gr.update(visible=True),
+                }
+
+            submit_btn.click(
+                startTraining,
+                inputs=[existing_repo, file_types],
+                outputs=[start_training], # report, epoch_loss,
+            )
+
+    ingest_btn.click(fn=index_repo, inputs=[repoTextBox, ingestedRepos], outputs=[success_response, launch_product], api_name="index_repo")
+
+
+demo.queue()
+demo.launch(debug=True, share=True)
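
A second pattern this commit introduces, shown here in isolation: during indexing, documents are pushed to the DeepLake dataset in fixed-size batches with a short sleep between batches so the embedding calls do not hit the OpenAI rate limit all at once. A standalone sketch under the assumption that db is any LangChain vector store exposing add_documents (ingest_in_chunks is a hypothetical helper name; the batch size of 500 and 0.1s pause mirror the diff rather than being tuned values):

import time

def ingest_in_chunks(db, texts, batch_size=500, pause_s=0.1):
    # Upload fixed-size slices and pause between them to avoid
    # hitting the embedding API's rate limit immediately.
    for i in range(0, len(texts), batch_size):
        db.add_documents(texts[i:i + batch_size])
        time.sleep(pause_s)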
|