Aiswarya Sankar committed on
Commit 2613437 · 1 Parent(s): d2e86b9

Update the way the model is chosen

Files changed (1): app.py (+347 −272)

app.py CHANGED
@@ -15,8 +15,10 @@ import os
  from langchain.embeddings.openai import OpenAIEmbeddings
  from langchain.vectorstores import DeepLake
  import random

- os.environ['OPENAI_API_KEY']='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'
  os.environ['ACTIVELOOP_TOKEN']='eyJhbGciOiJIUzUxMiIsImlhdCI6MTY4MTU5NTgyOCwiZXhwIjoxNzEzMjE4MTU5fQ.eyJpZCI6ImFpc3dhcnlhcyJ9.eoiMFZsS20zzMXXupFbowUlLdgIgf_MA1ck_DByzREeoQvNm8GPhKEfqea2y1Qak-ud2jo9dhSTBTfRe1ztezw'
@@ -25,15 +27,13 @@ from langchain.document_loaders import TextLoader
  from langchain.text_splitter import CharacterTextSplitter

  import subprocess
- repo_name = "https://github.com/aiswaryasankar/memeAI.git"

  from langchain.callbacks.base import BaseCallbackHandler
  from langchain.schema import LLMResult
  from typing import Any, Union

- global ticket_choices
-
- job_done = object() # signals the processing is done

  class StreamingGradioCallbackHandler(BaseCallbackHandler):
  def __init__(self, q: SimpleQueue):
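The diff truncates `StreamingGradioCallbackHandler` after its constructor. For context on the queue-based streaming this commit relies on throughout, here is a minimal sketch of how a handler of this shape is typically completed — the method bodies and the `queue` import source are assumptions, not the committed code:

```python
from queue import SimpleQueue
from typing import Any
from langchain.callbacks.base import BaseCallbackHandler
from langchain.schema import LLMResult

job_done = object()  # sentinel: tells the consumer the LLM run has finished

class StreamingGradioCallbackHandler(BaseCallbackHandler):
    """Pushes each generated token onto a queue that the Gradio UI drains."""

    def __init__(self, q: SimpleQueue):
        self.q = q

    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
        # Called once per token when ChatOpenAI(streaming=True) is used.
        self.q.put(token)

    def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
        # Unblocks the consumer loop on the UI side.
        self.q.put(job_done)
```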
@@ -80,8 +80,15 @@ class GithubResponse(BaseModel):
  repo: str


- embeddings = OpenAIEmbeddings(disallowed_special=())


  def git_clone(repo_url):
  subprocess.run(["git", "clone", repo_url])
@@ -91,46 +98,89 @@ def git_clone(repo_url):
  return dirpath


- def index_repo(repo: str) -> Response:
  pathName = git_clone(repo)
  root_dir = './' + pathName

- docs = []
- for dirpath, dirnames, filenames in os.walk(root_dir):
- for file in filenames:
- try:
- loader = TextLoader(os.path.join(dirpath, file), encoding='utf-8')
- docs.extend(loader.load_and_split())
- except Exception as e:
- print("Exception: " + str(e) + "| File: " + os.path.join(dirpath, file))
- pass
-
  activeloop_username = "aiswaryas"
  dataset_path = f"hub://{activeloop_username}/" + pathName
- text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
- texts = text_splitter.split_documents(docs)
-
- print(texts)
- for text in texts:
- print(text)

  try:
  db = DeepLake(dataset_path=dataset_path,
- embedding_function=embeddings,
- token=os.environ['ACTIVELOOP_TOKEN'], read_only=False)
  # NOTE: read_only=False because we want to ingest documents
  # NOTE: This will raise a `deeplake.util.exceptions.LockedException` if dataset is already locked
  # NOTE: change it to read_only=True when querying the dataset

- # Delete dataset if not empty:
- if len(db.ds) > 0:
- print("Dataset not empty. Deleting existing dataset...")
- db.ds.delete()
- print("Done.")
- # Reinitialize
- db = DeepLake(dataset_path=dataset_path,
- embedding_function=embeddings,
- token=os.environ['ACTIVELOOP_TOKEN'], read_only=False)

  except Exception as e:
  return Response(
@@ -140,29 +190,35 @@ def index_repo(repo: str) -> Response:
  stdout="",
  )

- try:
- db.add_documents(texts)

- except Exception as e:
- return Response(
- result= "Failed to index github repo",
- repo="",
- error=str(e),
- stdout="",
- )

- finally:
- db.ds._unlock()

- return "SUCCESS"


  def answer_questions(question: str, github: str, **kwargs) -> Response:

- global repo_name
- github = repo_name[:-4]
  try:
- embeddings = OpenAIEmbeddings(openai_api_key="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")
  pathName = github.split('/')[-1]
  dataset_path = "hub://aiswaryas/" + pathName
@@ -185,7 +241,7 @@ def answer_questions(question: str, github: str, **kwargs) -> Response:
  callback_manager=CallbackManager(
  [StreamingGradioCallbackHandler(q)]
  ),
- openai_api_key="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",
  )
  qa = ConversationalRetrievalChain.from_llm(model,retriever=retriever)
  chat_history = []
@@ -207,6 +263,7 @@ def answer_questions(question: str, github: str, **kwargs) -> Response:
  stdout="",
  )

  def fetchGithubIssues(repo: str, num_issues:int, **kwargs) -> Response:
  """
  This endpoint should get a list of all the github issues that are open for this repository
@@ -230,12 +287,12 @@ def fetchGithubIssues(repo: str, num_issues:int, **kwargs) -> Response:

  batch.extend(issues.json())
  for issue in issues.json():
- issues_data.append({
- "issue_url": issue["url"],
- "title": issue["title"],
- "body": issue["body"],
- "comments_url": issue["comments_url"],
- })

  print(issues_data)
  return issues_data
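The diff only shows the tail of `fetchGithubIssues`; the loop above keeps four fields per issue. A self-contained sketch of the same idea against the GitHub REST API — the endpoint usage is an assumption from context, and unauthenticated calls are rate-limited:

```python
import requests
from typing import Dict, List

def fetch_open_issues(repo: str, num_issues: int = 10) -> List[Dict]:
    """Sketch: `repo` is "owner/name", e.g. "aiswaryasankar/memeAI"."""
    url = f"https://api.github.com/repos/{repo}/issues"
    resp = requests.get(url, params={"state": "open", "per_page": num_issues})
    resp.raise_for_status()
    return [
        {
            "issue_url": issue["url"],
            "title": issue["title"],
            "body": issue["body"],
            "comments_url": issue["comments_url"],
        }
        for issue in resp.json()
        if "pull_request" not in issue  # the issues endpoint also returns PRs
    ]
```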
@@ -265,97 +322,96 @@ def generateFolderNamesForRepo(repo):

  return dirs[0]

  def generateDocumentationPerFolder(dir, github):

- if dir == "overview":
- prompt= """
- Summarize the structure of the memeAI repository. Make a list of all endpoints and their behavior. Explain
- how this module is used in the scope of the larger project. Format the response as code documentation with an
- Overview, Architecture and Implementation Details. Within implementation details, list out each function and provide
- an overview of that function.
- """.format(dir)
- else:
- prompt= """
- Summarize how {} is implemented in the memeAI repository. Make a list of all functions and their behavior. Explain
- how this module is used in the scope of the larger project. Format the response as code documentation with an
- Overview, Architecture and Implementation Details. Within implementation details, list out each function and provide
- an overview of that function.
- """.format(dir)
-
- print(prompt)
- try:
- embeddings = OpenAIEmbeddings(openai_api_key="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")
- pathName = github.split('/')[-1]
- dataset_path = "hub://aiswaryas/" + pathName
-
- db = DeepLake(dataset_path=dataset_path, read_only=True, embedding_function=embeddings)
-
- # print("finished indexing repo")
- retriever = db.as_retriever()
- retriever.search_kwargs['distance_metric'] = 'cos'
- retriever.search_kwargs['fetch_k'] = 100
- retriever.search_kwargs['maximal_marginal_relevance'] = True
- retriever.search_kwargs['k'] = 20
-
- # streaming_handler = kwargs.get('streaming_handler')
- model = ChatOpenAI(
- model_name='gpt-4',
- temperature=0.0,
- verbose=True,
- streaming=True, # Pass `streaming=True` to make sure the client receives the data.
- openai_api_key="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",
- )
- qa = ConversationalRetrievalChain.from_llm(model,retriever=retriever)
- chat_history = []

- except Exception as e:
- return str(e)

- # history[-1][1] = ""
- # for char in qa({"question": prompt, "chat_history": chat_history}):
- # history[-1][1] += char
- # time.sleep(0.01)
- # yield history

- return qa({"question": prompt, "chat_history": chat_history})["answer"]
- return response["answer"]


- def generateArchitectureDiagram(folder) -> Response:
- """
- This endpoint should generate a Mermaid diagram for the given input files. It will return the
- """


  def solveGithubIssue(ticket, history) -> Response:
  """
  This endpoint takes in a github issue and then queries the db for the question against the codebase.
  """
  print(history)
- global repo_name, ticket_choices
- github = repo_name[:-4]
  repoFolder = github.split("/")[-1]
  body = ticket_choices[ticket]["body"]
  title = ticket_choices[ticket]["title"]
  question = """
- Given the code in the {} repo, propose a solution for this ticket {} that includes a
- high level implementation, narrowing down the root cause of the issue and psuedocode if
- applicable on how to resolve the issue. If multiple changes are required to address the
- problem, list out each of the steps and a brief explanation for each one.
- """.format(repoFolder, body)

  q_display = """
- How would I approach solving this ticket: {}. Here is a summary of the issue: {}
- """.format(title, body)

  print(question)

  try:
- embeddings = OpenAIEmbeddings(openai_api_key="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")
  pathName = github.split('/')[-1]
  dataset_path = "hub://aiswaryas/" + pathName

- db = DeepLake(dataset_path=dataset_path, read_only=True, embedding_function=embeddings)

  # print("finished indexing repo")
  retriever = db.as_retriever()
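The same retriever-and-chain setup (cosine distance, `fetch_k=100`, MMR re-ranking, `k=20`, streaming `gpt-4`) recurs in `answer_questions`, `generateDocumentationPerFolder`, `solveGithubIssue`, and `bot`. A sketch of a shared helper that factors it out — a hypothetical refactor, not part of the commit, which also reads the key from `OPENAI_API_KEY` in the environment instead of a hard-coded literal:

```python
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake

def build_qa_chain(github_url: str) -> ConversationalRetrievalChain:
    """Hypothetical shared helper: one place for the repeated setup."""
    path_name = github_url.split('/')[-1]
    db = DeepLake(dataset_path="hub://aiswaryas/" + path_name,
                  read_only=True,
                  embedding_function=OpenAIEmbeddings())

    retriever = db.as_retriever()
    # Cosine distance; over-fetch 100 candidates, re-rank with MMR, keep 20.
    retriever.search_kwargs['distance_metric'] = 'cos'
    retriever.search_kwargs['fetch_k'] = 100
    retriever.search_kwargs['maximal_marginal_relevance'] = True
    retriever.search_kwargs['k'] = 20

    # The key is taken from the OPENAI_API_KEY environment variable.
    model = ChatOpenAI(model_name='gpt-4', temperature=0.0, streaming=True)
    return ConversationalRetrievalChain.from_llm(model, retriever=retriever)
```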
@@ -373,34 +429,35 @@ def solveGithubIssue(ticket, history) -> Response:
  callback_manager=CallbackManager(
  [StreamingGradioCallbackHandler(q)]
  ),
- openai_api_key="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",
  )
- qa = ConversationalRetrievalChain.from_llm(model,retriever=retriever)

  except Exception as e:
  return [[str(e), None]]

  history = [[q_display, ""]]
  history[-1][1] = ""
- for char in qa({"question": prompt, "chat_history": chat_history}):
  history[-1][1] += char
  time.sleep(0.01)
  yield history

- # return [[qa({"question": question, "chat_history": chat_history})["answer"], None]]
-

  def user(message, history):
  return "", history + [[message, None]]


  def bot(history, **kwargs):
- print(history)
  user_message = history[-1][0]
- global repo_name
- github = repo_name[:-4]
  try:
- embeddings = OpenAIEmbeddings(openai_api_key="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")
  pathName = github.split('/')[-1]
  dataset_path = "hub://aiswaryas/" + pathName
@@ -422,7 +479,7 @@ def bot(history, **kwargs):
  callback_manager=CallbackManager(
  [StreamingGradioCallbackHandler(q)]
  ),
- openai_api_key="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",
  )
  qa = ConversationalRetrievalChain.from_llm(model,retriever=retriever)
  chat_history = []
@@ -433,185 +490,203 @@ def bot(history, **kwargs):

  history[-1][1] = ""
  for char in qa({"question": user_message, "chat_history": chat_history})["answer"]:
- history[-1][1] += char
- yield history


  with gr.Blocks() as demo:

- gr.Markdown("""
- # Entelligence AI

- Enabling your product team to ship product 10x faster.
- """)

  repoTextBox = gr.Textbox(label="Github Repository")
- repo_name = "https://github.com/aiswaryasankar/memeAI.git"
- # def update_state(value):
- # repo_name.value = value
- # return value

- # repoTextBox.change(update_state, repoTextBox)
- # print(repo_name.value)
  success_response = gr.Textbox(label="")
  ingest_btn = gr.Button("Index repo")
- ingest_btn.click(fn=index_repo, inputs=repoTextBox, outputs=success_response, api_name="index_repo")


- # Toggle visibility of the chat, bugs, docs, model windows
- with gr.Tab("Code Chat"):
- chatbot = gr.Chatbot()
- msg = gr.Textbox()
- clear = gr.Button("Clear")

- msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
- bot, chatbot, chatbot
- )
- clear.click(lambda: None, None, chatbot, queue=False)


- index = 0
- with gr.Tab("Bug Triage"):

- # Display the titles in the dropdown
- def create_ticket_dropdown(tickets):

- return gr.Dropdown.update(
- choices=titles, value=titles[0]
- ), gr.update(visible=True)

- # Here you want to first call the getGithubIssues function
- # repo = gr.Interface.get_session_state("repo")
- print(repo_name)
- repo = "/".join(repo_name[:-4].split("/")[-2:])
- tickets = fetchGithubIssues(repo, 10)

- # Create the dropdown
- global ticket_choices
- ticket_choices = {ticket["title"]: ticket for ticket in tickets}
- ticket_titles = [ticket["title"] for ticket in tickets]

- ticketDropdown = gr.Dropdown(choices=ticket_titles, title="Github Issues")

- # Extract the ticket title, body for the selected ticket
- chatbot = gr.Chatbot()
- msg = gr.Textbox()
- clear = gr.Button("Clear")

- if index == 0:
- msg.submit(solveGithubIssue, [ticketDropdown, chatbot], [msg, chatbot], queue=False).then(
- bot, chatbot, chatbot
- )
- ticketDropdown.change(solveGithubIssue, inputs=[ticketDropdown, chatbot], outputs=[chatbot])
- index += 1
- else:
- msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
- bot, chatbot, chatbot
- )
- index += 1
- clear.click(lambda: None, None, chatbot, queue=False)


- with gr.Tab("AI Code Documentation"):
- # First parse through the folder structure and store that as a list of clickable buttons
- gr.Markdown("""
- ## AI Generated Code Documentation

- Code documentation comes in 3 flavors - internal engineering, external API documentation and product documentation. Each offers different layers of abstraction over the code base.
- """)

- # docs = generateDocumentationPerFolder("overview", repo_name)
- # markdown = gr.Markdown(value=docs)

- def button_click_callback(label):
- docs = generateDocumentationPerFolder(label, repo_name[:-4])
- markdown.update(docs)

- # Generate the left column buttons and their names and wrap each one in a function
- with gr.Row():
- with gr.Column(scale=.5, min_width=300):
- dirNames = generateFolderNamesForRepo(repo_name[:-4])
- print(dirNames)
- buttons = [gr.Button(folder_name, onclick=button_click_callback) for folder_name in dirNames]

- # Generate the overall documentation for the main bubble at the same time
- with gr.Column(scale=2, min_width=300):
- docs = generateDocumentationPerFolder("overview", repo_name[:-4])
- markdown = gr.Markdown(value=docs)
- # markdown.update(docs)

- # For each folder, generate a diagram and 2-3 prompts that dive deeper into explaining content

- # Render all the content in the UI

- #

- with gr.Tab("Custom Model Finetuning"):
- # First provide a summary of offering
- gr.Markdown("""
- ## Enterprise Custom Model Finetuning

- Finetuning code generation models directly on your enterprise code base has shown up to 10% increase in model suggestion acceptance rate.
- """)

- # Choose base model - radio with model size
- gr.Radio(choices=["Santacoder (1.1B parameter model)", "Incoder (6B parameter model)", "Codegen (16B parameter model)", "Starcoder (15.5B parameter model)"] , value="Starcoder (15.5B parameter model)")

- # Choose existing code base or input a new code base for finetuning -
- with gr.Row():
- gr.Markdown("""
- If you'd like to use the current code base, click this toggle otherwise input the entire code base below.
- """)
- existing_repo = gr.Checkbox(value=True, label="Use existing repository")
- gr.Textbox(label="Input repository", visible=False)
-
- # Allow option to remove generated files etc
- gr.Markdown("""
- Finetuned model performance is highly dependent on training data quality. We have currently found that excluding the following file types improves performance. If you'd like to include them, please toggle them.
- """)
- file_types = gr.CheckboxGroup(choices=['.bin', '.gen', '.git', '.gz','.jpg', '.lz', '.midi', '.mpq','.png', '.tz'], label="Removed file types")
-
- # Based on data above, we should show a field for estimated fine tuning cost
- # Then we should show the chart for loss
- def wandb_report(url):
- iframe = f'<iframe src={url} style="border:none;height:1024px;width:100%">'
- return gr.HTML(iframe)
-
- submit_btn = gr.Button("Start Training")
- with gr.Column(visible=False) as start_training:
- # Include the epoch loss table
- epoch_loss = gr.Dataframe(
- headers=["Step", "Training Loss", "Validation Loss"],
- datatype=["number", "number", "number"],
- row_count=5,
- col_count=(3, "fixed"),
- value=[[500, 1.868200, 1.548535], [1000, 1.450100, 1.518277], [1500, 1.659000, 1.486497],
- [2000, 1.364900, 1.452842], [2500, 1.406300, 1.405151], [3000, 1.276000, 1.346159]]
- )

- # After you start training you should see the Wandb report
- report_url = 'https://wandb.ai/aiswaryasankar/aiswarya-santacoder-finetuning/reports/Aiswarya-Santacoder-Finetuning--Vmlldzo0ODM3MDA4'
- report = wandb_report(report_url)
-
- # Include a playground to compare different models on given tasks
- # Link to the generated huggingface spaces model if you opt into it
- # Toggle to select model for the remaining functionality
-
- def startTraining(): # existing_repo, file_types
- start_training= gr.update(visible=True)
- # return {
- # report: report,
- # epoch_loss: epoch_loss,
- # start_training: gr.update(visible=True),
- # }
-
- submit_btn.click(
- startTraining,
- # inputs=[existing_repo, file_types],
- # outputs=[start_training], # report, epoch_loss,
- )
-
- demo.launch(debug=True)

@@ -15,8 +15,10 @@ import os
  from langchain.embeddings.openai import OpenAIEmbeddings
  from langchain.vectorstores import DeepLake
  import random
+ import time
+ import together

+ os.environ['OPENAI_API_KEY']='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'
  os.environ['ACTIVELOOP_TOKEN']='eyJhbGciOiJIUzUxMiIsImlhdCI6MTY4MTU5NTgyOCwiZXhwIjoxNzEzMjE4MTU5fQ.eyJpZCI6ImFpc3dhcnlhcyJ9.eoiMFZsS20zzMXXupFbowUlLdgIgf_MA1ck_DByzREeoQvNm8GPhKEfqea2y1Qak-ud2jo9dhSTBTfRe1ztezw'
@@ -25,15 +27,13 @@ from langchain.document_loaders import TextLoader
  from langchain.text_splitter import CharacterTextSplitter

  import subprocess
+ # repo_name = "https://github.com/aiswaryasankar/memeAI.git"

  from langchain.callbacks.base import BaseCallbackHandler
  from langchain.schema import LLMResult
  from typing import Any, Union

+ job_done = object()

  class StreamingGradioCallbackHandler(BaseCallbackHandler):
  def __init__(self, q: SimpleQueue):
@@ -80,8 +80,15 @@ class GithubResponse(BaseModel):
  repo: str


+ # global repoName
+ global ticket_titles
+ global tickets
+ global ticket_choices
+ tickets = []
+
+ repoName = "https://github.com/aiswaryasankar/memeAI.git"

+ embeddings = OpenAIEmbeddings(disallowed_special=())

  def git_clone(repo_url):
  subprocess.run(["git", "clone", repo_url])
@@ -91,46 +98,89 @@ def git_clone(repo_url):
  return dirpath


+ def index_repo(textbox: str, dropdown: str) -> Response:
+
+ mapping = {
+ "Langchain" : "https://github.com/langchain-ai/langchain.git",
+ "Weaviate": "https://github.com/weaviate/weaviate.git",
+ "Llama2": "https://github.com/facebookresearch/llama.git",
+ "OpenAssistant": "https://github.com/LAION-AI/Open-Assistant.git",
+ "MemeAI": "https://github.com/aiswaryasankar/memeAI.git",
+ "GenerativeAgents": "https://github.com/joonspk-research/generative_agents.git"
+ }
+
+ # print(textbox)
+ # print(dropdown[0])
+
+ if textbox != "":
+ repo = textbox
+ else:
+ repo = mapping[dropdown[0]]
+ # repoName = gr.State(repo)
+
+ print("Repo name after setting the value: " + str(repoName))
  pathName = git_clone(repo)
  root_dir = './' + pathName

+ print("Repo name after setting the value: " + str(repoName))
  activeloop_username = "aiswaryas"
  dataset_path = f"hub://{activeloop_username}/" + pathName

  try:
  db = DeepLake(dataset_path=dataset_path,
+ embedding_function=embeddings,
+ token=os.environ['ACTIVELOOP_TOKEN'],
+ read_only=True,
+ num_workers=10)
  # NOTE: read_only=False because we want to ingest documents
  # NOTE: This will raise a `deeplake.util.exceptions.LockedException` if dataset is already locked
  # NOTE: change it to read_only=True when querying the dataset

+ # If it is empty, then hydrate otherwise leave it alone
+ print(db)
+ if db is None:
+ print("Dataset doesn't exist, fetching data")
+ try:
+ docs = []
+ for dirpath, dirnames, filenames in os.walk(root_dir):
+ for file in filenames:
+ try:
+ loader = TextLoader(os.path.join(dirpath, file), encoding='utf-8')
+ docs.extend(loader.load_and_split())
+ except Exception as e:
+ print("Exception: " + str(e) + "| File: " + os.path.join(dirpath, file))
+ pass
+
+ activeloop_username = "aiswaryas"
+ dataset_path = f"hub://{activeloop_username}/" + pathName
+ text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+ texts = text_splitter.split_documents(docs)
+
+ db = DeepLake(dataset_path=dataset_path,
+ embedding_function=embeddings,
+ token=os.environ['ACTIVELOOP_TOKEN'],
+ read_only=False)
+ # Do this in chunks to avoid hitting the ratelimit immediately
+ for i in range(0, len(texts), 500):
+ db.add_documents(texts[i:i+500])
+ time.sleep(.1)
+
+ except Exception as e:
+ return Response(
+ result= "Failed to index github repo",
+ repo="",
+ error=str(e),
+ stdout="",
+ )
+ # print("Dataset not empty. Deleting existing dataset...")
+ # db.ds.delete()
+ # print("Done.")
+ # # Reinitialize
+ # db = DeepLake(dataset_path=dataset_path,
+ # embedding_function=embeddings,
+ # token=os.environ['ACTIVELOOP_TOKEN'], read_only=False)
+ else:
+ print("Dataset already exists")

  except Exception as e:
  return Response(
190
  stdout="",
191
  )
192
 
193
+ global ticket_choices, ticket_titles, tickets
194
+ print("REPO name in bug triage: " + str(repoName))
195
+ repo = "/".join(repoName[:-4].split("/")[-2:])
196
+ tickets = fetchGithubIssues(repo, 10)
197
+ print("tickets: " + str(tickets))
198
 
199
+ # Create the dropdown
200
+ ticket_choices = {ticket["title"]: ticket for ticket in tickets}
201
+ ticket_titles = [ticket["title"] for ticket in tickets]
 
 
 
 
202
 
203
+ print("Repo name before return: " + str(repoName))
 
204
 
205
+ return {
206
+ success_response: "SUCCESS",
207
+ # repoName: repoName,
208
+ # repoTextBox: repoName,
209
+ # ingestedRepos: ingestedRepos,
210
+ launch_product: gr.update(visible=True)
211
+ }
212
 
213
 
214
  def answer_questions(question: str, github: str, **kwargs) -> Response:
215
 
216
+ global repoName
217
+ print("Repo name")
218
+ github = repoName[:-4]
219
+ print(github)
220
  try:
221
+ embeddings = OpenAIEmbeddings(openai_api_key="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")
222
  pathName = github.split('/')[-1]
223
  dataset_path = "hub://aiswaryas/" + pathName
224
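`index_repo` now returns a dict keyed by components (`success_response`, `launch_product`) so one callback can update several outputs at once, including revealing the hidden product column. A minimal self-contained sketch of that Gradio pattern — the component names here are illustrative, not the committed ones:

```python
import gradio as gr

with gr.Blocks() as sketch:
    status = gr.Textbox(label="")
    with gr.Column(visible=False) as panel:      # hidden until indexing succeeds
        gr.Markdown("Product tabs go here.")
    index_btn = gr.Button("Index repo")

    def on_index():
        # A dict keyed by components updates several outputs from one callback;
        # every key must also be listed in `outputs=` below.
        return {status: "SUCCESS", panel: gr.update(visible=True)}

    index_btn.click(on_index, inputs=None, outputs=[status, panel])
```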
 
 
@@ -185,7 +241,7 @@ def answer_questions(question: str, github: str, **kwargs) -> Response:
  callback_manager=CallbackManager(
  [StreamingGradioCallbackHandler(q)]
  ),
+ openai_api_key="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",
  )
  qa = ConversationalRetrievalChain.from_llm(model,retriever=retriever)
  chat_history = []
@@ -207,6 +263,7 @@ def answer_questions(question: str, github: str, **kwargs) -> Response:
  stdout="",
  )

+
  def fetchGithubIssues(repo: str, num_issues:int, **kwargs) -> Response:
  """
  This endpoint should get a list of all the github issues that are open for this repository
@@ -230,12 +287,12 @@ def fetchGithubIssues(repo: str, num_issues:int, **kwargs) -> Response:

  batch.extend(issues.json())
  for issue in issues.json():
+ issues_data.append({
+ "issue_url": issue["url"],
+ "title": issue["title"],
+ "body": issue["body"],
+ "comments_url": issue["comments_url"],
+ })

  print(issues_data)
  return issues_data
@@ -265,97 +322,96 @@ def generateFolderNamesForRepo(repo):

  return dirs[0]

+
  def generateDocumentationPerFolder(dir, github):

+ if dir == "overview":
+ prompt= """
+ Summarize the structure of the {} repository. Make a list of all endpoints and their behavior. Explain
+ how this module is used in the scope of the larger project. Format the response as code documentation with an
+ Overview, Architecture and Implementation Details. Within implementation details, list out each function and provide
+ an overview of that function.
+ """.format(github)
+ else:
+ prompt= """
+ Summarize how {} is implemented in the {} repository. Make a list of all functions and their behavior. Explain
+ how this module is used in the scope of the larger project. Format the response as code documentation with an
+ Overview, Architecture and Implementation Details. Within implementation details, list out each function and provide
+ an overview of that function.
+ """.format(dir, github)
+
+ print(prompt)
+ try:
+ embeddings = OpenAIEmbeddings(openai_api_key="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")
+ pathName = github.split('/')[-1]
+ print("PATH NAME: " + str(pathName))
+ dataset_path = "hub://aiswaryas/" + pathName

+ db = DeepLake(dataset_path=dataset_path, read_only=True, embedding_function=embeddings)

+ # print("finished indexing repo")
+ retriever = db.as_retriever()
+ retriever.search_kwargs['distance_metric'] = 'cos'
+ retriever.search_kwargs['fetch_k'] = 100
+ retriever.search_kwargs['maximal_marginal_relevance'] = True
+ retriever.search_kwargs['k'] = 20

+ # streaming_handler = kwargs.get('streaming_handler')
+ model = ChatOpenAI(
+ model_name='gpt-4',
+ temperature=0.0,
+ verbose=True,
+ streaming=True, # Pass `streaming=True` to make sure the client receives the data.
+ openai_api_key="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",
+ )
+ qa = ConversationalRetrievalChain.from_llm(model,retriever=retriever)
+ chat_history = []
+ return qa({"question": prompt, "chat_history": chat_history})["answer"]
+
+ except Exception as e:
+ print (str(e))
+ return "Failed to generate documentation"
+
+ # history[-1][1] = ""
+ # for char in qa({"question": prompt, "chat_history": chat_history}):
+ # history[-1][1] += char
+ # time.sleep(0.01)
+ # yield history


  def solveGithubIssue(ticket, history) -> Response:
  """
  This endpoint takes in a github issue and then queries the db for the question against the codebase.
  """
+ global repoName
  print(history)
+ global ticket_choices
+ github = repoName[:-4]
+
  repoFolder = github.split("/")[-1]
  body = ticket_choices[ticket]["body"]
  title = ticket_choices[ticket]["title"]
  question = """
+ Given the code in the {} repo, propose a solution for this ticket {} that includes a
+ high level implementation, narrowing down the root cause of the issue and psuedocode if
+ applicable on how to resolve the issue. If multiple changes are required to address the
+ problem, list out each of the steps and a brief explanation for each one.
+ """.format(repoFolder, body)

  q_display = """
+ How would I approach solving this ticket: {}. Here is a summary of the issue: {}
+ """.format(title, body)

  print(question)

  try:
+ embeddings = OpenAIEmbeddings(openai_api_key="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")
  pathName = github.split('/')[-1]
  dataset_path = "hub://aiswaryas/" + pathName

+ db = DeepLake(dataset_path=dataset_path, read_only=True, embedding=embeddings)

  # print("finished indexing repo")
  retriever = db.as_retriever()
  callback_manager=CallbackManager(
430
  [StreamingGradioCallbackHandler(q)]
431
  ),
432
+ openai_api_key="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",
433
  )
434
+ qa = ConversationalRetrievalChain.from_llm(model,retriever=retriever,max_tokens_limit=8000)
435
 
436
  except Exception as e:
437
  return [[str(e), None]]
438
 
439
  history = [[q_display, ""]]
440
  history[-1][1] = ""
441
+ chat_history = []
442
+ for char in qa({"question": question, "chat_history": chat_history})["answer"]:
443
  history[-1][1] += char
444
  time.sleep(0.01)
445
  yield history
446
 
 
 
447
 
448
  def user(message, history):
449
  return "", history + [[message, None]]
450
 
451
 
452
  def bot(history, **kwargs):
453
+
454
  user_message = history[-1][0]
455
+
456
+ global repoName
457
+ print("Repo name in the bot: " + str(repoName))
458
+ github = repoName[:-4]
459
  try:
460
+ embeddings = OpenAIEmbeddings(openai_api_key="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")
461
  pathName = github.split('/')[-1]
462
  dataset_path = "hub://aiswaryas/" + pathName
463
 
 
@@ -422,7 +479,7 @@ def bot(history, **kwargs):
  callback_manager=CallbackManager(
  [StreamingGradioCallbackHandler(q)]
  ),
+ openai_api_key="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",
  )
  qa = ConversationalRetrievalChain.from_llm(model,retriever=retriever)
  chat_history = []
@@ -433,185 +490,203 @@ def bot(history, **kwargs):

  history[-1][1] = ""
  for char in qa({"question": user_message, "chat_history": chat_history})["answer"]:
+ history[-1][1] += char
+ yield history


  with gr.Blocks() as demo:

+ # repoName = gr.State(value="https://github.com/sourcegraph/cody.git")

+ gr.Markdown("""
+ <h1 align="center"> Entelligence AI </h1>
+ <p style="text-align: center; font-size:36">Enabling your product team to ship product 10x faster.</p>
+ """)

  repoTextBox = gr.Textbox(label="Github Repository")

+ gr.Markdown("""Choose from any of the following repositories""")
+ ingestedRepos = gr.CheckboxGroup(choices=['Langchain', 'Weaviate', 'OpenAssistant', 'GenerativeAgents','Llama2', "MemeAI"], label="Github Repository", value="MemeAI")
+
  success_response = gr.Textbox(label="")
  ingest_btn = gr.Button("Index repo")

+ with gr.Column(visible=False) as launch_product:

+ # Toggle visibility of the chat, bugs, docs, model windows
+ with gr.Tab("Code Chat"):
+ chatbot = gr.Chatbot()
+ msg = gr.Textbox()
+ clear = gr.Button("Clear")

+ msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
+ bot, chatbot, chatbot
+ )
+ clear.click(lambda: None, None, chatbot, queue=False)

+ index = 0
+ with gr.Tab("Bug Triage"):

+ # Display the titles in the dropdown
+ def create_ticket_dropdown(tickets):

+ return gr.Dropdown.update(
+ choices=ticket_titles, value=ticket_titles[0]
+ ), gr.update(visible=True)

+ # global ticket_choices, ticket_titles, tickets
+ print("REPO name in bug triage: " + str(repoName))
+ repo = "/".join(repoName[:-4].split("/")[-2:])
+ tickets = fetchGithubIssues(repo, 10)
+ print("tickets: " + str(tickets))

+ # Create the dropdown
+ ticket_choices = {ticket["title"]: ticket for ticket in tickets}
+ ticket_titles = [ticket["title"] for ticket in tickets]

+ # Here you want to first call the getGithubIssues function
+ # repo = gr.Interface.get_session_state("repo")
+ # print("REPO name in bug triage: " + str(repoName))
+ # repo = "/".join(repoName[:-4].split("/")[-2:])
+ # tickets = fetchGithubIssues(repo, 10)
+ # print("tickets: " + str(tickets))
+
+ # # Create the dropdown
+ # global ticket_choices
+ print("tickets in bug triage: " + str(tickets))
+ ticket_choices = {ticket["title"]: ticket for ticket in tickets}
+ ticket_titles = [ticket["title"] for ticket in tickets]
+
+ ticketDropdown = gr.Dropdown(choices=ticket_titles, title="Github Issues")
+
+ # Extract the ticket title, body for the selected ticket
+ chatbot = gr.Chatbot()
+ msg = gr.Textbox()
+ clear = gr.Button("Clear")
+
+ if index == 0:
+ msg.submit(solveGithubIssue, [ticketDropdown, chatbot], [msg, chatbot], queue=False).then(
+ bot, chatbot, chatbot
+ )
+ ticketDropdown.change(solveGithubIssue, inputs=[ticketDropdown, chatbot], outputs=[chatbot])
+ else:
+ msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
+ bot, chatbot, chatbot
+ )
+ clear.click(lambda: None, None, chatbot, queue=False)

+ with gr.Tab("AI Code Documentation"):
+
+ # global repoName
+ # First parse through the folder structure and store that as a list of clickable buttons
+ gr.Markdown("""
+ ## AI Generated Code Documentation
+
+ Code documentation comes in 3 flavors - internal engineering, external API documentation and product documentation. Each offers different layers of abstraction over the code base.
+ """)
+
+ # docs = generateDocumentationPerFolder("overview", repo_name)
+ markdown = gr.Markdown()

+ def button_click_callback(markdown):
+ print("IN BUTTON CLICK CALLBACK")
+ docs = generateDocumentationPerFolder("overview", repoName[:-4])
+ markdown.update(docs)

+ # Generate the left column buttons and their names and wrap each one in a function
+ with gr.Row():
+ with gr.Column(scale=.5, min_width=300):
+ dirNames = generateFolderNamesForRepo(repoName[:-4])
+ print(dirNames)
+ buttons = [gr.Button(folder_name, onclick=button_click_callback) for folder_name in dirNames]
+ for btn, folder_name in zip(buttons, dirNames):
+ btn.click(button_click_callback, [markdown], [markdown] )

+ # Generate the overall documentation for the main bubble at the same time
+ print("REPO NAME IN DOCS: " + str(repoName[:-4]))
+ with gr.Column(scale=2, min_width=300):
+ docs = generateDocumentationPerFolder("overview", repoName[:-4])
+ markdown.update(docs)
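Worth noting for this tab: `gr.Button` takes no `onclick` argument, so only the later `btn.click(...)` wiring has any effect, and a Gradio callback updates a Markdown component by returning the new value rather than calling `markdown.update(docs)` inside the function. A sketch of the wiring this code appears to intend — the folder names and docs helper below are stand-ins, not the committed implementation:

```python
import gradio as gr

def generate_docs(folder: str) -> str:   # stand-in for generateDocumentationPerFolder
    return f"## Docs for `{folder}`\n(placeholder)"

with gr.Blocks() as docs_sketch:
    with gr.Row():
        with gr.Column(scale=1, min_width=300):
            # Hypothetical folder names; one button per repo folder.
            folder_buttons = [gr.Button(name) for name in ["api", "models", "utils"]]
        with gr.Column(scale=2, min_width=300):
            markdown = gr.Markdown(value=generate_docs("overview"))

    for btn in folder_buttons:
        # Bind each button to its own folder via a default argument; the
        # returned string becomes the Markdown component's new value.
        btn.click(lambda name=btn.value: generate_docs(name),
                  inputs=None, outputs=markdown)
```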
 
+ # For each folder, generate a diagram and 2-3 prompts that dive deeper into explaining content

+ # Render all the content in the UI
+
+ #

+ with gr.Tab("Custom Model Finetuning"):
+ # First provide a summary of offering
+ gr.Markdown("""
+ # Enterprise Custom Model Finetuning
+
+ Finetuning code generation models directly on your enterprise code base has shown up to 10% increase in model suggestion acceptance rate.
+ """)
+
+ # Choose base model - radio with model size
+ gr.Radio(choices=["Santacoder (1.1B parameter model)", "Incoder (6B parameter model)", "Codegen (16B parameter model)", "Starcoder (15.5B parameter model)"] , value="Starcoder (15.5B parameter model)")
+
+ # Choose existing code base or input a new code base for finetuning -
+ with gr.Row():
+ gr.Markdown("""
+ If you'd like to use the current code base, click this toggle otherwise input the entire code base below.
+ """)
+ existing_repo = gr.Checkbox(value=True, label="Use existing repository")
+ gr.Textbox(label="Input repository", visible=False)
+
+ # Allow option to remove generated files etc etc
+ gr.Markdown("""
+ Finetuned model performance is highly dependent on training data quality. We have currently found that excluding the following file types improves performance. If you'd like to include them, please toggle them.
+ """)
+ file_types = gr.CheckboxGroup(choices=['.bin', '.gen', '.git', '.gz','.jpg', '.lz', '.midi', '.mpq','.png', '.tz'], label="Removed file types")

+ # Based on data above, we should show a field for estimated fine tuning cost
+ # Then we should show the chart for loss
+ def wandb_report(url):
+ iframe = f'<iframe src={url} style="border:none;height:1024px;width:100%">'
+ return gr.HTML(iframe)

+ submit_btn = gr.Button("Start Training")
+ with gr.Column(visible=False) as start_training:
+ # Include the epoch loss table
+ epoch_loss = gr.Dataframe(
+ headers=["Step", "Training Loss", "Validation Loss"],
+ datatype=["number", "number", "number"],
+ row_count=5,
+ col_count=(3, "fixed"),
+ value=[[500, 1.868200, 1.548535], [1000, 1.450100, 1.518277], [1500, 1.659000, 1.486497],
+ [2000, 1.364900, 1.452842], [2500, 1.406300, 1.405151], [3000, 1.276000, 1.346159]]
+ )

+ # After you start training you should see the Wandb report
+ report_url = 'https://wandb.ai/aiswaryasankar/aiswarya-santacoder-finetuning/reports/Aiswarya-Santacoder-Finetuning--Vmlldzo0ODM3MDA4'
+ report = wandb_report(report_url)

+ # Include a playground to compare different models on given tasks
+ # Link to the generated huggingface spaces model if you opt into it
+ # Toggle to select model for the remaining functionality

+ def startTraining(existing_repo, file_types):
+ return {
+ start_training: gr.update(visible=True),
+ }

+ submit_btn.click(
+ startTraining,
+ inputs=[existing_repo, file_types],
+ outputs=[start_training], # report, epoch_loss,
+ )


+ ingest_btn.click(fn=index_repo, inputs=[repoTextBox, ingestedRepos], outputs=[success_response, launch_product], api_name="index_repo")
+
+
+ demo.queue()
+ demo.launch(debug=True, share=True)
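The added `demo.queue()` matters because `bot` and `solveGithubIssue` are generators: Gradio only streams incremental updates from generator callbacks when the queue is enabled. A minimal runnable sketch of that pattern, independent of this app's components:

```python
import time
import gradio as gr

def stream_reply(message, history):
    history = history + [[message, ""]]
    for ch in "...model tokens would stream here...":
        history[-1][1] += ch
        time.sleep(0.02)
        yield history            # each yield repaints the Chatbot

with gr.Blocks() as app:
    chat = gr.Chatbot()
    box = gr.Textbox()
    box.submit(stream_reply, [box, chat], chat)

app.queue()    # generators require the queue; without it streaming fails
app.launch()
```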