zinoubm commited on
Commit
b678100
·
1 Parent(s): db68fe3

'prototype'

Browse files
chat.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from pathlib import Path

import jsonlines
import openai
from dotenv import load_dotenv

from utils import (
    gpt3_embeddings,
    gpt3_completion,
    dot_similarity,
    load_prompt,
)

# Pull OPENAI_API_KEY from a local .env file and configure the client
# once at import time, before any API call is made.
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

openai.api_key = OPENAI_API_KEY
20
def search_index(question, indexes, count=4):
    """Return the *count* index entries most similar to *question*.

    Parameters
    ----------
    question : str
        User query; embedded via the GPT-3 embeddings endpoint.
    indexes : iterable of dict
        Index entries; each must carry an "embedding" vector under the
        "embedding" key (as produced by build_index.py).
    count : int, optional
        How many of the top-scoring entries to return (default 4).

    Returns
    -------
    list of dict
        ``{"index": entry, "score": similarity}`` records, ordered from
        most to least similar.
    """
    query_vector = gpt3_embeddings(question)

    scored = [
        {"index": entry, "score": dot_similarity(query_vector, entry["embedding"])}
        for entry in indexes
    ]

    # Stable descending sort, then truncate — same result as
    # sorted(..., reverse=True)[:count].
    scored.sort(key=lambda item: item["score"], reverse=True)
    return scored[:count]
34
+
35
+
36
if __name__ == "__main__":
    # Load the pre-built embedding index (one JSON record per line,
    # written by index/build_index.py).
    with jsonlines.open(Path("./index") / "index.jsonl") as passages:
        indexes = list(passages)

    # Simple REPL: embed the question, retrieve the two closest passages,
    # answer against each passage, then summarize the partial answers.
    while True:
        question = input("User >")

        search_results = search_index(question=question, indexes=indexes, count=2)

        answers = []
        for result in search_results:
            print("iterating over answering questions")

            # FIX: the prompt paths were written with backslashes
            # ("prompts\question_answering.txt"), which only works on
            # Windows and relies on fragile string escapes. pathlib joins
            # are portable and match the rest of the file.
            prompt = (
                load_prompt(Path("prompts") / "question_answering.txt")
                .replace("<<PASSAGE>>", result["index"]["content"])
                .replace("<<QUESTION>>", question)
            )

            answer = gpt3_completion(
                prompt=prompt, max_tokens=80, model="text-curie-001"
            )
            answers.append(answer)

        prompt = load_prompt(Path("prompts") / "passage_summarization.txt").replace(
            "<<PASSAGE>>", "\n".join(answers)
        )

        final_answer = gpt3_completion(prompt=prompt)

        print(f"Bot: {final_answer}")
index/build_index.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import re
import textwrap
from pathlib import Path

import jsonlines
import openai
from dotenv import load_dotenv

from utils import gpt3_embeddings

# Configure the OpenAI client from a local .env file.
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

openai.api_key = OPENAI_API_KEY

path = Path("./documents")


# Read the extracted document text and collapse every run of whitespace
# to a single space so chunk boundaries are not skewed by layout artifacts.
with open(path / "result.txt", "r") as f:
    text = f.read()
    # FIX: the pattern must be a raw string — "\s" in a plain string is an
    # invalid escape sequence (SyntaxWarning on modern Python).
    text = re.sub(r"\s+", " ", text)  # white space normalization

result = []

# Split into ~4000-character chunks (a coarse token budget) and embed each.
chunks = textwrap.wrap(text, 4000)
for chunk in chunks:
    embedding = gpt3_embeddings(chunk)
    info = {"content": chunk, "embedding": embedding}
    result.append(info)

result_path = Path("./index")

# Persist one {"content", "embedding"} record per line for chat.py to load.
with jsonlines.open(result_path / "index.jsonl", "w") as writer:
    writer.write_all(result)
index/index.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
prompts/passage_summarization.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Summarize the following passage in detail
2
+ passage: <<PASSAGE>>
3
+
4
+ summary:
prompts/question_answering.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Use the passage to write a detailed answer to the following question
2
+
3
+ passage: <<PASSAGE>>
4
+
5
+ question: <<QUESTION>>
6
+
7
+ answer:
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
pdfplumber
textwrap3
openai
python-dotenv
jsonlines
numpy
utils.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ import numpy as np
3
+
4
+
5
def gpt3_embeddings(text, model="text-similarity-ada-001"):
    """Embed *text* with the given OpenAI embedding model.

    Newlines are flattened to spaces before the call. Returns the
    embedding vector (list of floats), or ``None`` if the API call
    raises — the error is printed, not re-raised.
    """
    cleaned = text.replace("\n", " ")
    try:
        response = openai.Embedding.create(input=[cleaned], model=model)
        return response["data"][0]["embedding"]
    except Exception as err:
        print(f"Sorry, There was a problem {err}")
        return None
16
+
17
+
18
def gpt3_completion(prompt, max_tokens=128, model="text-davinci-003"):
    """Run *prompt* through an OpenAI completion model.

    Returns the completion text of the first choice, or ``None`` if the
    API call raises — the error is printed, not re-raised.
    """
    try:
        api_result = openai.Completion.create(
            model=model,
            prompt=prompt,
            max_tokens=max_tokens,
        )
        return api_result["choices"][0]["text"]
    except Exception as err:
        print(f"Sorry, There was a problem \n\n {err}")
        return None
31
+
32
+
33
def load_prompt(path):
    """Read the prompt template at *path* and return it as one string."""
    with open(path) as handle:
        return handle.read()
37
+
38
+
39
def cosine_similarity(emb1, emb2):
    """Cosine of the angle between two vectors (1.0 = same direction)."""
    norm_product = np.linalg.norm(emb1) * np.linalg.norm(emb2)
    return np.dot(emb1, emb2) / norm_product
43
+
44
+
45
def dot_similarity(emb1, emb2):
    """Inner product of two embedding vectors, used as a similarity score.

    NOTE(review): this is only a meaningful similarity if the embeddings
    are (roughly) unit-normalized — confirm for the model in use.
    """
    return np.dot(emb1, emb2)