Spaces: prototype

- chat.py +66 -0
- index/build_index.py +37 -0
- index/index.jsonl +0 -0
- prompts/passage_summarization.txt +4 -0
- prompts/question_answering.txt +7 -0
- requirements.txt +5 -0
- utils.py +46 -0
chat.py
ADDED
@@ -0,0 +1,66 @@
import os
import openai
from dotenv import load_dotenv
import jsonlines
from pathlib import Path
from utils import (
    gpt3_embeddings,
    gpt3_completion,
    dot_similarity,
    load_prompt,
)

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

openai.api_key = OPENAI_API_KEY


def search_index(question, indexes, count=4):
    """Rank indexed passages by dot-product similarity to the question."""
    question_embedding = gpt3_embeddings(question)

    similarities = []
    for index in indexes:
        embedding = index["embedding"]
        score = dot_similarity(question_embedding, embedding)
        similarities.append({"index": index, "score": score})

    sorted_similarities = sorted(
        similarities, key=lambda x: x["score"], reverse=True
    )

    return sorted_similarities[:count]


if __name__ == "__main__":
    # Load the prebuilt passage index: one {"content", "embedding"} record per line.
    with jsonlines.open(Path("./index") / "index.jsonl") as passages:
        indexes = list(passages)

    while True:
        question = input("User > ")

        search_results = search_index(question=question, indexes=indexes, count=2)

        # Answer the question against each retrieved passage separately.
        answers = []
        for result in search_results:
            prompt = (
                load_prompt("prompts/question_answering.txt")
                .replace("<<PASSAGE>>", result["index"]["content"])
                .replace("<<QUESTION>>", question)
            )

            answer = gpt3_completion(
                prompt=prompt, max_tokens=80, model="text-curie-001"
            )
            if answer is not None:  # gpt3_completion returns None on API errors
                answers.append(answer)

        # Condense the per-passage answers into a single reply.
        prompt = load_prompt("prompts/passage_summarization.txt").replace(
            "<<PASSAGE>>", "\n".join(answers)
        )

        final_answer = gpt3_completion(prompt=prompt)

        print(f"Bot: {final_answer}")
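To try the prototype (assuming a valid OpenAI key): put OPENAI_API_KEY=... in a .env file at the repository root, run build_index.py from the root (it imports utils and reads ./documents/result.txt relative to the working directory), then start the chat loop with python chat.py.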
index/build_index.py
ADDED
@@ -0,0 +1,37 @@
import os
import re
from pathlib import Path
from dotenv import load_dotenv

import openai
import textwrap
import jsonlines

from utils import gpt3_embeddings

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

openai.api_key = OPENAI_API_KEY

path = Path("./documents")


with open(path / "result.txt", "r") as f:
    lines = f.readlines()
    text = "".join(lines)
    text = re.sub(r"\s+", " ", text)  # whitespace normalization

result = []

# Split the document into ~4000-character chunks and embed each one.
chunks = textwrap.wrap(text, 4000)
for chunk in chunks:
    embedding = gpt3_embeddings(chunk)
    info = {"content": chunk, "embedding": embedding}
    result.append(info)

result_path = Path("./index")

with jsonlines.open(result_path / "index.jsonl", "w") as writer:
    writer.write_all(result)
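requirements.txt pins pdfplumber, but no extraction script ships in this commit, so documents/result.txt was presumably produced separately. A minimal, hypothetical sketch of that step (the input name source.pdf is an assumption, not part of the repo):

# Hypothetical helper, not part of this commit: extract a PDF's text into
# documents/result.txt, the file build_index.py expects to find.
from pathlib import Path

import pdfplumber

docs = Path("./documents")

with pdfplumber.open(docs / "source.pdf") as pdf:  # "source.pdf" is assumed
    # Pages with no extractable text return None; substitute an empty string.
    text = "\n".join(page.extract_text() or "" for page in pdf.pages)

(docs / "result.txt").write_text(text)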
index/index.jsonl
ADDED
The diff for this file is too large to render.
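Each line of index.jsonl is one record written by build_index.py: a text chunk paired with its embedding vector. Schematically (content and vector values abbreviated, not actual data):

{"content": "first ~4000-character chunk of documents/result.txt ...", "embedding": [0.0123, -0.0045, ...]}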
prompts/passage_summarization.txt
ADDED
@@ -0,0 +1,4 @@
Summarize the following passage in detail
passage: <<PASSAGE>>

summary:
prompts/question_answering.txt
ADDED
@@ -0,0 +1,7 @@
Use the passage to write a detailed answer to the following question

passage: <<PASSAGE>>

question: <<QUESTION>>

answer:
requirements.txt
ADDED
@@ -0,0 +1,5 @@
pdfplumber
textwrap3
openai
python-dotenv
jsonlines
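Note that the scripts import the standard-library textwrap, not the textwrap3 backport pinned here, and pdfplumber is only needed for whatever step produced documents/result.txt; neither pin is exercised by the code in this commit.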
utils.py
ADDED
@@ -0,0 +1,46 @@
import openai
import numpy as np


def gpt3_embeddings(text, model="text-similarity-ada-001"):
    """Return the embedding vector for `text`, or None if the API call fails."""
    text = text.replace("\n", " ")
    embedding = None
    try:
        embedding = openai.Embedding.create(input=[text], model=model)["data"][0][
            "embedding"
        ]
    except Exception as err:
        print(f"Sorry, there was a problem: {err}")

    return embedding


def gpt3_completion(prompt, max_tokens=128, model="text-davinci-003"):
    """Return the completion text for `prompt`, or None if the API call fails."""
    response = None
    try:
        response = openai.Completion.create(
            model=model,
            prompt=prompt,
            max_tokens=max_tokens,
        )["choices"][0]["text"]

    except Exception as err:
        print(f"Sorry, there was a problem:\n\n{err}")

    return response


def load_prompt(path):
    with open(path) as f:
        return f.read()


def cosine_similarity(emb1, emb2):
    # cos(a, b) = a . b / (|a| |b|)
    return np.dot(emb1, emb2) / (
        (np.dot(emb1, emb1) ** 0.5) * (np.dot(emb2, emb2) ** 0.5)
    )


def dot_similarity(emb1, emb2):
    return np.dot(emb1, emb2)
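utils.py defines both cosine_similarity and dot_similarity, and chat.py ranks passages with the plain dot product. OpenAI's embedding vectors are, per OpenAI's documentation, normalized to unit length, in which case the two measures give the same ranking. A quick sanity check of that equivalence on synthetic unit vectors:

# Sanity check: on unit-length vectors, dot product equals cosine similarity.
import numpy as np

from utils import cosine_similarity, dot_similarity

rng = np.random.default_rng(0)
a = rng.normal(size=1024)
b = rng.normal(size=1024)
a /= np.linalg.norm(a)  # normalize to unit length, mirroring API embeddings
b /= np.linalg.norm(b)

assert np.isclose(dot_similarity(a, b), cosine_similarity(a, b))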