File size: 759 Bytes
b678100
 
 
 
 
 
 
 
 
5a1b165
b678100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import os
import re
from pathlib import Path
from dotenv import load_dotenv

import openai
import textwrap
import jsonlines

from src.utils import gpt3_embeddings

# Load settings from a .env file (if one is present) before the API key
# is looked up in the environment.
load_dotenv()

# The lookup yields None when OPENAI_API_KEY is not set; the value is
# handed to the openai client unchanged either way.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
openai.api_key = OPENAI_API_KEY

path = Path("./documents")

# Read the whole source document and collapse every run of whitespace
# (newlines, tabs, repeated spaces) into a single space, so the text that
# gets chunked later is one continuous line.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open(path / "result.txt", "r", encoding="utf-8") as f:
    # Raw string: "\s" inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    text = re.sub(r"\s+", " ", f.read())  # white space normalization

# Split the normalized text into pieces of at most 4000 characters
# (textwrap breaks on whitespace where it can).
chunks = textwrap.wrap(text, 4000)

# One record per chunk: the chunk itself plus whatever gpt3_embeddings
# returns for it. Records keep the order of the chunks in the document.
result = [
    {"content": piece, "embedding": gpt3_embeddings(piece)}
    for piece in chunks
]

result_path = Path("./index")

# Make sure the output directory exists; opening the file for writing
# would otherwise fail with FileNotFoundError on a fresh checkout.
result_path.mkdir(parents=True, exist_ok=True)

# Persist the records as JSON Lines: one {"content", "embedding"} object
# per line, overwriting any previous index file.
with jsonlines.open(result_path / "index.jsonl", "w") as writer:
    writer.write_all(result)