File size: 2,260 Bytes
192dc63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cdc9be2
 
 
 
192dc63
 
 
cdc9be2
192dc63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# Create embeddings for every report, combine all answers, and create a merged DB

import os 
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS 
import time

from lc_base.chain import openai_chain
from lc_base.database import Data   
from lc_base.logs import save_log

# --- Run configuration ------------------------------------------------------
# input_dir holds the source reports; the DB generated for the reports is
# written below output_dir, and the merged DB is saved to combined_dir.
input_dir = os.path.join("inputs", "policy")
output_dir = os.path.join("outputs", "faiss", "policy_eausa_stuff70")
# NOTE(review): "policy__eausa_stuff70" has a double underscore, unlike the
# folder name used for output_dir -- confirm whether that is intentional.
combined_dir = os.path.join(
    "outputs", "combined", "policy__eausa_stuff70", "faiss_index"
)

# Settings handed to the chain for every query.
search_type = "stuff"  # options: map_reduce, stuff
model_type = "gpt-4-1106-preview"  # options: gpt-3.5-turbo, gpt-4-1106-preview
top_k = 70

# Prompt sent for each report. It is spelled out line by line so that the
# leading/trailing newlines and the trailing space after "findings," stay
# visible in the source.
default_query = (
    "\n"
    "Please generate a comprehensive summary of this document.\n"
    "Ensure the summary is presented in a formal style, and if there are any contradictions or variations in the findings, \n"
    "address them appropriately. The summary should be approximately 1/6 of your input capacity and can be structured in paragraphs or bullet points.\n"
)

# Step 1: generate the FAISS embeddings for the reports in input_dir.
# NOTE(review): check_output() presumably prepares/validates output_dir --
# confirm against lc_base.database.
data = Data(inp_dir=input_dir, out_dir=output_dir)
data.check_output()
data.get_faiss_embeddings()

# Step 2: collect the per-report folders.
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# combined text (and therefore the merged DB) reproducible between runs.
# Stray files (e.g. OS metadata files) are skipped because only directories
# can contain a 'faiss_index' sub-folder.
list_dir = sorted(
    entry for entry in os.listdir(output_dir)
    if os.path.isdir(os.path.join(output_dir, entry))
)

# Step 3: query every report index and collect the answers.
responses = []

for report_name in list_dir:
    path = os.path.join(output_dir, report_name, 'faiss_index')
    chain = openai_chain(inp_dir=path)

    print('Getting reponse for ' + str(report_name))
    query = default_query
    start_time = time.time()
    response = chain.get_response(query, k=top_k, type=search_type, model_name=model_type)
    # Stop the clock before printing so that only the chain call is timed.
    time_taken = time.time() - start_time
    print(response)
    save_log(file_path='logs/combined_policy.csv', query=query, response=response, model_name=model_type,
             time_taken=time_taken, inp=input_dir, data=report_name)
    responses.append(str(response))

# Join with a newline: the splitter below breaks on "\n", and without a
# separator the last line of one summary would fuse with the first line of
# the next one.
comb_response = "\n".join(responses)

# Step 4: split the combined text into overlapping chunks.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
texts = text_splitter.split_text(comb_response)

# With no text there is nothing to index; stop with a clear message instead
# of failing later inside the embedding / index construction.
if not texts:
    raise SystemExit(
        "No summaries were produced from " + repr(output_dir) + "; nothing to index."
    )

# Step 5: embed the chunks with OpenAI embeddings and build the merged index.
embedding = OpenAIEmbeddings()
db = FAISS.from_texts(texts, embedding)

# Step 6: persist the merged index.
db.save_local(combined_dir)