Hantr committed
Commit 84d8fdc · 1 Parent(s): 8f64ec9
Files changed (1)
  1. app.py +19 -5
app.py CHANGED
@@ -13,7 +13,7 @@ from langchain.llms import HuggingFaceHub, LlamaCpp, CTransformers # For loadin
 from langchain.document_loaders import PyPDFLoader, TextLoader, JSONLoader, CSVLoader
 import tempfile  # Library for creating temporary files.
 import os
-
+import numpy as np
 
 # Function that extracts text from PDF documents.
 def get_pdf_text(pdf_docs):
@@ -25,9 +25,6 @@ def get_pdf_text(pdf_docs):
     pdf_doc = pdf_loader.load()  # Extract the text.
     return pdf_doc  # Return the extracted text.
 
-# Assignment
-# Write the text extraction functions below
-
 
 def get_text_file(txt_docs):
     temp_dir = tempfile.TemporaryDirectory()
@@ -78,7 +75,24 @@ def get_vectorstore(text_chunks):
     # Load the OpenAI embedding model. (Embedding models - Ada v2)
 
     embeddings = OpenAIEmbeddings()
-    vectorstore = FAISS.from_documents(text_chunks, embeddings)  # Create a FAISS vector store.
+
+    chunk_embeddings = []
+    for chunk in text_chunks:
+        chunk_embedding = embeddings.encode(chunk)
+        chunk_embeddings.append(chunk_embedding)
+
+    # Restructure the vectors into a form suitable for FAISS.
+    flat_embeddings = [emb for chunk in chunk_embeddings for emb in chunk]
+
+    # Check the number of embedding dimensions.
+    num_dims = len(flat_embeddings[0]) if flat_embeddings else 0
+
+    # Rearrange the vectors into a form that can be passed to FAISS.
+    vectors = np.array(flat_embeddings).astype('float32')
+    vectors = vectors.reshape(len(flat_embeddings), num_dims)
+
+    # Create the FAISS index.
+    vectorstore = FAISS.from_numpy(vectors)
 
     return vectorstore  # Return the created vector store.
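Note: the added `embeddings.encode(...)` and `FAISS.from_numpy(...)` calls do not match the LangChain APIs used elsewhere in app.py as far as I can tell; `OpenAIEmbeddings` exposes `embed_documents`/`embed_query`, and the LangChain `FAISS` wrapper is normally built with `from_documents` or `from_embeddings`. Below is a minimal sketch (not the committed code) of the same precompute-the-vectors idea, assuming `text_chunks` is the list of LangChain `Document` objects produced earlier in app.py:

import numpy as np
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

def get_vectorstore(text_chunks):
    # Load the OpenAI embedding model (Embedding models - Ada v2).
    embeddings = OpenAIEmbeddings()

    # Embed every chunk in one batched call; embed_documents expects plain strings.
    texts = [chunk.page_content for chunk in text_chunks]
    chunk_embeddings = embeddings.embed_documents(texts)

    # Sanity-check the shape FAISS will receive: (num_chunks, embedding_dim).
    vectors = np.asarray(chunk_embeddings, dtype="float32")
    assert vectors.ndim == 2, "expected one fixed-length vector per chunk"

    # Build the vector store from (text, embedding) pairs; the embeddings object
    # is passed as well so queries can be embedded at search time.
    vectorstore = FAISS.from_embeddings(
        text_embeddings=list(zip(texts, chunk_embeddings)),
        embedding=embeddings,
    )
    return vectorstore  # Return the created vector store.

The single line `FAISS.from_documents(text_chunks, embeddings)` that this commit removes produces the same store without the manual embedding step; precomputing the vectors only matters if you want direct access to them, e.g. for the numpy dimension check shown in the diff.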