[email protected]
committed on
Commit
·
333bd63
1
Parent(s):
a7f4e5b
edit codes
Browse files
app.py
CHANGED
@@ -31,22 +31,22 @@ def get_pdf_text(pdf_docs):
|
|
31 |
# 아래 텍스트 추출 함수를 작성
|
32 |
|
33 |
def get_text_file(docs):
|
34 |
-
|
35 |
-
|
36 |
-
with open(
|
37 |
f.write(docs.getvalue()) # TXT 문서의 내용을 임시 파일에 씁니다.
|
38 |
-
with open(
|
39 |
txt_doc = txt_file.read() # 텍스트를 추출합니다.
|
40 |
return txt_doc # 추출한 텍스트를 반환합니다.
|
41 |
|
42 |
def get_csv_file(docs):
|
43 |
-
|
44 |
-
|
45 |
-
with open(
|
46 |
f.write(docs.getvalue())
|
47 |
|
48 |
csv_data = []
|
49 |
-
with open(
|
50 |
csv_reader = csv.reader(csv_file)
|
51 |
for row in csv_reader:
|
52 |
csv_data.append(row)
|
@@ -54,14 +54,16 @@ def get_csv_file(docs):
|
|
54 |
return csv_data
|
55 |
|
56 |
def get_json_file(docs):
|
57 |
-
|
58 |
-
|
59 |
-
with open(
|
60 |
f.write(docs.getvalue())
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
|
|
|
|
65 |
return json_data
|
66 |
|
67 |
|
@@ -72,17 +74,9 @@ def get_text_chunks(documents):
|
|
72 |
chunk_overlap=200, # 청크 사이의 중복을 지정합니다.
|
73 |
length_function=len # 텍스트의 길이를 측정하는 함수를 지정합니다.
|
74 |
)
|
75 |
-
texts = []
|
76 |
-
for doc in documents:
|
77 |
-
if isinstance(doc, str):
|
78 |
-
# doc์ด ๋ฌธ์์ด์ธ ๊ฒฝ์ฐ ์ง์ texts์ ์ถ๊ฐ
|
79 |
-
texts.append(doc)
|
80 |
-
else:
|
81 |
-
# doc์ด 'page_content' ์์ฑ์ ๊ฐ์ถ ๊ฐ์ฒด์ธ ๊ฒฝ์ฐ
|
82 |
-
texts.append(doc.page_content)
|
83 |
|
84 |
-
|
85 |
-
return
|
86 |
|
87 |
|
88 |
# 텍스트 청크들로부터 벡터 스토어를 생성하는 함수입니다.
|
|
|
31 |
# 아래 텍스트 추출 함수를 작성
|
32 |
|
33 |
def get_text_file(docs):
|
34 |
+
temp_dir2 = tempfile.TemporaryDirectory() # 임시 디렉토리를 생성합니다.
|
35 |
+
temp_filepath2 = os.path.join(temp_dir2.name, docs.name) # 임시 파일 경로를 생성합니다.
|
36 |
+
with open(temp_filepath2, "wb") as f: # 임시 파일을 바이너리 쓰기 모드로 엽니다.
|
37 |
f.write(docs.getvalue()) # TXT 문서의 내용을 임시 파일에 씁니다.
|
38 |
+
with open(temp_filepath2, "r") as txt_file: # TXT 파일을 바이너리 쓰기 모드로 엽니다.
|
39 |
txt_doc = txt_file.read() # 텍스트를 추출합니다.
|
40 |
return txt_doc # 추출한 텍스트를 반환합니다.
|
41 |
|
42 |
def get_csv_file(docs):
|
43 |
+
temp_dir3 = tempfile.TemporaryDirectory()
|
44 |
+
temp_filepath3 = os.path.join(temp_dir3.name, docs.name)
|
45 |
+
with open(temp_filepath3, "wb") as f:
|
46 |
f.write(docs.getvalue())
|
47 |
|
48 |
csv_data = []
|
49 |
+
with open(temp_filepath3, "r") as csv_file:
|
50 |
csv_reader = csv.reader(csv_file)
|
51 |
for row in csv_reader:
|
52 |
csv_data.append(row)
|
|
|
54 |
return csv_data
|
55 |
|
56 |
def get_json_file(docs):
|
57 |
+
temp_dir4 = tempfile.TemporaryDirectory()
|
58 |
+
temp_filepath4 = os.path.join(temp_dir4.name, docs.name)
|
59 |
+
with open(temp_filepath4, "wb") as f:
|
60 |
f.write(docs.getvalue())
|
61 |
+
json_loader = JSONLoader(
|
62 |
+
file_path=temp_filepath4,
|
63 |
+
jq_schema='.messages[].content',
|
64 |
+
text_content=False
|
65 |
+
)
|
66 |
+
json_data = json_loader.load()
|
67 |
return json_data
|
68 |
|
69 |
|
|
|
74 |
chunk_overlap=200, # 청크 사이의 중복을 지정합니다.
|
75 |
length_function=len # 텍스트의 길이를 측정하는 함수를 지정합니다.
|
76 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
|
78 |
+
documents = text_splitter.split_documents(documents) # 문서들을 청크로 나눕니다
|
79 |
+
return documents # 나눈 청크를 반환합니다.
|
80 |
|
81 |
|
82 |
# 텍스트 청크들로부터 벡터 스토어를 생성하는 함수입니다.
|