Commit
·
ba2ba98
1
Parent(s):
d32b31c
Update app.py
Browse files
app.py
CHANGED
@@ -13,6 +13,7 @@ import os
|
|
13 |
from huggingface_hub import hf_hub_download  # Function for downloading models from the Hugging Face Hub.
|
14 |
from transformers import pipeline
|
15 |
from io import BytesIO
|
|
|
16 |
|
17 |
# Function that extracts text from PDF documents.
|
18 |
def get_pdf_text(pdf_docs):
|
@@ -30,16 +31,34 @@ def get_pdf_text(pdf_docs):
|
|
30 |
def get_text_file(docs):
    """Return the decoded text of each plain-text document.

    Args:
        docs: Iterable of documents. Each item may be raw ``bytes`` /
            ``bytearray``, an in-memory upload exposing ``getvalue()``
            (e.g. ``io.BytesIO``), any other file-like object exposing
            ``read()``, or an already-decoded ``str``.

    Returns:
        list[str]: One string per document, in input order.

    Raises:
        TypeError: If an item is none of the supported kinds.
        UnicodeDecodeError: If a document's bytes are not valid UTF-8.
    """
    text_list = []
    for doc in docs:
        if isinstance(doc, (str, bytes, bytearray)):
            payload = doc
        elif hasattr(doc, "getvalue"):
            # getvalue() returns the whole buffer regardless of the current
            # read position, so an already-consumed upload still yields text.
            payload = doc.getvalue()
        elif hasattr(doc, "read"):
            payload = doc.read()
        else:
            raise TypeError(
                f"unsupported document type: {type(doc).__name__}"
            )
        if isinstance(payload, (bytes, bytearray)):
            # "utf-8-sig" decodes plain UTF-8 identically but drops a leading
            # byte-order mark, which would otherwise leak into the text.
            payload = payload.decode("utf-8-sig")
        text_list.append(payload)
    return text_list
|
36 |
|
37 |
|
|
|
38 |
def get_csv_file(docs):
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
def get_json_file(docs):
|
42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
|
44 |
def get_text_chunks(documents):
|
45 |
text_splitter = RecursiveCharacterTextSplitter(
|
|
|
13 |
from huggingface_hub import hf_hub_download  # Function for downloading models from the Hugging Face Hub.
|
14 |
from transformers import pipeline
|
15 |
from io import BytesIO
|
16 |
+
import pandas as pd
|
17 |
|
18 |
# Function that extracts text from PDF documents.
|
19 |
def get_pdf_text(pdf_docs):
|
|
|
31 |
def get_text_file(docs):
    """Return the decoded text of each plain-text document.

    Args:
        docs: Iterable of documents. Each item may be an in-memory upload
            exposing ``getvalue()`` (e.g. ``io.BytesIO``), any other
            file-like object exposing ``read()``, raw ``bytes`` /
            ``bytearray``, or an already-decoded ``str``.

    Returns:
        list[str]: One string per document, in input order.

    Raises:
        TypeError: If an item is none of the supported kinds.
        UnicodeDecodeError: If a document's bytes are not valid UTF-8.
    """
    text_list = []
    for doc in docs:
        if isinstance(doc, (str, bytes, bytearray)):
            payload = doc
        elif hasattr(doc, "getvalue"):
            # Extract the text from the 'BytesIO' object. getvalue() returns
            # the whole buffer regardless of the current read position.
            payload = doc.getvalue()
        elif hasattr(doc, "read"):
            payload = doc.read()
        else:
            raise TypeError(
                f"unsupported document type: {type(doc).__name__}"
            )
        if isinstance(payload, (bytes, bytearray)):
            # "utf-8-sig" decodes plain UTF-8 identically but drops a leading
            # byte-order mark, which would otherwise leak into the text.
            payload = payload.decode("utf-8-sig")
        text_list.append(payload)
    return text_list
|
37 |
|
38 |
|
39 |
+
|
40 |
def get_csv_file(docs):
    """Extract every non-missing cell of each CSV document as text.

    Args:
        docs: Iterable of CSV sources accepted by ``pandas.read_csv``
            (e.g. uploaded file-like objects).

    Returns:
        list[str]: Cell values as strings. Documents are processed in input
        order; within a document the values are emitted column by column
        (all rows of the first column, then the second, ...). Header names
        are not included.
    """
    text_list = []
    for doc in docs:
        try:
            # dtype=str keeps each cell exactly as written in the file, so
            # type inference cannot turn "007" into "7" or "30" into "30.0".
            df = pd.read_csv(doc, dtype=str)
        except pd.errors.EmptyDataError:
            # A zero-byte upload has no columns and therefore no text.
            continue
        for column in df.columns:
            # Drop missing cells instead of emitting the literal string "nan".
            text_list.extend(df[column].dropna().astype(str).tolist())
    return text_list
|
47 |
|
48 |
def _collect_json_text(node, out):
    """Append every scalar leaf of a parsed JSON value to *out* as a string."""
    if node is None:
        # JSON null carries no text.
        return
    if isinstance(node, dict):
        for value in node.values():
            _collect_json_text(value, out)
    elif isinstance(node, list):
        for item in node:
            _collect_json_text(item, out)
    elif isinstance(node, str):
        out.append(node)
    else:
        # Numbers and booleans are stringified so the result is all text.
        out.append(str(node))


def get_json_file(docs):
    """Extract the text values contained in each JSON document.

    Args:
        docs: Iterable of file-like objects holding UTF-8 encoded JSON.
            Objects exposing ``getvalue()`` (e.g. ``io.BytesIO``) are read
            in full regardless of their current position; others are read
            with ``read()``.

    Returns:
        list[str]: The values found in each document, in document order.
        Dict values and list items are visited in order and nested
        containers are flattened depth-first. Dict keys are not included,
        ``null`` values are skipped, and non-string scalars are converted
        with ``str()``.

    Raises:
        json.JSONDecodeError: If a non-empty document is not valid JSON.
        UnicodeDecodeError: If a document's bytes are not valid UTF-8.
    """
    text_list = []
    for doc in docs:
        raw = doc.getvalue() if hasattr(doc, "getvalue") else doc.read()
        if isinstance(raw, (bytes, bytearray)):
            # "utf-8-sig" also strips a leading BOM, which json.loads rejects.
            raw = raw.decode("utf-8-sig")
        if not raw.strip():
            # An empty upload contains no JSON value and therefore no text.
            continue
        _collect_json_text(json.loads(raw), text_list)
    return text_list
|
62 |
|
63 |
def get_text_chunks(documents):
|
64 |
text_splitter = RecursiveCharacterTextSplitter(
|