Update app.py
Browse files
app.py
CHANGED
@@ -119,29 +119,62 @@ def upload_parquet(file_path: str) -> Tuple[str, str, str]:
|
|
119 |
def text_to_parquet(text: str) -> Tuple[str, str, str]:
|
120 |
try:
|
121 |
from io import StringIO
|
122 |
-
|
123 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
df = pd.read_csv(
|
125 |
-
|
126 |
sep=',',
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
engine='python', # Python ์์ง ์ฌ์ฉ
|
131 |
-
header=None, # ์ฒซ ๋ฒ์งธ ํ์ ์ด ์ด๋ฆ์ผ๋ก ์ฌ์ฉํ์ง ์์
|
132 |
-
names=['id', 'text', 'label', 'metadata'] # ์ด ์ด๋ฆ ์ง์
|
133 |
)
|
|
|
134 |
# ๋ฐ์ดํฐ ์ ํ ์ต์ ํ
|
135 |
df = df.astype({'id': 'int32', 'text': 'string', 'label': 'string', 'metadata': 'string'})
|
|
|
136 |
# Parquet ํ์ผ๋ก ๋ณํ
|
137 |
parquet_filename = 'text_to_parquet.parquet'
|
138 |
df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
|
|
|
139 |
# Parquet ํ์ผ ๋ด์ฉ ๋ฏธ๋ฆฌ๋ณด๊ธฐ
|
140 |
parquet_content = load_parquet(parquet_filename)
|
|
|
141 |
return f"{parquet_filename} ํ์ผ์ด ์ฑ๊ณต์ ์ผ๋ก ๋ณํ๋์์ต๋๋ค.", parquet_content, parquet_filename
|
|
|
142 |
except Exception as e:
|
143 |
-
error_message = f"ํ
์คํธ ๋ณํ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}
|
144 |
-
print(error_message)
|
145 |
return error_message, "", ""
|
146 |
|
147 |
def preprocess_text_with_llm(input_text: str) -> str:
|
|
|
def text_to_parquet(text: str) -> Tuple[str, str, str]:
    """Convert CSV-like text (rows of id,text,label,metadata) into a Parquet file.

    Each input line is parsed as CSV; lines that do not yield exactly four
    fields are dropped. The surviving rows are written to
    'text_to_parquet.parquet' (pyarrow engine, snappy compression).

    Returns:
        Tuple of (status message, parquet preview content, parquet filename).
        On failure the last two elements are empty strings and the first
        carries the error message.
    """
    try:
        from io import StringIO
        import csv

        # Clean the input text: keep only lines that parse as exactly the
        # four expected CSV fields (id, text, label, metadata).
        cleaned_lines = []
        for raw_line in text.strip().split('\n'):
            # Skip blank lines.
            if not raw_line.strip():
                continue

            # Collapse doubled quotes left over from copy/paste.
            raw_line = raw_line.replace('""', '"')

            # Parse the line as CSV; malformed lines are silently dropped
            # (narrow except — the original bare `except:` swallowed
            # everything, including KeyboardInterrupt).
            try:
                with StringIO(raw_line) as temp_buffer:
                    parsed_line = next(csv.reader(temp_buffer, quoting=csv.QUOTE_ALL))
            except (csv.Error, StopIteration):
                continue

            if len(parsed_line) == 4:  # id, text, label, metadata
                # Re-quote the three text fields, doubling any embedded
                # quotes so the reconstructed line stays valid CSV (the
                # original emitted embedded quotes unescaped, corrupting
                # the re-parse below).
                fields = [f.replace('"', '""') for f in parsed_line[1:]]
                cleaned_lines.append(
                    f'{parsed_line[0]},"{fields[0]}","{fields[1]}","{fields[2]}"'
                )

        # Fail early with a clear message instead of letting pandas raise
        # an opaque EmptyDataError on an empty buffer.
        if not cleaned_lines:
            raise ValueError("no valid CSV lines (id,text,label,metadata) found in input")

        # Build the DataFrame from the cleaned CSV data.
        df = pd.read_csv(
            StringIO('\n'.join(cleaned_lines)),
            sep=',',
            quoting=csv.QUOTE_ALL,
            escapechar='\\',
            names=['id', 'text', 'label', 'metadata']
        )

        # Optimize column dtypes ('id' must be numeric; failures fall
        # through to the except below).
        df = df.astype({'id': 'int32', 'text': 'string', 'label': 'string', 'metadata': 'string'})

        # Convert to a Parquet file.
        parquet_filename = 'text_to_parquet.parquet'
        df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')

        # Preview the Parquet file content.
        parquet_content = load_parquet(parquet_filename)

        return f"{parquet_filename} ํ์ผ์ด ์ฑ๊ณต์ ์ผ๋ก ๋ณํ๋์์ต๋๋ค.", parquet_content, parquet_filename

    except Exception as e:
        error_message = f"ํ์คํธ ๋ณํ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}"
        print(f"{error_message}\n{traceback.format_exc()}")
        return error_message, "", ""
|
179 |
|
180 |
def preprocess_text_with_llm(input_text: str) -> str:
|