ginipick committed on
Commit
ffb5b8d
โ€ข
1 Parent(s): 12e6818

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -11
app.py CHANGED
@@ -119,29 +119,62 @@ def upload_parquet(file_path: str) -> Tuple[str, str, str]:
119
def text_to_parquet(text: str) -> Tuple[str, str, str]:
    """Convert raw CSV text into a Parquet file.

    Parses *text* as comma-separated data with the fixed columns
    id/text/label/metadata, writes it to 'text_to_parquet.parquet',
    and returns (status message, file preview, filename). On any
    failure the error message is returned with empty preview/filename.
    """
    try:
        from io import StringIO

        # Read the CSV payload straight from memory; every field is
        # treated as fully double-quoted with backslash escapes.
        df = pd.read_csv(
            StringIO(text),
            sep=',',
            dtype=str,
            quoting=csv.QUOTE_ALL,   # all fields quoted
            escapechar='\\',         # backslash escapes inside fields
            engine='python',         # python engine supports these options
            header=None,             # input carries no header row
            names=['id', 'text', 'label', 'metadata'],
        )

        # Tighten the column dtypes before serialising.
        df = df.astype({'id': 'int32', 'text': 'string',
                        'label': 'string', 'metadata': 'string'})

        # Serialise to Parquet with snappy compression.
        parquet_filename = 'text_to_parquet.parquet'
        df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')

        # Build a textual preview of the freshly written file.
        parquet_content = load_parquet(parquet_filename)

        return f"{parquet_filename} ํŒŒ์ผ์ด ์„ฑ๊ณต์ ์œผ๋กœ ๋ณ€ํ™˜๋˜์—ˆ์Šต๋‹ˆ๋‹ค.", parquet_content, parquet_filename
    except Exception as e:
        error_message = f"ํ…์ŠคํŠธ ๋ณ€ํ™˜ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}\n{traceback.format_exc()}"
        print(error_message)
        return error_message, "", ""
146
 
147
  def preprocess_text_with_llm(input_text: str) -> str:
 
119
def text_to_parquet(text: str) -> Tuple[str, str, str]:
    """Normalise loosely formatted CSV text and convert it to Parquet.

    Every non-empty line of *text* is expected to hold four fields
    (id, text, label, metadata); lines that do not parse into exactly
    four fields are silently dropped. The surviving rows are written to
    'text_to_parquet.parquet'.

    Returns:
        (status message, file preview, filename) on success,
        (error message, "", "") on failure.
    """
    try:
        from io import StringIO
        import csv

        # Re-emit surviving rows through csv.writer so embedded commas
        # and quotes are escaped correctly. (The previous manual
        # f-string quoting produced broken CSV whenever a field
        # contained a double quote.)
        out_buffer = StringIO()
        writer = csv.writer(out_buffer, quoting=csv.QUOTE_ALL)
        kept = 0

        for raw_line in text.strip().split('\n'):
            # Skip blank lines.
            if not raw_line.strip():
                continue

            # Collapse doubled double-quotes the input may contain.
            candidate = raw_line.replace('""', '"')

            # Parse one line; malformed lines are skipped, not fatal.
            # Narrow except: a bare `except:` here would also swallow
            # KeyboardInterrupt/SystemExit.
            try:
                fields = next(csv.reader(StringIO(candidate), quoting=csv.QUOTE_ALL))
            except (csv.Error, StopIteration):
                continue
            if len(fields) == 4:  # id, text, label, metadata
                writer.writerow(fields)
                kept += 1

        if kept == 0:
            # No usable rows: surface a clear error instead of letting
            # pd.read_csv raise an opaque EmptyDataError below.
            raise ValueError("no parsable 4-column rows found in input")

        # Build the DataFrame from the sanitised CSV. csv.writer uses
        # standard doubled-quote escaping, so no escapechar is needed.
        df = pd.read_csv(
            StringIO(out_buffer.getvalue()),
            sep=',',
            quoting=csv.QUOTE_ALL,
            names=['id', 'text', 'label', 'metadata'],
        )

        # Tighten the column dtypes before serialising; a non-numeric
        # id column raises here and falls into the error path.
        df = df.astype({'id': 'int32', 'text': 'string', 'label': 'string', 'metadata': 'string'})

        # Write the Parquet file with snappy compression.
        parquet_filename = 'text_to_parquet.parquet'
        df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')

        # Build a textual preview of the freshly written file.
        parquet_content = load_parquet(parquet_filename)

        return f"{parquet_filename} ํŒŒ์ผ์ด ์„ฑ๊ณต์ ์œผ๋กœ ๋ณ€ํ™˜๋˜์—ˆ์Šต๋‹ˆ๋‹ค.", parquet_content, parquet_filename

    except Exception as e:
        error_message = f"ํ…์ŠคํŠธ ๋ณ€ํ™˜ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}"
        print(f"{error_message}\n{traceback.format_exc()}")
        return error_message, "", ""
179
 
180
  def preprocess_text_with_llm(input_text: str) -> str: