ginipick committed on
Commit
b058138
•
1 Parent(s): 79997b0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -4
app.py CHANGED
@@ -117,9 +117,10 @@ def upload_parquet(file_path: str) -> Tuple[str, str, str]:
117
 
118
  def text_to_parquet(text: str) -> Tuple[str, str, str]:
119
  try:
120
- # ํ…์ŠคํŠธ๋ฅผ DataFrame์œผ๋กœ ๋ณ€ํ™˜ (๊ฐ ํ–‰์€ ์ฝค๋งˆ๋กœ ๊ตฌ๋ถ„)
121
- data = [line.strip().split(',') for line in text.strip().split('\n')]
122
- df = pd.DataFrame(data, columns=['id', 'text', 'label', 'metadata'])
 
123
  # 데이터 유형 최적화
124
  df = df.astype({'id': 'int32', 'text': 'string', 'label': 'string', 'metadata': 'string'})
125
  # Parquet 파일로 변환
@@ -135,7 +136,7 @@ def preprocess_text_with_llm(input_text: str) -> str:
135
  # LLM์—๊ฒŒ ์ž…๋ ฅ ํ…์ŠคํŠธ๋ฅผ ์ „์ฒ˜๋ฆฌํ•˜๋„๋ก ์š”์ฒญ
136
  system_prompt = """당신은 입력된 긴 텍스트를 데이터셋 형식에 맞게 전처리하는 역할을 합니다.
137
  - ๋ฐ์ดํ„ฐ์…‹ ํ˜•์‹์€ id,text,label,metadata์ž…๋‹ˆ๋‹ค.
138
- - 각 행은 쉼표로 구분되며, 텍스트 내에 쉼표가 있을 경우 제거하거나 다른 문자로 대체합니다.
139
  - 텍스트를 의미 단위로 분할하고, 적절히 문장을 재구성하고 편집하여 최적화된 문장으로 만듭니다.
140
  - ๊ฐ ๋ฌธ์žฅ์— ๋Œ€ํ•ด id๋ฅผ ๋ถ€์—ฌํ•˜๊ณ , ์ ์ ˆํ•œ label(์นดํ…Œ๊ณ ๋ฆฌ)์„ ์ง€์ •ํ•ฉ๋‹ˆ๋‹ค.
141
  - metadata์—๋Š” ์ถœ์ฒ˜๋‚˜ ๋‚ ์งœ ๋“ฑ์˜ ์ถ”๊ฐ€ ์ •๋ณด๋ฅผ ํฌํ•จํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.
 
117
 
118
  def text_to_parquet(text: str) -> Tuple[str, str, str]:
119
  try:
120
+ from io import StringIO
121
+ # CSV ๋ฐ์ดํ„ฐ๋ฅผ StringIO๋ฅผ ํ†ตํ•ด ์ฝ๊ธฐ
122
+ csv_data = StringIO(text)
123
+ df = pd.read_csv(csv_data, sep=',', dtype=str)
124
  # 데이터 유형 최적화
125
  df = df.astype({'id': 'int32', 'text': 'string', 'label': 'string', 'metadata': 'string'})
126
  # Parquet 파일로 변환
 
136
  # LLM์—๊ฒŒ ์ž…๋ ฅ ํ…์ŠคํŠธ๋ฅผ ์ „์ฒ˜๋ฆฌํ•˜๋„๋ก ์š”์ฒญ
137
  system_prompt = """당신은 입력된 긴 텍스트를 데이터셋 형식에 맞게 전처리하는 역할을 합니다.
138
  - ๋ฐ์ดํ„ฐ์…‹ ํ˜•์‹์€ id,text,label,metadata์ž…๋‹ˆ๋‹ค.
139
+ - 각 행은 쉼표로 구분되며, **텍스트나 다른 필드 내에 쉼표가 있을 경우 해당 필드를 큰따옴표(")로 감쌉니다.**
140
  - 텍스트를 의미 단위로 분할하고, 적절히 문장을 재구성하고 편집하여 최적화된 문장으로 만듭니다.
141
  - ๊ฐ ๋ฌธ์žฅ์— ๋Œ€ํ•ด id๋ฅผ ๋ถ€์—ฌํ•˜๊ณ , ์ ์ ˆํ•œ label(์นดํ…Œ๊ณ ๋ฆฌ)์„ ์ง€์ •ํ•ฉ๋‹ˆ๋‹ค.
142
  - metadata์—๋Š” ์ถœ์ฒ˜๋‚˜ ๋‚ ์งœ ๋“ฑ์˜ ์ถ”๊ฐ€ ์ •๋ณด๋ฅผ ํฌํ•จํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.