Update app.py
Browse files
app.py
CHANGED
@@ -7,6 +7,9 @@ import json
|
|
7 |
import io
|
8 |
import traceback
|
9 |
import csv
|
|
|
|
|
|
|
10 |
|
11 |
# 추론 API 클라이언트 설정
|
12 |
hf_client = InferenceClient(
|
@@ -31,8 +34,13 @@ def load_parquet(filename: str) -> str:
|
|
31 |
except Exception as e:
|
32 |
return f"ํ์ผ์ ์ฝ๋ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}"
|
33 |
|
|
|
|
|
|
|
|
|
|
|
34 |
def respond(message: str, history: List[Dict[str, str]], system_message: str = "", max_tokens: int = 4000, temperature: float = 0.5, top_p: float = 0.9, parquet_data: str = None) -> str:
|
35 |
-
# 시스템 프롬프트
|
36 |
system_prefix = """๋ฐ๋์ ํ๊ธ๋ก ๋ต๋ณํ ๊ฒ. ๋๋ ์
๋ก๋๋ ๋ฐ์ดํฐ๋ฅผ ๊ธฐ๋ฐ์ผ๋ก ์ง๋ฌธ์ ๋ต๋ณํ๋ ์ญํ ์ ํ๋ค.
|
37 |
|
38 |
์ฃผ์ ์ง์นจ:
|
@@ -50,35 +58,36 @@ def respond(message: str, history: List[Dict[str, str]], system_message: str = "
|
|
50 |
system_prefix += f"\n\n๋ฐ์ดํฐ ์์ฝ:\n{data_summary}"
|
51 |
except Exception as e:
|
52 |
print(f"๋ฐ์ดํฐ ๋ก๋ ์ค๋ฅ: {str(e)}")
|
|
|
|
|
|
|
53 |
|
54 |
# 최근 대화 컨텍스트만 유지
|
55 |
recent_history = history[-3:] if history else []
|
56 |
-
|
57 |
-
prompt = system_prefix + "\n\n"
|
58 |
for chat in recent_history:
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
prompt += f"AI: {chat['content']}\n"
|
63 |
-
prompt += f"์ฌ์ฉ์: {message}\nAI:"
|
64 |
|
65 |
try:
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
temperature=temperature,
|
72 |
top_p=top_p,
|
73 |
-
|
74 |
)
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
|
|
79 |
# 응답 정제
|
80 |
-
cleaned_response = clean_response(
|
81 |
yield cleaned_response
|
|
|
82 |
except Exception as e:
|
83 |
error_message = f"์ถ๋ก ์ค๋ฅ: {str(e)}"
|
84 |
print(error_message)
|
@@ -216,6 +225,7 @@ def text_to_parquet(text: str) -> Tuple[str, str, str]:
|
|
216 |
print(f"{error_message}\n{traceback.format_exc()}")
|
217 |
return error_message, "", ""
|
218 |
|
|
|
219 |
def preprocess_text_with_llm(input_text: str) -> str:
|
220 |
if not input_text.strip():
|
221 |
return "์
๋ ฅ ํ
์คํธ๊ฐ ๋น์ด์์ต๋๋ค."
|
@@ -232,56 +242,78 @@ def preprocess_text_with_llm(input_text: str) -> str:
|
|
232 |
- Technology (๊ธฐ์ )
|
233 |
- Politics (์ ์น)
|
234 |
- Culture (๋ฌธํ)
|
235 |
-
5. metadata: ๋ ์ง, ์ถ์ฒ ๋ฑ ์ถ๊ฐ ์ ๋ณด
|
236 |
|
237 |
-
|
238 |
-
|
239 |
-
-
|
240 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
241 |
|
242 |
-
|
243 |
-
|
|
|
|
|
244 |
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
250 |
|
251 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
252 |
|
253 |
try:
|
254 |
-
response =
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
|
|
|
|
261 |
)
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
else:
|
271 |
-
processed_text = response.strip()
|
272 |
-
|
273 |
-
# ์ค๋ณต ์ถ๋ ฅ ์ ๊ฑฐ
|
274 |
-
lines = processed_text.split('\n')
|
275 |
-
unique_lines = []
|
276 |
-
seen_texts = set()
|
277 |
-
|
278 |
-
for line in lines:
|
279 |
-
line = line.strip()
|
280 |
-
if line and '์ถ๋ ฅ:' not in line and line not in seen_texts:
|
281 |
-
unique_lines.append(line)
|
282 |
-
seen_texts.add(line)
|
283 |
-
|
284 |
-
processed_text = '\n'.join(unique_lines)
|
285 |
|
286 |
# CSV 형식 검증
|
287 |
try:
|
@@ -599,3 +631,5 @@ with gr.Blocks(css=css) as demo:
|
|
599 |
if __name__ == "__main__":
|
600 |
demo.launch(share=True)
|
601 |
|
|
|
|
|
|
7 |
import io
|
8 |
import traceback
|
9 |
import csv
|
10 |
+
# HuggingFace 클라이언트 대신 OpenAI 클라이언트 사용
|
11 |
+
from openai import OpenAI
|
12 |
+
import os
|
13 |
|
14 |
# 추론 API 클라이언트 설정
|
15 |
hf_client = InferenceClient(
|
|
|
34 |
except Exception as e:
|
35 |
return f"ํ์ผ์ ์ฝ๋ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}"
|
36 |
|
37 |
+
|
38 |
+
# OpenAI 클라이언트 설정
|
39 |
+
client = OpenAI(api_key=os.getenv("OPEN_AI"))
|
40 |
+
|
41 |
+
# respond 함수 수정
|
42 |
def respond(message: str, history: List[Dict[str, str]], system_message: str = "", max_tokens: int = 4000, temperature: float = 0.5, top_p: float = 0.9, parquet_data: str = None) -> str:
|
43 |
+
# 시스템 프롬프트 설정
|
44 |
system_prefix = """๋ฐ๋์ ํ๊ธ๋ก ๋ต๋ณํ ๊ฒ. ๋๋ ์
๋ก๋๋ ๋ฐ์ดํฐ๋ฅผ ๊ธฐ๋ฐ์ผ๋ก ์ง๋ฌธ์ ๋ต๋ณํ๋ ์ญํ ์ ํ๋ค.
|
45 |
|
46 |
์ฃผ์ ์ง์นจ:
|
|
|
58 |
system_prefix += f"\n\n๋ฐ์ดํฐ ์์ฝ:\n{data_summary}"
|
59 |
except Exception as e:
|
60 |
print(f"๋ฐ์ดํฐ ๋ก๋ ์ค๋ฅ: {str(e)}")
|
61 |
+
|
62 |
+
# 대화 히스토리 구성
|
63 |
+
messages = [{"role": "system", "content": system_prefix}]
|
64 |
|
65 |
# 최근 대화 컨텍스트만 유지
|
66 |
recent_history = history[-3:] if history else []
|
|
|
|
|
67 |
for chat in recent_history:
|
68 |
+
messages.append({"role": chat["role"], "content": chat["content"]})
|
69 |
+
|
70 |
+
messages.append({"role": "user", "content": message})
|
|
|
|
|
71 |
|
72 |
try:
|
73 |
+
# OpenAI API 호출
|
74 |
+
response = client.chat.completions.create(
|
75 |
+
model="gpt-4-0125-preview", # GPT-4-mini ๋ชจ๋ธ ์ฌ์ฉ
|
76 |
+
messages=messages,
|
77 |
+
max_tokens=max_tokens,
|
78 |
temperature=temperature,
|
79 |
top_p=top_p,
|
80 |
+
stream=True
|
81 |
)
|
82 |
+
|
83 |
+
full_response = ""
|
84 |
+
for chunk in response:
|
85 |
+
if chunk.choices[0].delta.content:
|
86 |
+
full_response += chunk.choices[0].delta.content
|
87 |
# 응답 정제
|
88 |
+
cleaned_response = clean_response(full_response)
|
89 |
yield cleaned_response
|
90 |
+
|
91 |
except Exception as e:
|
92 |
error_message = f"์ถ๋ก ์ค๋ฅ: {str(e)}"
|
93 |
print(error_message)
|
|
|
225 |
print(f"{error_message}\n{traceback.format_exc()}")
|
226 |
return error_message, "", ""
|
227 |
|
228 |
+
# preprocess_text_with_llm 함수도 수정
|
229 |
def preprocess_text_with_llm(input_text: str) -> str:
|
230 |
if not input_text.strip():
|
231 |
return "์
๋ ฅ ํ
์คํธ๊ฐ ๋น์ด์์ต๋๋ค."
|
|
|
242 |
- Technology (๊ธฐ์ )
|
243 |
- Politics (์ ์น)
|
244 |
- Culture (๋ฌธํ)
|
245 |
+
5. metadata: ๋ ์ง, ์ถ์ฒ ๋ฑ ์ถ๊ฐ ์ ๋ณด"""
|
246 |
|
247 |
+
try:
|
248 |
+
response = client.chat.completions.create(
|
249 |
+
model="gpt-4-0125-preview",
|
250 |
+
messages=[
|
251 |
+
{"role": "system", "content": system_prompt},
|
252 |
+
{"role": "user", "content": input_text}
|
253 |
+
],
|
254 |
+
max_tokens=4000,
|
255 |
+
temperature=0.1,
|
256 |
+
stream=True
|
257 |
+
)
|
258 |
|
259 |
+
full_response = ""
|
260 |
+
for chunk in response:
|
261 |
+
if chunk.choices[0].delta.content:
|
262 |
+
full_response += chunk.choices[0].delta.content
|
263 |
|
264 |
+
# 응답 정제
|
265 |
+
processed_text = clean_response(full_response)
|
266 |
+
|
267 |
+
# CSV 형식 검증
|
268 |
+
try:
|
269 |
+
from io import StringIO
|
270 |
+
import csv
|
271 |
+
csv.reader(StringIO(processed_text))
|
272 |
+
return processed_text
|
273 |
+
except csv.Error:
|
274 |
+
return "LLM์ด ์ฌ๋ฐ๋ฅธ CSV ํ์์ ์์ฑํ์ง ๋ชปํ์ต๋๋ค. ๋ค์ ์๋ํด์ฃผ์ธ์."
|
275 |
+
|
276 |
+
except Exception as e:
|
277 |
+
error_message = f"์ ์ฒ๋ฆฌ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}"
|
278 |
+
print(error_message)
|
279 |
+
return error_message# preprocess_text_with_llm ํจ์๋ ์์
|
280 |
+
def preprocess_text_with_llm(input_text: str) -> str:
|
281 |
+
if not input_text.strip():
|
282 |
+
return "์
๋ ฅ ํ
์คํธ๊ฐ ๋น์ด์์ต๋๋ค."
|
283 |
+
|
284 |
+
system_prompt = """๋ฐ๋์ ํ๊ธ(ํ๊ตญ์ด)๋ก ๋ต๋ณํ์์ค. ๋น์ ์ ๋ฐ์ดํฐ ์ ์ฒ๋ฆฌ ์ ๋ฌธ๊ฐ์
๋๋ค. ์
๋ ฅ๋ ํ
์คํธ๋ฅผ CSV ๋ฐ์ดํฐ์
ํ์์ผ๋ก ๋ณํํ์ธ์.
|
285 |
|
286 |
+
๊ท์น:
|
287 |
+
1. ์ถ๋ ฅ ํ์: id,text,label,metadata
|
288 |
+
2. id: 1๋ถํฐ ์์ํ๋ ์์ฐจ์ ๋ฒํธ
|
289 |
+
3. text: ์๋ฏธ ์๋ ๋จ์๋ก ๋ถ๋ฆฌ๋ ํ
์คํธ
|
290 |
+
4. label: ํ
์คํธ์ ์ฃผ์ ๋ ์นดํ
๊ณ ๋ฆฌ๋ฅผ ์๋ ๊ธฐ์ค์ผ๋ก ์ ํํ๊ฒ ํ ๊ฐ๋ง ์ ํ
|
291 |
+
- Historical_Figure (์ญ์ฌ์ ์ธ๋ฌผ)
|
292 |
+
- Military_History (๊ตฐ์ฌ ์ญ์ฌ)
|
293 |
+
- Technology (๊ธฐ์ )
|
294 |
+
- Politics (์ ์น)
|
295 |
+
- Culture (๋ฌธํ)
|
296 |
+
5. metadata: ๋ ์ง, ์ถ์ฒ ๋ฑ ์ถ๊ฐ ์ ๋ณด"""
|
297 |
|
298 |
try:
|
299 |
+
response = client.chat.completions.create(
|
300 |
+
model="gpt-4o-mini",
|
301 |
+
messages=[
|
302 |
+
{"role": "system", "content": system_prompt},
|
303 |
+
{"role": "user", "content": input_text}
|
304 |
+
],
|
305 |
+
max_tokens=4000,
|
306 |
+
temperature=0.1,
|
307 |
+
stream=True
|
308 |
)
|
309 |
+
|
310 |
+
full_response = ""
|
311 |
+
for chunk in response:
|
312 |
+
if chunk.choices[0].delta.content:
|
313 |
+
full_response += chunk.choices[0].delta.content
|
314 |
+
|
315 |
+
# ์๋ต ์ ์
|
316 |
+
processed_text = clean_response(full_response)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
317 |
|
318 |
# CSV ํ์ ๊ฒ์ฆ
|
319 |
try:
|
|
|
631 |
if __name__ == "__main__":
|
632 |
demo.launch(share=True)
|
633 |
|
634 |
+
|
635 |
+
llm ๋ชจ๋ธ ๋ณ๊ฒฝํ๋ผ. openai api๋ฅผ ์ด์ฉํ๊ณ ๋ชจ๋ธ์ gpt-4o-mini๋ก ์ค์ ํ๋ผ. apiํค๋ os.getenv("OPEN_AI")๋ฅผ ์ด์ฉํ๋ผ
|