ginipick committed on
Commit
2caf879
•
1 Parent(s): 571a14d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -62
app.py CHANGED
@@ -7,6 +7,9 @@ import json
7
  import io
8
  import traceback
9
  import csv
 
 
 
10
 
11
  # ์ถ”๋ก  API ํด๋ผ์ด์–ธํŠธ ์„ค์ •
12
  hf_client = InferenceClient(
@@ -31,8 +34,13 @@ def load_parquet(filename: str) -> str:
31
  except Exception as e:
32
  return f"ํŒŒ์ผ์„ ์ฝ๋Š” ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}"
33
 
 
 
 
 
 
34
  def respond(message: str, history: List[Dict[str, str]], system_message: str = "", max_tokens: int = 4000, temperature: float = 0.5, top_p: float = 0.9, parquet_data: str = None) -> str:
35
- # ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ ๊ฐ•ํ™”
36
  system_prefix = """๋ฐ˜๋“œ์‹œ ํ•œ๊ธ€๋กœ ๋‹ต๋ณ€ํ•  ๊ฒƒ. ๋„ˆ๋Š” ์—…๋กœ๋“œ๋œ ๋ฐ์ดํ„ฐ๋ฅผ ๊ธฐ๋ฐ˜์œผ๋กœ ์งˆ๋ฌธ์— ๋‹ต๋ณ€ํ•˜๋Š” ์—ญํ• ์„ ํ•œ๋‹ค.
37
 
38
  ์ฃผ์š” ์ง€์นจ:
@@ -50,35 +58,36 @@ def respond(message: str, history: List[Dict[str, str]], system_message: str = "
50
  system_prefix += f"\n\n๋ฐ์ดํ„ฐ ์š”์•ฝ:\n{data_summary}"
51
  except Exception as e:
52
  print(f"๋ฐ์ดํ„ฐ ๋กœ๋“œ ์˜ค๋ฅ˜: {str(e)}")
 
 
 
53
 
54
  # ์ตœ๊ทผ ๋Œ€ํ™” ์ปจํ…์ŠคํŠธ๋งŒ ์œ ์ง€
55
  recent_history = history[-3:] if history else []
56
-
57
- prompt = system_prefix + "\n\n"
58
  for chat in recent_history:
59
- if chat['role'] == 'user':
60
- prompt += f"์‚ฌ์šฉ์ž: {chat['content']}\n"
61
- else:
62
- prompt += f"AI: {chat['content']}\n"
63
- prompt += f"์‚ฌ์šฉ์ž: {message}\nAI:"
64
 
65
  try:
66
- response = ""
67
- stream = hf_client.text_generation(
68
- prompt=prompt,
69
- max_new_tokens=max_tokens,
70
- stream=True,
71
  temperature=temperature,
72
  top_p=top_p,
73
- repetition_penalty=1.2
74
  )
75
-
76
- for msg in stream:
77
- if msg:
78
- response += msg
 
79
  # ์‘๋‹ต ์ •์ œ
80
- cleaned_response = clean_response(response)
81
  yield cleaned_response
 
82
  except Exception as e:
83
  error_message = f"์ถ”๋ก  ์˜ค๋ฅ˜: {str(e)}"
84
  print(error_message)
@@ -216,6 +225,7 @@ def text_to_parquet(text: str) -> Tuple[str, str, str]:
216
  print(f"{error_message}\n{traceback.format_exc()}")
217
  return error_message, "", ""
218
 
 
219
  def preprocess_text_with_llm(input_text: str) -> str:
220
  if not input_text.strip():
221
  return "์ž…๋ ฅ ํ…์ŠคํŠธ๊ฐ€ ๋น„์–ด์žˆ์Šต๋‹ˆ๋‹ค."
@@ -232,56 +242,78 @@ def preprocess_text_with_llm(input_text: str) -> str:
232
  - Technology (๊ธฐ์ˆ )
233
  - Politics (์ •์น˜)
234
  - Culture (๋ฌธํ™”)
235
- 5. metadata: ๋‚ ์งœ, ์ถœ์ฒ˜ ๋“ฑ ์ถ”๊ฐ€ ์ •๋ณด
236
 
237
- ์ค‘์š”:
238
- - ๋™์ผํ•œ ํ…์ŠคํŠธ๋ฅผ ๋ฐ˜๋ณตํ•ด์„œ ์ถœ๋ ฅํ•˜์ง€ ๋ง ๊ฒƒ
239
- - ๊ฐ ํ…์ŠคํŠธ๋Š” ํ•œ ๋ฒˆ๋งŒ ์ฒ˜๋ฆฌํ•˜์—ฌ ๊ฐ€์žฅ ์ ํ•ฉํ•œ label์„ ์„ ํƒํ•  ๊ฒƒ
240
- - ์ž…๋ ฅ ํ…์ŠคํŠธ๋ฅผ ์˜๋ฏธ ๋‹จ์œ„๋กœ ์ ์ ˆํžˆ ๋ถ„๋ฆฌํ•  ๊ฒƒ
 
 
 
 
 
 
 
241
 
242
- ์˜ˆ์‹œ:
243
- 1,"์ด์ˆœ์‹ ์€ ์กฐ์„  ์ค‘๊ธฐ์˜ ๋ฌด์‹ ์ด๋‹ค.","Historical_Figure","์กฐ์„ ์‹œ๋Œ€, ์œ„ํ‚ค๋ฐฑ๊ณผ"
 
 
244
 
245
- ์ฃผ์˜์‚ฌํ•ญ:
246
- - text์— ์‰ผํ‘œ๊ฐ€ ์žˆ์œผ๋ฉด ํฐ๋”ฐ์˜ดํ‘œ๋กœ ๊ฐ์‹ธ๊ธฐ
247
- - ํฐ๋”ฐ์˜ดํ‘œ๋Š” ๋ฐฑ์Šฌ๋ž˜์‹œ๋กœ ์ด์Šค์ผ€์ดํ”„ ์ฒ˜๋ฆฌ
248
- - ๊ฐ ํ–‰์€ ์ƒˆ๋กœ์šด ์ค„๋กœ ๊ตฌ๋ถ„
249
- - ๋ถˆํ•„์š”ํ•œ ๋ฐ˜๋ณต ์ถœ๋ ฅ ๊ธˆ์ง€"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
 
251
- full_prompt = f"{system_prompt}\n\n์ž…๋ ฅํ…์ŠคํŠธ:\n{input_text}\n\n์ถœ๋ ฅ:"
 
 
 
 
 
 
 
 
 
 
252
 
253
  try:
254
- response = ""
255
- stream = hf_client.text_generation(
256
- prompt=full_prompt,
257
- max_new_tokens=4000,
258
- temperature=0.1, # ๋” ๊ฒฐ์ •์ ์ธ ์ถœ๋ ฅ์„ ์œ„ํ•ด ๋‚ฎ์ถค
259
- top_p=0.9,
260
- stream=True,
 
 
261
  )
262
-
263
- for msg in stream:
264
- if msg:
265
- response += msg
266
-
267
- # <EOS_TOKEN> ์ด์ „๊นŒ์ง€๋งŒ ์ถ”์ถœํ•˜๊ณ  ์ •์ œ
268
- if "<EOS_TOKEN>" in response:
269
- processed_text = response.split("<EOS_TOKEN>")[0].strip()
270
- else:
271
- processed_text = response.strip()
272
-
273
- # ์ค‘๋ณต ์ถœ๋ ฅ ์ œ๊ฑฐ
274
- lines = processed_text.split('\n')
275
- unique_lines = []
276
- seen_texts = set()
277
-
278
- for line in lines:
279
- line = line.strip()
280
- if line and '์ถœ๋ ฅ:' not in line and line not in seen_texts:
281
- unique_lines.append(line)
282
- seen_texts.add(line)
283
-
284
- processed_text = '\n'.join(unique_lines)
285
 
286
  # CSV ํ˜•์‹ ๊ฒ€์ฆ
287
  try:
@@ -599,3 +631,5 @@ with gr.Blocks(css=css) as demo:
599
  if __name__ == "__main__":
600
  demo.launch(share=True)
601
 
 
 
 
7
  import io
8
  import traceback
9
  import csv
10
+ # HuggingFace ํด๋ผ์ด์–ธํŠธ ๋Œ€์‹  OpenAI ํด๋ผ์ด์–ธํŠธ ์‚ฌ์šฉ
11
+ from openai import OpenAI
12
+ import os
13
 
14
  # ์ถ”๋ก  API ํด๋ผ์ด์–ธํŠธ ์„ค์ •
15
  hf_client = InferenceClient(
 
34
  except Exception as e:
35
  return f"ํŒŒ์ผ์„ ์ฝ๋Š” ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}"
36
 
37
+
38
+ # OpenAI ํด๋ผ์ด์–ธํŠธ ์„ค์ •
39
+ client = OpenAI(api_key=os.getenv("OPEN_AI"))
40
+
41
+ # respond ํ•จ์ˆ˜ ์ˆ˜์ •
42
  def respond(message: str, history: List[Dict[str, str]], system_message: str = "", max_tokens: int = 4000, temperature: float = 0.5, top_p: float = 0.9, parquet_data: str = None) -> str:
43
+ # ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ ์„ค์ •
44
  system_prefix = """๋ฐ˜๋“œ์‹œ ํ•œ๊ธ€๋กœ ๋‹ต๋ณ€ํ•  ๊ฒƒ. ๋„ˆ๋Š” ์—…๋กœ๋“œ๋œ ๋ฐ์ดํ„ฐ๋ฅผ ๊ธฐ๋ฐ˜์œผ๋กœ ์งˆ๋ฌธ์— ๋‹ต๋ณ€ํ•˜๋Š” ์—ญํ• ์„ ํ•œ๋‹ค.
45
 
46
  ์ฃผ์š” ์ง€์นจ:
 
58
  system_prefix += f"\n\n๋ฐ์ดํ„ฐ ์š”์•ฝ:\n{data_summary}"
59
  except Exception as e:
60
  print(f"๋ฐ์ดํ„ฐ ๋กœ๋“œ ์˜ค๋ฅ˜: {str(e)}")
61
+
62
+ # ๋Œ€ํ™” ํžˆ์Šคํ† ๋ฆฌ ๊ตฌ์„ฑ
63
+ messages = [{"role": "system", "content": system_prefix}]
64
 
65
  # ์ตœ๊ทผ ๋Œ€ํ™” ์ปจํ…์ŠคํŠธ๋งŒ ์œ ์ง€
66
  recent_history = history[-3:] if history else []
 
 
67
  for chat in recent_history:
68
+ messages.append({"role": chat["role"], "content": chat["content"]})
69
+
70
+ messages.append({"role": "user", "content": message})
 
 
71
 
72
  try:
73
+ # OpenAI API ํ˜ธ์ถœ
74
+ response = client.chat.completions.create(
75
+ model="gpt-4-0125-preview", # GPT-4-mini ๋ชจ๋ธ ์‚ฌ์šฉ
76
+ messages=messages,
77
+ max_tokens=max_tokens,
78
  temperature=temperature,
79
  top_p=top_p,
80
+ stream=True
81
  )
82
+
83
+ full_response = ""
84
+ for chunk in response:
85
+ if chunk.choices[0].delta.content:
86
+ full_response += chunk.choices[0].delta.content
87
  # ์‘๋‹ต ์ •์ œ
88
+ cleaned_response = clean_response(full_response)
89
  yield cleaned_response
90
+
91
  except Exception as e:
92
  error_message = f"์ถ”๋ก  ์˜ค๋ฅ˜: {str(e)}"
93
  print(error_message)
 
225
  print(f"{error_message}\n{traceback.format_exc()}")
226
  return error_message, "", ""
227
 
228
+ # preprocess_text_with_llm ํ•จ์ˆ˜๋„ ์ˆ˜์ •
229
  def preprocess_text_with_llm(input_text: str) -> str:
230
  if not input_text.strip():
231
  return "์ž…๋ ฅ ํ…์ŠคํŠธ๊ฐ€ ๋น„์–ด์žˆ์Šต๋‹ˆ๋‹ค."
 
242
  - Technology (๊ธฐ์ˆ )
243
  - Politics (์ •์น˜)
244
  - Culture (๋ฌธํ™”)
245
+ 5. metadata: ๋‚ ์งœ, ์ถœ์ฒ˜ ๋“ฑ ์ถ”๊ฐ€ ์ •๋ณด"""
246
 
247
+ try:
248
+ response = client.chat.completions.create(
249
+ model="gpt-4-0125-preview",
250
+ messages=[
251
+ {"role": "system", "content": system_prompt},
252
+ {"role": "user", "content": input_text}
253
+ ],
254
+ max_tokens=4000,
255
+ temperature=0.1,
256
+ stream=True
257
+ )
258
 
259
+ full_response = ""
260
+ for chunk in response:
261
+ if chunk.choices[0].delta.content:
262
+ full_response += chunk.choices[0].delta.content
263
 
264
+ # ์‘๋‹ต ์ •์ œ
265
+ processed_text = clean_response(full_response)
266
+
267
+ # CSV ํ˜•์‹ ๊ฒ€์ฆ
268
+ try:
269
+ from io import StringIO
270
+ import csv
271
+ csv.reader(StringIO(processed_text))
272
+ return processed_text
273
+ except csv.Error:
274
+ return "LLM์ด ์˜ฌ๋ฐ”๋ฅธ CSV ํ˜•์‹์„ ์ƒ์„ฑํ•˜์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค. ๋‹ค์‹œ ์‹œ๋„ํ•ด์ฃผ์„ธ์š”."
275
+
276
+ except Exception as e:
277
+ error_message = f"์ „์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}"
278
+ print(error_message)
279
+ return error_message# preprocess_text_with_llm ํ•จ์ˆ˜๋„ ์ˆ˜์ •
280
+ def preprocess_text_with_llm(input_text: str) -> str:
281
+ if not input_text.strip():
282
+ return "์ž…๋ ฅ ํ…์ŠคํŠธ๊ฐ€ ๋น„์–ด์žˆ์Šต๋‹ˆ๋‹ค."
283
+
284
+ system_prompt = """๋ฐ˜๋“œ์‹œ ํ•œ๊ธ€(ํ•œ๊ตญ์–ด)๋กœ ๋‹ต๋ณ€ํ•˜์‹œ์˜ค. ๋‹น์‹ ์€ ๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ ์ „๋ฌธ๊ฐ€์ž…๋‹ˆ๋‹ค. ์ž…๋ ฅ๋œ ํ…์ŠคํŠธ๋ฅผ CSV ๋ฐ์ดํ„ฐ์…‹ ํ˜•์‹์œผ๋กœ ๋ณ€ํ™˜ํ•˜์„ธ์š”.
285
 
286
+ ๊ทœ์น™:
287
+ 1. ์ถœ๋ ฅ ํ˜•์‹: id,text,label,metadata
288
+ 2. id: 1๋ถ€ํ„ฐ ์‹œ์ž‘ํ•˜๋Š” ์ˆœ์ฐจ์  ๋ฒˆํ˜ธ
289
+ 3. text: ์˜๋ฏธ ์žˆ๋Š” ๋‹จ์œ„๋กœ ๋ถ„๋ฆฌ๋œ ํ…์ŠคํŠธ
290
+ 4. label: ํ…์ŠคํŠธ์˜ ์ฃผ์ œ๋‚˜ ์นดํ…Œ๊ณ ๋ฆฌ๋ฅผ ์•„๋ž˜ ๊ธฐ์ค€์œผ๋กœ ์ •ํ™•ํ•˜๊ฒŒ ํ•œ ๊ฐœ๋งŒ ์„ ํƒ
291
+ - Historical_Figure (์—ญ์‚ฌ์  ์ธ๋ฌผ)
292
+ - Military_History (๊ตฐ์‚ฌ ์—ญ์‚ฌ)
293
+ - Technology (๊ธฐ์ˆ )
294
+ - Politics (์ •์น˜)
295
+ - Culture (๋ฌธํ™”)
296
+ 5. metadata: ๋‚ ์งœ, ์ถœ์ฒ˜ ๋“ฑ ์ถ”๊ฐ€ ์ •๋ณด"""
297
 
298
  try:
299
+ response = client.chat.completions.create(
300
+ model="gpt-4o-mini",
301
+ messages=[
302
+ {"role": "system", "content": system_prompt},
303
+ {"role": "user", "content": input_text}
304
+ ],
305
+ max_tokens=4000,
306
+ temperature=0.1,
307
+ stream=True
308
  )
309
+
310
+ full_response = ""
311
+ for chunk in response:
312
+ if chunk.choices[0].delta.content:
313
+ full_response += chunk.choices[0].delta.content
314
+
315
+ # ์‘๋‹ต ์ •์ œ
316
+ processed_text = clean_response(full_response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
 
318
  # CSV ํ˜•์‹ ๊ฒ€์ฆ
319
  try:
 
631
  if __name__ == "__main__":
632
  demo.launch(share=True)
633
 
634
+
635
+ llm ๋ชจ๋ธ ๋ณ€๊ฒฝํ•˜๋ผ. openai api๋ฅผ ์ด์šฉํ•˜๊ณ  ๋ชจ๋ธ์€ gpt-4o-mini๋กœ ์„ค์ •ํ•˜๋ผ. apiํ‚ค๋Š” os.getenv("OPEN_AI")๋ฅผ ์ด์šฉํ•˜๋ผ