ginipick committed on
Commit
6998afd
โ€ข
1 Parent(s): b3bb461

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -0
app.py CHANGED
@@ -13,6 +13,107 @@ hf_client = InferenceClient(
13
  "CohereForAI/c4ai-command-r-plus-08-2024", token=os.getenv("HF_TOKEN")
14
  )
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  def load_code(filename: str) -> str:
17
  try:
18
  with open(filename, 'r', encoding='utf-8') as file:
 
13
  "CohereForAI/c4ai-command-r-plus-08-2024", token=os.getenv("HF_TOKEN")
14
  )
15
 
16
+ from functools import lru_cache
17
+ from concurrent.futures import ThreadPoolExecutor
18
+ import math
19
+
20
def chunk_text(text: str, chunk_size: int = 500) -> List[str]:
    """Split text into chunks of at most ~chunk_size characters.

    The text is split on '.' sentence boundaries and sentences are packed
    greedily into chunks. A single sentence longer than chunk_size becomes
    its own (oversized) chunk. Note the length accounting does not include
    the joining spaces, so a chunk may slightly exceed chunk_size.

    Args:
        text: Raw input text.
        chunk_size: Soft upper bound on chunk length, in characters.

    Returns:
        List of chunk strings; empty list for empty/whitespace-only input.
    """
    chunks: List[str] = []
    current_chunk: List[str] = []
    current_length = 0

    for sentence in text.split('.'):
        sentence = sentence.strip()
        if not sentence:
            # Skip empty fragments (e.g. the fragment after a trailing '.');
            # the previous version turned these into spurious "." sentences.
            continue
        sentence += '.'
        if current_length + len(sentence) > chunk_size:
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_length = len(sentence)
        else:
            current_chunk.append(sentence)
            current_length += len(sentence)

    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks
41
+
42
def cached_preprocess(text: str) -> str:
    """Memoized front-end for preprocess_single_chunk.

    Results for up to 100 distinct inputs are cached so repeated chunks
    are not re-sent to the model.
    """
    return preprocess_single_chunk(text)


# Apply the LRU cache explicitly (equivalent to using it as a decorator).
cached_preprocess = lru_cache(maxsize=100)(cached_preprocess)
46
+
47
def preprocess_single_chunk(chunk: str) -> str:
    """Run the LLM preprocessing prompt on one text chunk.

    Sends a system prompt plus the chunk to the inference client and
    returns the model output stripped of surrounding whitespace, or a
    (Korean) error-message string if the call fails.
    """
    # System prompt (Korean): "You are a data preprocessing expert.
    # Quickly convert the input text into CSV dataset format.
    # [same rules as before]"
    system_prompt = """당신은 데이터 전처리 전문가입니다. 입력된 텍스트를 CSV 데이터셋 형식으로 빠르게 변환하세요.
[기존 규칙 동일]"""

    # Assemble the prompt: system prompt, labelled input text, output label.
    full_prompt = "\n\n".join([system_prompt, f"입력텍스트:\n{chunk}", "출력:"])

    try:
        # Non-streaming call; low temperature/top_p for deterministic,
        # focused CSV output, with a 2000-token cap on the response.
        result = hf_client.text_generation(
            prompt=full_prompt,
            max_new_tokens=2000,
            temperature=0.1,
            top_p=0.5,
            stream=False,
        )
        return result.strip()
    except Exception as err:
        # Best-effort boundary: report the failure as a string rather
        # than propagating (Korean: "error occurred while processing chunk").
        return f"청크 처리 중 오류 발생: {str(err)}"
67
+
68
def preprocess_text_with_llm(input_text: str) -> str:
    """Preprocess raw text into a deduplicated CSV dataset via the LLM.

    Pipeline: chunk the text on sentence boundaries, process chunks in
    parallel through the cached LLM call, merge the per-chunk CSV lines
    while dropping prompt-echo lines and duplicate rows, renumber the ID
    column from 1, and validate the merged result as CSV.

    Args:
        input_text: Raw input text.

    Returns:
        The merged CSV text, or a (Korean) error-message string on empty
        input, invalid CSV output, or any processing failure.
    """
    if not input_text.strip():
        return "입력 텍스트가 비어있습니다."

    try:
        # Split into ~500-char chunks on '.' boundaries.
        chunks = chunk_text(input_text)

        # Fan out to the LLM; cached_preprocess short-circuits repeats.
        with ThreadPoolExecutor(max_workers=3) as executor:
            processed_chunks = list(executor.map(cached_preprocess, chunks))

        # Merge results: skip blanks and prompt-echo lines ('출력:'),
        # dedupe on the row CONTENT after the first comma (the old code
        # compared the raw line against already-renumbered lines, so
        # duplicates were never actually removed), and reassign IDs.
        all_lines = []
        seen_texts = set()
        current_id = 1

        for chunk_result in processed_chunks:
            for line in chunk_result.split('\n'):
                line = line.strip()
                if not line or '출력:' in line:
                    continue
                parts = line.split(',', 1)
                if len(parts) < 2:
                    continue
                content = parts[1]
                if content in seen_texts:
                    continue
                seen_texts.add(content)
                all_lines.append(f"{current_id},{content}")
                current_id += 1

        processed_text = '\n'.join(all_lines)

        # Validate the merged output actually parses as CSV. csv.reader is
        # lazy — the old code only constructed it, so csv.Error could never
        # fire; consuming the reader forces the parse.
        try:
            from io import StringIO
            import csv
            list(csv.reader(StringIO(processed_text)))
            return processed_text
        except csv.Error:
            return "LLM이 올바른 CSV 형식을 생성하지 못했습니다. 다시 시도해주세요."

    except Exception as e:
        # Top-level boundary for the Gradio handler: log and return the
        # error message instead of raising.
        error_message = f"전처리 중 오류가 발생했습니다: {str(e)}"
        print(error_message)
        return error_message
113
+
114
+
115
+
116
+
117
  def load_code(filename: str) -> str:
118
  try:
119
  with open(filename, 'r', encoding='utf-8') as file: