Update app.py
Browse files
app.py
CHANGED
@@ -13,6 +13,107 @@ hf_client = InferenceClient(
|
|
13 |
"CohereForAI/c4ai-command-r-plus-08-2024", token=os.getenv("HF_TOKEN")
|
14 |
)
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
def load_code(filename: str) -> str:
|
17 |
try:
|
18 |
with open(filename, 'r', encoding='utf-8') as file:
|
|
|
13 |
"CohereForAI/c4ai-command-r-plus-08-2024", token=os.getenv("HF_TOKEN")
|
14 |
)
|
15 |
|
16 |
+
from functools import lru_cache
|
17 |
+
from concurrent.futures import ThreadPoolExecutor
|
18 |
+
import math
|
19 |
+
|
20 |
+
def chunk_text(text: str, chunk_size: int = 500) -> List[str]:
    """Split text into sentence-aligned chunks of roughly ``chunk_size`` characters.

    The text is cut on ``'.'``; each non-empty fragment is stripped and gets
    its period re-appended, then fragments are greedily packed (joined by a
    single space) into chunks.

    Args:
        text: Text to split.
        chunk_size: Maximum chunk length in characters, counting the joining
            spaces. A single sentence longer than this is still emitted
            whole as its own chunk.

    Returns:
        The chunks in original order; an empty list when ``text`` contains
        no sentence content.

    Note:
        Splitting is purely on ``'.'``, so decimals and abbreviations are
        treated as sentence boundaries, and a final fragment without a
        trailing period has one appended.
    """
    chunks = []
    current_chunk = []
    current_length = 0

    for fragment in text.split('.'):
        fragment = fragment.strip()
        # Skip the empty pieces produced by a trailing or doubled period;
        # otherwise they would come back as stray "." sentences.
        if not fragment:
            continue
        sentence = fragment + '.'

        # Count the space that ' '.join() inserts between sentences so the
        # finished chunk really fits within chunk_size.
        projected = current_length + len(sentence) + (1 if current_chunk else 0)

        if current_chunk and projected > chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_length = len(sentence)
        else:
            current_chunk.append(sentence)
            current_length = projected

    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks
|
41 |
+
|
42 |
+
@lru_cache(maxsize=100)
def cached_preprocess(text: str) -> str:
    """Memoized wrapper around ``preprocess_single_chunk``.

    Up to 100 distinct inputs are kept in an LRU cache, so repeated text is
    not sent through preprocessing again.

    Args:
        text: Chunk of text to preprocess; it is also the cache key.

    Returns:
        Whatever ``preprocess_single_chunk`` returns for ``text``.

    Note:
        Every returned string is cached, including the error-message string
        that ``preprocess_single_chunk`` returns when its call fails.
    """
    result = preprocess_single_chunk(text)
    return result
|
46 |
+
|
47 |
+
def preprocess_single_chunk(chunk: str) -> str:
    """Ask the hosted LLM to convert one text chunk into CSV rows.

    Args:
        chunk: Text fragment to convert.

    Returns:
        The model response with surrounding whitespace stripped. If the
        request raises, the exception is not propagated; a Korean error
        message starting with "청크 처리 중 오류 발생:" is returned instead.
    """
    # NOTE(review): these literals were mis-encoded (UTF-8 Korean read as
    # TIS-620) and split mid-string; they are restored here as proper
    # Unicode. Confirm the wording against the original file.
    # NOTE(review): "[기존 규칙 동일]" ("existing rules unchanged") is a
    # placeholder - the actual conversion rules are missing from the prompt.
    system_prompt = (
        "당신은 데이터 전처리 전문가입니다. "
        "입력된 텍스트를 CSV 데이터셋 형식으로 빠르게 변환하세요.\n"
        "[기존 규칙 동일]"
    )

    full_prompt = f"{system_prompt}\n\n입력텍스트:\n{chunk}\n\n출력:"

    try:
        # Non-streaming call with tuned sampling parameters.
        response = hf_client.text_generation(
            prompt=full_prompt,
            max_new_tokens=2000,  # cap on the number of generated tokens
            temperature=0.1,  # more deterministic output
            top_p=0.5,  # more focused output
            stream=False,  # streaming disabled
        )
        return response.strip()
    except Exception as e:
        # Best-effort: report the failure as text so the caller keeps going.
        return f"청크 처리 중 오류 발생: {e}"
|
67 |
+
|
68 |
+
def preprocess_text_with_llm(input_text: str) -> str:
|
69 |
+
if not input_text.strip():
|
70 |
+
return "์
๋ ฅ ํ
์คํธ๊ฐ ๋น์ด์์ต๋๋ค."
|
71 |
+
|
72 |
+
try:
|
73 |
+
# ํ
์คํธ๋ฅผ ์ฒญํฌ๋ก ๋ถํ
|
74 |
+
chunks = chunk_text(input_text)
|
75 |
+
|
76 |
+
# ๋ณ๋ ฌ ์ฒ๋ฆฌ๋ก ์ฒญํฌ๋ค์ ์ฒ๋ฆฌ
|
77 |
+
with ThreadPoolExecutor(max_workers=3) as executor:
|
78 |
+
processed_chunks = list(executor.map(cached_preprocess, chunks))
|
79 |
+
|
80 |
+
# ๊ฒฐ๊ณผ ๋ณํฉ ๋ฐ ์ค๋ณต ์ ๊ฑฐ
|
81 |
+
all_lines = []
|
82 |
+
seen_texts = set()
|
83 |
+
current_id = 1
|
84 |
+
|
85 |
+
for chunk_result in processed_chunks:
|
86 |
+
lines = chunk_result.split('\n')
|
87 |
+
for line in lines:
|
88 |
+
line = line.strip()
|
89 |
+
if line and '์ถ๋ ฅ:' not in line and line not in seen_texts:
|
90 |
+
# ID ์ฌํ ๋น
|
91 |
+
parts = line.split(',', 1)
|
92 |
+
if len(parts) > 1:
|
93 |
+
new_line = f"{current_id},{parts[1]}"
|
94 |
+
all_lines.append(new_line)
|
95 |
+
seen_texts.add(new_line)
|
96 |
+
current_id += 1
|
97 |
+
|
98 |
+
processed_text = '\n'.join(all_lines)
|
99 |
+
|
100 |
+
# CSV ํ์ ๊ฒ์ฆ
|
101 |
+
try:
|
102 |
+
from io import StringIO
|
103 |
+
import csv
|
104 |
+
csv.reader(StringIO(processed_text))
|
105 |
+
return processed_text
|
106 |
+
except csv.Error:
|
107 |
+
return "LLM์ด ์ฌ๋ฐ๋ฅธ CSV ํ์์ ์์ฑํ์ง ๋ชปํ์ต๋๋ค. ๋ค์ ์๋ํด์ฃผ์ธ์."
|
108 |
+
|
109 |
+
except Exception as e:
|
110 |
+
error_message = f"์ ์ฒ๋ฆฌ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}"
|
111 |
+
print(error_message)
|
112 |
+
return error_message
|
113 |
+
|
114 |
+
|
115 |
+
|
116 |
+
|
117 |
def load_code(filename: str) -> str:
|
118 |
try:
|
119 |
with open(filename, 'r', encoding='utf-8') as file:
|