Update app.py
Browse files
app.py
CHANGED
@@ -5,7 +5,9 @@ import pandas as pd
|
|
5 |
from typing import List, Dict, Tuple
|
6 |
|
7 |
# 추론 API 클라이언트 설정
|
8 |
-
hf_client = InferenceClient(
|
|
|
|
|
9 |
|
10 |
def load_code(filename: str) -> str:
|
11 |
try:
|
@@ -42,13 +44,11 @@ def respond(
|
|
42 |
# μμ€ν
ν둬ννΈ μ€μ
|
43 |
system_prefix = """λ°λμ νκΈλ‘ λ΅λ³ν κ². λλ μ£Όμ΄μ§ μμ€μ½λλ₯Ό κΈ°λ°μΌλ‘ "μλΉμ€ μ¬μ© μ€λͺ
λ° μλ΄, Q&Aλ₯Ό νλ μν μ΄λ€". μμ£Ό μΉμ νκ³ μμΈνκ² Markdown νμμΌλ‘ μμ±νλΌ. λλ μ½λλ₯Ό κΈ°λ°μΌλ‘ μ¬μ© μ€λͺ
λ° μ§μ μλ΅μ μ§ννλ©°, μ΄μ©μμκ² λμμ μ£Όμ΄μΌ νλ€. μ΄μ©μκ° κΆκΈν΄ν λ§ν λ΄μ©μ μΉμ νκ² μλ €μ£Όλλ‘ νλΌ. μ½λ μ 체 λ΄μ©μ λν΄μλ 보μμ μ μ§νκ³ , ν€ κ° λ° μλν¬μΈνΈμ ꡬ체μ μΈ λͺ¨λΈμ 곡κ°νμ§ λ§λΌ."""
|
44 |
|
45 |
-
|
46 |
# Parquet λ°μ΄ν° ν¬ν¨
|
47 |
if parquet_data:
|
48 |
df = pd.read_json(parquet_data)
|
49 |
parquet_content = df.head(10).to_markdown(index=False)
|
50 |
system_prefix += f"\n\nμ
λ‘λλ Parquet νμΌ λ΄μ©:\n```markdown\n{parquet_content}\n```"
|
51 |
-
message = "μ
λ‘λλ Parquet νμΌμ λν λ΄μ©μ νμ΅νμμ΅λλ€. κ΄λ ¨νμ¬ κΆκΈν μ μ΄ μμΌλ©΄ λ¬Όμ΄λ³΄μΈμ."
|
52 |
|
53 |
# μμ€ν
λ©μμ§μ λν κΈ°λ‘ κ²°ν©
|
54 |
messages = [{"role": "system", "content": system_prefix}]
|
@@ -77,26 +77,20 @@ def upload_csv(file_path: str) -> Tuple[str, str]:
|
|
77 |
try:
|
78 |
# CSV νμΌ μ½κΈ°
|
79 |
df = pd.read_csv(file_path, sep=',')
|
80 |
-
|
81 |
# νμ μ»¬λΌ νμΈ
|
82 |
required_columns = {'id', 'text', 'label', 'metadata'}
|
83 |
available_columns = set(df.columns)
|
84 |
missing_columns = required_columns - available_columns
|
85 |
-
|
86 |
if missing_columns:
|
87 |
return f"CSV νμΌμ λ€μ νμ 컬λΌμ΄ λλ½λμμ΅λλ€: {', '.join(missing_columns)}", ""
|
88 |
-
|
89 |
# λ°μ΄ν° ν΄λ μ§
|
90 |
df.drop_duplicates(inplace=True)
|
91 |
df.fillna('', inplace=True)
|
92 |
-
|
93 |
# λ°μ΄ν° μ ν μ΅μ ν
|
94 |
df = df.astype({'id': 'int32', 'text': 'string', 'label': 'category', 'metadata': 'string'})
|
95 |
-
|
96 |
# Parquet νμΌλ‘ λ³ν
|
97 |
parquet_filename = os.path.splitext(os.path.basename(file_path))[0] + '.parquet'
|
98 |
df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
|
99 |
-
|
100 |
return f"{parquet_filename} νμΌμ΄ μ±κ³΅μ μΌλ‘ μ
λ‘λλκ³ λ³νλμμ΅λλ€.", parquet_filename
|
101 |
except Exception as e:
|
102 |
return f"CSV νμΌ μ
λ‘λ λ° λ³ν μ€ μ€λ₯κ° λ°μνμ΅λλ€: {str(e)}", ""
|
@@ -105,13 +99,10 @@ def upload_parquet(file_path: str) -> Tuple[str, str, str]:
|
|
105 |
try:
|
106 |
# Parquet νμΌ μ½κΈ°
|
107 |
df = pd.read_parquet(file_path, engine='pyarrow')
|
108 |
-
|
109 |
# MarkdownμΌλ‘ λ³ννμ¬ λ―Έλ¦¬λ³΄κΈ°
|
110 |
parquet_content = df.head(10).to_markdown(index=False)
|
111 |
-
|
112 |
# DataFrameμ JSONμΌλ‘ λ³ν
|
113 |
parquet_json = df.to_json()
|
114 |
-
|
115 |
return "Parquet νμΌμ΄ μ±κ³΅μ μΌλ‘ μ
λ‘λλμμ΅λλ€.", parquet_content, parquet_json
|
116 |
except Exception as e:
|
117 |
return f"Parquet νμΌ μ
λ‘λ μ€ μ€λ₯κ° λ°μνμ΅λλ€: {str(e)}", "", ""
|
@@ -121,17 +112,13 @@ def text_to_parquet(text: str) -> Tuple[str, str, str]:
|
|
121 |
# ν
μ€νΈλ₯Ό DataFrameμΌλ‘ λ³ν (κ° νμ μ½€λ§λ‘ ꡬλΆ)
|
122 |
data = [line.split(',') for line in text.strip().split('\n')]
|
123 |
df = pd.DataFrame(data, columns=['id', 'text', 'label', 'metadata'])
|
124 |
-
|
125 |
# λ°μ΄ν° μ ν μ΅μ ν
|
126 |
df = df.astype({'id': 'int32', 'text': 'string', 'label': 'category', 'metadata': 'string'})
|
127 |
-
|
128 |
# Parquet νμΌλ‘ λ³ν
|
129 |
parquet_filename = 'text_to_parquet.parquet'
|
130 |
df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
|
131 |
-
|
132 |
# Parquet νμΌ λ΄μ© 미리보기
|
133 |
parquet_content = load_parquet(parquet_filename)
|
134 |
-
|
135 |
return f"{parquet_filename} νμΌμ΄ μ±κ³΅μ μΌλ‘ λ³νλμμ΅λλ€.", parquet_content, parquet_filename
|
136 |
except Exception as e:
|
137 |
return f"ν
μ€νΈ λ³ν μ€ μ€λ₯κ° λ°μνμ΅λλ€: {str(e)}", "", ""
|
@@ -142,7 +129,7 @@ footer {
|
|
142 |
visibility: hidden;
|
143 |
}
|
144 |
#chatbot-container, #chatbot-data-upload {
|
145 |
-
height:
|
146 |
overflow-y: scroll;
|
147 |
}
|
148 |
#chatbot-container .message, #chatbot-data-upload .message {
|
@@ -153,21 +140,35 @@ textarea, input[type="text"] {
|
|
153 |
background-color: #ffffff; /* ν°μ λ°°κ²½ */
|
154 |
color: #000000; /* κ²μ μ κΈμ */
|
155 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
"""
|
157 |
|
158 |
# Gradio Blocks 인터페이스 설정
|
159 |
with gr.Blocks(css=css) as demo:
|
160 |
-
gr.Markdown("# My RAG: LLMμ΄ λλ§μ λ°μ΄ν°λ‘ νμ΅ν μ½ν
μΈ μμ±/λ΅λ³")
|
161 |
-
gr.Markdown(
|
162 |
-
|
163 |
-
|
164 |
-
|
|
|
|
|
|
|
165 |
# 첫 λ²μ§Έ ν: μ±λ΄ λ°μ΄ν° μ
λ‘λ (ν μ΄λ¦ λ³κ²½: "My λ°μ΄ν°μ
+LLM")
|
166 |
with gr.Tab("My λ°μ΄ν°μ
+LLM"):
|
167 |
gr.Markdown("### Parquet νμΌ μ
λ‘λ λ° μ§λ¬ΈνκΈ°")
|
168 |
with gr.Row():
|
169 |
with gr.Column():
|
170 |
-
parquet_upload = gr.File(
|
|
|
|
|
171 |
parquet_upload_button = gr.Button("μ
λ‘λ")
|
172 |
parquet_upload_status = gr.Textbox(label="μ
λ‘λ μν", interactive=False)
|
173 |
parquet_preview_chat = gr.Markdown(label="Parquet νμΌ λ―Έλ¦¬λ³΄κΈ°")
|
@@ -197,33 +198,65 @@ with gr.Blocks(css=css) as demo:
|
|
197 |
temperature = gr.Slider(minimum=0, maximum=1, value=0.7, label="Temperature")
|
198 |
top_p = gr.Slider(minimum=0, maximum=1, value=0.9, label="Top P")
|
199 |
|
200 |
-
def handle_message_data_upload(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
201 |
history = history or []
|
202 |
-
history.append({"role": "user", "content": message})
|
203 |
try:
|
204 |
# μλ΅ μμ±
|
205 |
-
response_gen = respond(
|
|
|
|
|
206 |
partial_response = ""
|
207 |
for partial in response_gen:
|
208 |
partial_response = partial
|
209 |
-
#
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
yield
|
|
|
|
|
|
|
215 |
except Exception as e:
|
216 |
response = f"μΆλ‘ μ€ μ€λ₯κ° λ°μνμ΅λλ€: {str(e)}"
|
|
|
217 |
history.append({"role": "assistant", "content": response})
|
218 |
yield history, ""
|
219 |
|
220 |
send_data_upload.click(
|
221 |
handle_message_data_upload,
|
222 |
-
inputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
223 |
outputs=[chatbot_data_upload, msg_data_upload],
|
224 |
queue=True
|
225 |
)
|
226 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
227 |
# λ λ²μ§Έ ν: λ°μ΄ν° λ³ν (ν μ΄λ¦ λ³κ²½: "CSV to My λ°μ΄ν°μ
")
|
228 |
with gr.Tab("CSV to My λ°μ΄ν°μ
"):
|
229 |
gr.Markdown("### CSV νμΌ μ
λ‘λ λ° Parquet λ³ν")
|
@@ -277,8 +310,8 @@ with gr.Blocks(css=css) as demo:
|
|
277 |
outputs=[convert_status, parquet_preview_convert, download_parquet_convert]
|
278 |
)
|
279 |
|
280 |
-
|
281 |
-
gr.Markdown("### [email protected]")
|
282 |
|
283 |
if __name__ == "__main__":
|
284 |
demo.launch()
|
|
|
|
5 |
from typing import List, Dict, Tuple
|
6 |
|
7 |
# Inference API client setup. The model id is hard-coded; the access token is
# read from the HF_TOKEN environment variable (os.getenv returns None when the
# variable is unset).
hf_client = InferenceClient(
    "CohereForAI/c4ai-command-r-plus-08-2024", token=os.getenv("HF_TOKEN")
)
|
11 |
|
12 |
def load_code(filename: str) -> str:
|
13 |
try:
|
|
|
44 |
# μμ€ν
ν둬ννΈ μ€μ
|
45 |
system_prefix = """λ°λμ νκΈλ‘ λ΅λ³ν κ². λλ μ£Όμ΄μ§ μμ€μ½λλ₯Ό κΈ°λ°μΌλ‘ "μλΉμ€ μ¬μ© μ€λͺ
λ° μλ΄, Q&Aλ₯Ό νλ μν μ΄λ€". μμ£Ό μΉμ νκ³ μμΈνκ² Markdown νμμΌλ‘ μμ±νλΌ. λλ μ½λλ₯Ό κΈ°λ°μΌλ‘ μ¬μ© μ€λͺ
λ° μ§μ μλ΅μ μ§ννλ©°, μ΄μ©μμκ² λμμ μ£Όμ΄μΌ νλ€. μ΄μ©μκ° κΆκΈν΄ν λ§ν λ΄μ©μ μΉμ νκ² μλ €μ£Όλλ‘ νλΌ. μ½λ μ 체 λ΄μ©μ λν΄μλ 보μμ μ μ§νκ³ , ν€ κ° λ° μλν¬μΈνΈμ ꡬ체μ μΈ λͺ¨λΈμ 곡κ°νμ§ λ§λΌ."""
|
46 |
|
|
|
47 |
# Parquet λ°μ΄ν° ν¬ν¨
|
48 |
if parquet_data:
|
49 |
df = pd.read_json(parquet_data)
|
50 |
parquet_content = df.head(10).to_markdown(index=False)
|
51 |
system_prefix += f"\n\nμ
λ‘λλ Parquet νμΌ λ΄μ©:\n```markdown\n{parquet_content}\n```"
|
|
|
52 |
|
53 |
# μμ€ν
λ©μμ§μ λν κΈ°λ‘ κ²°ν©
|
54 |
messages = [{"role": "system", "content": system_prefix}]
|
|
|
77 |
try:
|
78 |
# CSV νμΌ μ½κΈ°
|
79 |
df = pd.read_csv(file_path, sep=',')
|
|
|
80 |
# νμ μ»¬λΌ νμΈ
|
81 |
required_columns = {'id', 'text', 'label', 'metadata'}
|
82 |
available_columns = set(df.columns)
|
83 |
missing_columns = required_columns - available_columns
|
|
|
84 |
if missing_columns:
|
85 |
return f"CSV νμΌμ λ€μ νμ 컬λΌμ΄ λλ½λμμ΅λλ€: {', '.join(missing_columns)}", ""
|
|
|
86 |
# λ°μ΄ν° ν΄λ μ§
|
87 |
df.drop_duplicates(inplace=True)
|
88 |
df.fillna('', inplace=True)
|
|
|
89 |
# λ°μ΄ν° μ ν μ΅μ ν
|
90 |
df = df.astype({'id': 'int32', 'text': 'string', 'label': 'category', 'metadata': 'string'})
|
|
|
91 |
# Parquet νμΌλ‘ λ³ν
|
92 |
parquet_filename = os.path.splitext(os.path.basename(file_path))[0] + '.parquet'
|
93 |
df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
|
|
|
94 |
return f"{parquet_filename} νμΌμ΄ μ±κ³΅μ μΌλ‘ μ
λ‘λλκ³ λ³νλμμ΅λλ€.", parquet_filename
|
95 |
except Exception as e:
|
96 |
return f"CSV νμΌ μ
λ‘λ λ° λ³ν μ€ μ€λ₯κ° λ°μνμ΅λλ€: {str(e)}", ""
|
|
|
99 |
try:
|
100 |
# Parquet νμΌ μ½κΈ°
|
101 |
df = pd.read_parquet(file_path, engine='pyarrow')
|
|
|
102 |
# MarkdownμΌλ‘ λ³ννμ¬ λ―Έλ¦¬λ³΄κΈ°
|
103 |
parquet_content = df.head(10).to_markdown(index=False)
|
|
|
104 |
# DataFrameμ JSONμΌλ‘ λ³ν
|
105 |
parquet_json = df.to_json()
|
|
|
106 |
return "Parquet νμΌμ΄ μ±κ³΅μ μΌλ‘ μ
λ‘λλμμ΅λλ€.", parquet_content, parquet_json
|
107 |
except Exception as e:
|
108 |
return f"Parquet νμΌ μ
λ‘λ μ€ μ€λ₯κ° λ°μνμ΅λλ€: {str(e)}", "", ""
|
|
|
112 |
# ν
μ€νΈλ₯Ό DataFrameμΌλ‘ λ³ν (κ° νμ μ½€λ§λ‘ ꡬλΆ)
|
113 |
data = [line.split(',') for line in text.strip().split('\n')]
|
114 |
df = pd.DataFrame(data, columns=['id', 'text', 'label', 'metadata'])
|
|
|
115 |
# λ°μ΄ν° μ ν μ΅μ ν
|
116 |
df = df.astype({'id': 'int32', 'text': 'string', 'label': 'category', 'metadata': 'string'})
|
|
|
117 |
# Parquet νμΌλ‘ λ³ν
|
118 |
parquet_filename = 'text_to_parquet.parquet'
|
119 |
df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
|
|
|
120 |
# Parquet νμΌ λ΄μ© 미리보기
|
121 |
parquet_content = load_parquet(parquet_filename)
|
|
|
122 |
return f"{parquet_filename} νμΌμ΄ μ±κ³΅μ μΌλ‘ λ³νλμμ΅λλ€.", parquet_content, parquet_filename
|
123 |
except Exception as e:
|
124 |
return f"ν
μ€νΈ λ³ν μ€ μ€λ₯κ° λ°μνμ΅λλ€: {str(e)}", "", ""
|
|
|
129 |
visibility: hidden;
|
130 |
}
|
131 |
#chatbot-container, #chatbot-data-upload {
|
132 |
+
height: 700px;
|
133 |
overflow-y: scroll;
|
134 |
}
|
135 |
#chatbot-container .message, #chatbot-data-upload .message {
|
|
|
140 |
background-color: #ffffff; /* ν°μ λ°°κ²½ */
|
141 |
color: #000000; /* κ²μ μ κΈμ */
|
142 |
}
|
143 |
+
/* νμΌ μ
λ‘λ μμ λμ΄ μ‘°μ */
|
144 |
+
#parquet-upload-area {
|
145 |
+
max-height: 150px;
|
146 |
+
overflow-y: auto;
|
147 |
+
}
|
148 |
+
/* μ΄κΈ° μ€λͺ
κΈμ¨ ν¬κΈ° μ‘°μ */
|
149 |
+
#initial-description {
|
150 |
+
font-size: 14px;
|
151 |
+
}
|
152 |
"""
|
153 |
|
154 |
# Gradio Blocks 인터페이스 설정
|
155 |
with gr.Blocks(css=css) as demo:
|
156 |
+
gr.Markdown("# My RAG: LLMμ΄ λλ§μ λ°μ΄ν°λ‘ νμ΅ν μ½ν
μΈ μμ±/λ΅λ³", elem_id="initial-description")
|
157 |
+
gr.Markdown(
|
158 |
+
"### 1) λλ§μ λ°μ΄ν°λ₯Ό ν
μ€νΈλ‘ μ
λ ₯νκ±°λ CSVλ₯Ό μ
λ‘λνμ¬ Parquet ν¬λ§· λ°μ΄ν°μ
μλ λ³νν©λλ€.\n"
|
159 |
+
"### 2) Parquet ν¬λ§· λ°μ΄ν°μ
μ μ
λ‘λνλ©΄, LLMμ΄ λ§μΆ€ νμ΅ λ°μ΄ν°λ‘ νμ©νμ¬ μλ΅μ μμν©λλ€.\n"
|
160 |
+
"### Tip) 'μμ 'λ₯Ό ν΅ν΄ λ€μν νμ© λ°©λ²μ 체ννκ³ μμ©ν΄ 보μΈμ.",
|
161 |
+
elem_id="initial-description"
|
162 |
+
)
|
163 |
+
|
164 |
# 첫 λ²μ§Έ ν: μ±λ΄ λ°μ΄ν° μ
λ‘λ (ν μ΄λ¦ λ³κ²½: "My λ°μ΄ν°μ
+LLM")
|
165 |
with gr.Tab("My λ°μ΄ν°μ
+LLM"):
|
166 |
gr.Markdown("### Parquet νμΌ μ
λ‘λ λ° μ§λ¬ΈνκΈ°")
|
167 |
with gr.Row():
|
168 |
with gr.Column():
|
169 |
+
parquet_upload = gr.File(
|
170 |
+
label="Parquet νμΌ μ
λ‘λ", type="filepath", elem_id="parquet-upload-area"
|
171 |
+
)
|
172 |
parquet_upload_button = gr.Button("μ
λ‘λ")
|
173 |
parquet_upload_status = gr.Textbox(label="μ
λ‘λ μν", interactive=False)
|
174 |
parquet_preview_chat = gr.Markdown(label="Parquet νμΌ λ―Έλ¦¬λ³΄κΈ°")
|
|
|
198 |
temperature = gr.Slider(minimum=0, maximum=1, value=0.7, label="Temperature")
|
199 |
top_p = gr.Slider(minimum=0, maximum=1, value=0.9, label="Top P")
|
200 |
|
201 |
+
def handle_message_data_upload(
    message: str,
    history: List[Dict[str, str]],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
    parquet_data: str
):
    """Stream a chat reply for the data-upload tab.

    Generator wired to ``send_data_upload.click``. Every yielded value is a
    ``(chat_history, message_box_value)`` pair matching the handler's two
    outputs; the second item is always ``""`` (presumably to clear the input
    box -- confirm against the component definition).

    Args:
        message: The user's new message.
        history: Prior conversation as ``{"role", "content"}`` dicts; a falsy
            value (``None`` or empty) is treated as an empty conversation.
        system_message: Forwarded unchanged to ``respond``.
        max_tokens: Forwarded unchanged to ``respond``.
        temperature: Forwarded unchanged to ``respond``.
        top_p: Forwarded unchanged to ``respond``.
        parquet_data: Forwarded unchanged to ``respond``.

    Yields:
        While streaming: the prior history plus the user message and the
        latest partial assistant text. On failure: the history with the user
        message and an assistant-role error message appended.
    """
    history = history or []
    try:
        # Generate the response. `respond` receives the history WITHOUT the
        # current user turn; that turn is only appended once streaming ends.
        response_gen = respond(
            message, history, system_message, max_tokens, temperature, top_p, parquet_data
        )
        partial_response = ""
        for partial in response_gen:
            # Each item replaces the previous one as the assistant text.
            partial_response = partial
            # Build a display-only copy so `history` is not mutated mid-stream.
            display_history = history + [
                {"role": "user", "content": message},
                {"role": "assistant", "content": partial_response}
            ]
            yield display_history, ""
        # Streaming finished: record the completed exchange in the history.
        history.append({"role": "user", "content": message})
        history.append({"role": "assistant", "content": partial_response})
    except Exception as e:
        # UI boundary: report any failure as an assistant message instead of
        # letting the exception propagate out of the event handler.
        response = f"추론 중 오류가 발생했습니다: {str(e)}"
        history.append({"role": "user", "content": message})
        history.append({"role": "assistant", "content": response})
        yield history, ""
|
233 |
|
234 |
send_data_upload.click(
|
235 |
handle_message_data_upload,
|
236 |
+
inputs=[
|
237 |
+
msg_data_upload,
|
238 |
+
chatbot_data_upload,
|
239 |
+
system_message,
|
240 |
+
max_tokens,
|
241 |
+
temperature,
|
242 |
+
top_p,
|
243 |
+
parquet_data_state
|
244 |
+
],
|
245 |
outputs=[chatbot_data_upload, msg_data_upload],
|
246 |
queue=True
|
247 |
)
|
248 |
|
249 |
+
# μμ μΆκ°
|
250 |
+
with gr.Accordion("μμ ", open=False):
|
251 |
+
gr.Examples(
|
252 |
+
examples=[
|
253 |
+
["μ
λ‘λλ λ°μ΄ν°μ
μ λ΄μ© μ€ λ¦¬μ€νΈ 5κ° νλͺ©μ μΆλ ₯νλΌ"],
|
254 |
+
["μ
λ‘λλ λ°μ΄ν°μ
νμΌμ νμ΅ λ°μ΄οΏ½οΏ½οΏ½λ‘ νμ©νμ¬ μ λ¬Έ λΈλ‘κ·Έ ν¬μ€νΈλ₯Ό 4000 ν ν° μ΄μ μμ±νλΌ"],
|
255 |
+
],
|
256 |
+
inputs=msg_data_upload,
|
257 |
+
label="μμ μ ν",
|
258 |
+
)
|
259 |
+
|
260 |
# λ λ²μ§Έ ν: λ°μ΄ν° λ³ν (ν μ΄λ¦ λ³κ²½: "CSV to My λ°μ΄ν°μ
")
|
261 |
with gr.Tab("CSV to My λ°μ΄ν°μ
"):
|
262 |
gr.Markdown("### CSV νμΌ μ
λ‘λ λ° Parquet λ³ν")
|
|
|
310 |
outputs=[convert_status, parquet_preview_convert, download_parquet_convert]
|
311 |
)
|
312 |
|
313 |
+
gr.Markdown("### [email protected]", elem_id="initial-description")
|
|
|
314 |
|
315 |
if __name__ == "__main__":
|
316 |
demo.launch()
|
317 |
+
|