Update app.py
Browse files
app.py
CHANGED
@@ -2,12 +2,13 @@ import gradio as gr
|
|
2 |
from huggingface_hub import InferenceClient
|
3 |
import os
|
4 |
import pandas as pd
|
|
|
5 |
from typing import List, Tuple
|
6 |
|
7 |
# LLM Models Definition
|
8 |
LLM_MODELS = {
|
9 |
"Cohere c4ai-crp-08-2024": "CohereForAI/c4ai-command-r-plus-08-2024", # Default
|
10 |
-
"Meta Llama3.3-70B": "meta-llama/Llama-3.3-70B-Instruct",
|
11 |
"Mistral Nemo 2407": "mistralai/Mistral-Nemo-Instruct-2407",
|
12 |
"Alibaba Qwen QwQ-32B": "Qwen/QwQ-32B-Preview"
|
13 |
}
|
@@ -17,11 +18,18 @@ def get_client(model_name):
|
|
17 |
|
18 |
def analyze_file_content(content, file_type):
|
19 |
"""Analyze file content and return structural summary"""
|
20 |
-
if file_type in ['parquet', 'csv']:
|
21 |
try:
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
header = lines[0]
|
24 |
-
columns = header.
|
25 |
rows = len(lines) - 3
|
26 |
return f"π Dataset Structure: {columns} columns, {rows} data samples"
|
27 |
except:
|
@@ -47,39 +55,36 @@ def read_uploaded_file(file):
|
|
47 |
try:
|
48 |
file_ext = os.path.splitext(file.name)[1].lower()
|
49 |
|
50 |
-
if file_ext
|
51 |
-
|
|
|
|
|
|
|
52 |
content = df.head(10).to_markdown(index=False)
|
53 |
-
return content,
|
54 |
elif file_ext == '.csv':
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
for col, null_count in null_counts[null_counts > 0].items():
|
71 |
-
content += f"- {col}: {null_count} missing\n"
|
72 |
-
return content, "csv"
|
73 |
-
except UnicodeDecodeError:
|
74 |
-
continue
|
75 |
-
raise UnicodeDecodeError(f"β Unable to read file with supported encodings ({', '.join(encodings)})")
|
76 |
else:
|
77 |
encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
|
78 |
for encoding in encodings:
|
79 |
try:
|
80 |
with open(file.name, 'r', encoding=encoding) as f:
|
81 |
content = f.read()
|
82 |
-
return content,
|
83 |
except UnicodeDecodeError:
|
84 |
continue
|
85 |
raise UnicodeDecodeError(f"β Unable to read file with supported encodings ({', '.join(encodings)})")
|
@@ -96,45 +101,55 @@ def format_history(history):
|
|
96 |
|
97 |
def chat(message, history, uploaded_file, model_name, system_message="", max_tokens=4000, temperature=0.7, top_p=0.9):
|
98 |
system_prefix = """You are a file analysis expert. Analyze the uploaded file in depth from the following perspectives:
|
99 |
-
1. π Overall structure and composition
|
100 |
-
2. π Key content and pattern analysis
|
101 |
-
3. π Data characteristics and meaning
|
102 |
-
|
103 |
-
|
104 |
-
4. π‘ Potential applications
|
105 |
-
5. β¨ Data quality and areas for improvement
|
106 |
-
Provide detailed and structured analysis from an expert perspective, but explain in an easy-to-understand way. Format the analysis results in Markdown and include specific examples where possible."""
|
107 |
|
108 |
if uploaded_file:
|
109 |
content, file_type = read_uploaded_file(uploaded_file)
|
110 |
if file_type == "error":
|
111 |
-
|
112 |
-
return
|
113 |
|
114 |
file_summary = analyze_file_content(content, file_type)
|
115 |
|
116 |
-
if file_type in ['parquet', 'csv']:
|
117 |
system_message += f"\n\nFile Content:\n```markdown\n{content}\n```"
|
118 |
else:
|
119 |
system_message += f"\n\nFile Content:\n```\n{content}\n```"
|
120 |
|
121 |
if message == "Starting file analysis...":
|
122 |
-
message = f"""[
|
123 |
-
|
124 |
-
1. π
|
125 |
-
2. π
|
126 |
-
3. π
|
127 |
-
4. β
|
128 |
-
5. π‘
|
129 |
-
6. π―
|
130 |
|
131 |
messages = [{"role": "system", "content": f"{system_prefix} {system_message}"}]
|
132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
messages.append({"role": "user", "content": message})
|
134 |
|
135 |
try:
|
136 |
client = get_client(model_name)
|
137 |
partial_message = ""
|
|
|
138 |
|
139 |
for msg in client.chat_completion(
|
140 |
messages,
|
@@ -146,11 +161,19 @@ Please provide detailed analysis from these perspectives:
|
|
146 |
token = msg.choices[0].delta.get('content', None)
|
147 |
if token:
|
148 |
partial_message += token
|
149 |
-
|
|
|
|
|
|
|
|
|
150 |
|
151 |
except Exception as e:
|
152 |
error_msg = f"β Inference error: {str(e)}"
|
153 |
-
|
|
|
|
|
|
|
|
|
154 |
|
155 |
css = """
|
156 |
footer {visibility: hidden}
|
@@ -172,38 +195,36 @@ with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css, title="EveryChat
|
|
172 |
with gr.Column(scale=2):
|
173 |
chatbot = gr.Chatbot(
|
174 |
height=600,
|
175 |
-
label="
|
176 |
-
type="messages"
|
177 |
)
|
178 |
msg = gr.Textbox(
|
179 |
-
label="
|
180 |
show_label=False,
|
181 |
-
placeholder="
|
182 |
container=False
|
183 |
)
|
184 |
-
|
185 |
-
clear = gr.ClearButton([msg, chatbot]) # label 제거
|
186 |
-
send = gr.Button("Send π€")
|
187 |
|
188 |
with gr.Column(scale=1):
|
189 |
model_name = gr.Radio(
|
190 |
choices=list(LLM_MODELS.keys()),
|
191 |
value="Cohere c4ai-crp-08-2024",
|
192 |
-
label="
|
193 |
-
info="
|
194 |
)
|
195 |
|
|
|
196 |
file_upload = gr.File(
|
197 |
-
label="
|
198 |
-
|
199 |
-
file_types=["text", ".csv", ".parquet"],
|
200 |
type="filepath"
|
201 |
)
|
202 |
|
203 |
-
with gr.Accordion("
|
204 |
-
system_message = gr.Textbox(label="
|
205 |
-
max_tokens = gr.Slider(minimum=1, maximum=8000, value=4000, label="
|
206 |
-
temperature = gr.Slider(minimum=0, maximum=1, value=0.7, label="
|
207 |
top_p = gr.Slider(minimum=0, maximum=1, value=0.9, label="Top P π")
|
208 |
|
209 |
# Event bindings
|
@@ -218,7 +239,7 @@ with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css, title="EveryChat
|
|
218 |
[msg]
|
219 |
)
|
220 |
|
221 |
-
send.click(
|
222 |
chat,
|
223 |
inputs=[msg, chatbot, file_upload, model_name, system_message, max_tokens, temperature, top_p],
|
224 |
outputs=[msg, chatbot],
|
@@ -232,7 +253,7 @@ with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css, title="EveryChat
|
|
232 |
# Auto-analysis on file upload
|
233 |
file_upload.change(
|
234 |
chat,
|
235 |
-
inputs=[gr.Textbox(value="
|
236 |
outputs=[msg, chatbot],
|
237 |
queue=True
|
238 |
)
|
@@ -240,12 +261,12 @@ with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css, title="EveryChat
|
|
240 |
# Example queries
|
241 |
gr.Examples(
|
242 |
examples=[
|
243 |
-
["
|
244 |
-
["
|
245 |
-
["
|
246 |
-
["
|
247 |
-
["
|
248 |
-
["
|
249 |
],
|
250 |
inputs=msg,
|
251 |
)
|
|
|
2 |
from huggingface_hub import InferenceClient
|
3 |
import os
|
4 |
import pandas as pd
|
5 |
+
import pdfplumber
|
6 |
from typing import List, Tuple
|
7 |
|
8 |
# LLM model registry: maps the human-readable label shown in the UI radio
# selector to the Hugging Face Hub repo id passed to InferenceClient.
# The first entry is the default selection.
_MODEL_CHOICES = [
    ("Cohere c4ai-crp-08-2024", "CohereForAI/c4ai-command-r-plus-08-2024"),  # Default
    ("Meta Llama3.3-70B", "meta-llama/Llama-3.3-70B-Instruct"),
    ("Mistral Nemo 2407", "mistralai/Mistral-Nemo-Instruct-2407"),
    ("Alibaba Qwen QwQ-32B", "Qwen/QwQ-32B-Preview"),
]
LLM_MODELS = dict(_MODEL_CHOICES)
|
|
|
18 |
|
19 |
def analyze_file_content(content, file_type):
|
20 |
"""Analyze file content and return structural summary"""
|
21 |
+
if file_type in ['parquet', 'csv', 'pdf']:
|
22 |
try:
|
23 |
+
if file_type == 'pdf':
|
24 |
+
with pdfplumber.open(content) as pdf:
|
25 |
+
pages = pdf.pages
|
26 |
+
lines = []
|
27 |
+
for page in pages:
|
28 |
+
lines.extend(page.extract_text().split('\n'))
|
29 |
+
else:
|
30 |
+
lines = content.split('\n')
|
31 |
header = lines[0]
|
32 |
+
columns = len(header.split('|')) - 1
|
33 |
rows = len(lines) - 3
|
34 |
return f"π Dataset Structure: {columns} columns, {rows} data samples"
|
35 |
except:
|
|
|
55 |
try:
|
56 |
file_ext = os.path.splitext(file.name)[1].lower()
|
57 |
|
58 |
+
if file_ext in ['.parquet', '.pdf']:
|
59 |
+
if file_ext == '.parquet':
|
60 |
+
df = pd.read_parquet(file.name, engine='pyarrow')
|
61 |
+
else:
|
62 |
+
df = pd.read_csv(file.name, encoding='utf-8', engine='python') # Use 'python' engine to handle PDF files
|
63 |
content = df.head(10).to_markdown(index=False)
|
64 |
+
return content, file_ext
|
65 |
elif file_ext == '.csv':
|
66 |
+
df = pd.read_csv(file.name)
|
67 |
+
content = f"π Data Preview:\n{df.head(10).to_markdown(index=False)}\n\n"
|
68 |
+
content += f"\nπ Data Information:\n"
|
69 |
+
content += f"- Total Rows: {len(df)}\n"
|
70 |
+
content += f"- Total Columns: {len(df.columns)}\n"
|
71 |
+
content += f"- Column List: {', '.join(df.columns)}\n"
|
72 |
+
content += f"\nπ Column Data Types:\n"
|
73 |
+
for col, dtype in df.dtypes.items():
|
74 |
+
content += f"- {col}: {dtype}\n"
|
75 |
+
null_counts = df.isnull().sum()
|
76 |
+
if null_counts.any():
|
77 |
+
content += f"\nβ οΈ Missing Values:\n"
|
78 |
+
for col, null_count in null_counts[null_counts > 0].items():
|
79 |
+
content += f"- {col}: {null_count} missing\n"
|
80 |
+
return content, file_ext
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
else:
|
82 |
encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
|
83 |
for encoding in encodings:
|
84 |
try:
|
85 |
with open(file.name, 'r', encoding=encoding) as f:
|
86 |
content = f.read()
|
87 |
+
return content, file_ext
|
88 |
except UnicodeDecodeError:
|
89 |
continue
|
90 |
raise UnicodeDecodeError(f"β Unable to read file with supported encodings ({', '.join(encodings)})")
|
|
|
101 |
|
102 |
def chat(message, history, uploaded_file, model_name, system_message="", max_tokens=4000, temperature=0.7, top_p=0.9):
|
103 |
system_prefix = """You are a file analysis expert. Analyze the uploaded file in depth from the following perspectives:
|
104 |
+
1. π Overall structure and composition
|
105 |
+
2. π Key content and pattern analysis
|
106 |
+
3. π Data characteristics and meaning
|
107 |
+
- For datasets: Column meanings, data types, value distributions
|
108 |
+
- For text/code: Structural features, main patterns
|
109 |
+
4. π‘ Potential applications
|
110 |
+
5. β¨ Data quality and areas for improvement
|
111 |
+
Provide detailed and structured analysis from an expert perspective, but explain in an easy-to-understand way. Format the analysis results in Markdown and include specific examples where possible."""
|
112 |
|
113 |
if uploaded_file:
|
114 |
content, file_type = read_uploaded_file(uploaded_file)
|
115 |
if file_type == "error":
|
116 |
+
return "", [{"role": "user", "content": message}, {"role": "assistant", "content": content}]
|
|
|
117 |
|
118 |
file_summary = analyze_file_content(content, file_type)
|
119 |
|
120 |
+
if file_type in ['parquet', 'csv', 'pdf']:
|
121 |
system_message += f"\n\nFile Content:\n```markdown\n{content}\n```"
|
122 |
else:
|
123 |
system_message += f"\n\nFile Content:\n```\n{content}\n```"
|
124 |
|
125 |
if message == "Starting file analysis...":
|
126 |
+
message = f"""[ꡬ쑰 λΆμ] {file_summary}
|
127 |
+
μμΈν λΆμν΄μ£ΌμΈμ:
|
128 |
+
1. π μ 체 ꡬ쑰 λ° νμ
|
129 |
+
2. π μ£Όμ λ΄μ© λ° κ΅¬μ±μμ λΆμ
|
130 |
+
3. π λ°μ΄ν°/λ΄μ©μ νΉμ± λ° ν¨ν΄
|
131 |
+
4. β νμ§ λ° μμ μ± νκ°
|
132 |
+
5. π‘ μ μνλ κ°μ μ
|
133 |
+
6. π― μ€μ©μ μΈ νμ© λ° κΆμ₯μ¬ν"""
|
134 |
|
135 |
messages = [{"role": "system", "content": f"{system_prefix} {system_message}"}]
|
136 |
+
|
137 |
+
# Convert history to message format
|
138 |
+
if history is not None:
|
139 |
+
for item in history:
|
140 |
+
if isinstance(item, dict):
|
141 |
+
messages.append(item)
|
142 |
+
elif isinstance(item, (list, tuple)) and len(item) == 2:
|
143 |
+
messages.append({"role": "user", "content": item[0]})
|
144 |
+
if item[1]:
|
145 |
+
messages.append({"role": "assistant", "content": item[1]})
|
146 |
+
|
147 |
messages.append({"role": "user", "content": message})
|
148 |
|
149 |
try:
|
150 |
client = get_client(model_name)
|
151 |
partial_message = ""
|
152 |
+
current_history = []
|
153 |
|
154 |
for msg in client.chat_completion(
|
155 |
messages,
|
|
|
161 |
token = msg.choices[0].delta.get('content', None)
|
162 |
if token:
|
163 |
partial_message += token
|
164 |
+
current_history = [
|
165 |
+
{"role": "user", "content": message},
|
166 |
+
{"role": "assistant", "content": partial_message}
|
167 |
+
]
|
168 |
+
yield "", current_history
|
169 |
|
170 |
except Exception as e:
|
171 |
error_msg = f"β Inference error: {str(e)}"
|
172 |
+
error_history = [
|
173 |
+
{"role": "user", "content": message},
|
174 |
+
{"role": "assistant", "content": error_msg}
|
175 |
+
]
|
176 |
+
yield "", error_history
|
177 |
|
178 |
css = """
|
179 |
footer {visibility: hidden}
|
|
|
195 |
with gr.Column(scale=2):
|
196 |
chatbot = gr.Chatbot(
|
197 |
height=600,
|
198 |
+
label="μ±ν
μΈν°νμ΄μ€ π¬",
|
199 |
+
type="messages"
|
200 |
)
|
201 |
msg = gr.Textbox(
|
202 |
+
label="λ©μμ§λ₯Ό μ
λ ₯νμΈμ",
|
203 |
show_label=False,
|
204 |
+
placeholder="μ
λ‘λλ νμΌμ λν΄ λ¬Όμ΄λ³΄μΈμ... π",
|
205 |
container=False
|
206 |
)
|
207 |
+
send = gr.Button("μ μ‘ π€")
|
|
|
|
|
208 |
|
209 |
with gr.Column(scale=1):
|
210 |
model_name = gr.Radio(
|
211 |
choices=list(LLM_MODELS.keys()),
|
212 |
value="Cohere c4ai-crp-08-2024",
|
213 |
+
label="LLM λͺ¨λΈ μ ν π€",
|
214 |
+
info="μ νΈνλ AI λͺ¨λΈμ μ ννμΈμ"
|
215 |
)
|
216 |
|
217 |
+
gr.Markdown("### νμΌ μ
λ‘λ π\nμ§μ: ν
μ€νΈ, μ½λ, CSV, Parquet, PDF νμΌ")
|
218 |
file_upload = gr.File(
|
219 |
+
label="νμΌ μ
λ‘λ",
|
220 |
+
file_types=["text", ".csv", ".parquet", ".pdf"],
|
|
|
221 |
type="filepath"
|
222 |
)
|
223 |
|
224 |
+
with gr.Accordion("κ³ κΈ μ€μ βοΈ", open=False):
|
225 |
+
system_message = gr.Textbox(label="μμ€ν
λ©μμ§ π", value="")
|
226 |
+
max_tokens = gr.Slider(minimum=1, maximum=8000, value=4000, label="μ΅λ ν ν° π")
|
227 |
+
temperature = gr.Slider(minimum=0, maximum=1, value=0.7, label="μ¨λ π‘οΈ")
|
228 |
top_p = gr.Slider(minimum=0, maximum=1, value=0.9, label="Top P π")
|
229 |
|
230 |
# Event bindings
|
|
|
239 |
[msg]
|
240 |
)
|
241 |
|
242 |
+
send.click(
|
243 |
chat,
|
244 |
inputs=[msg, chatbot, file_upload, model_name, system_message, max_tokens, temperature, top_p],
|
245 |
outputs=[msg, chatbot],
|
|
|
253 |
# Auto-analysis on file upload
|
254 |
file_upload.change(
|
255 |
chat,
|
256 |
+
inputs=[gr.Textbox(value="νμΌ λΆμ μμ..."), chatbot, file_upload, model_name, system_message, max_tokens, temperature, top_p],
|
257 |
outputs=[msg, chatbot],
|
258 |
queue=True
|
259 |
)
|
|
|
261 |
# Example queries
|
262 |
gr.Examples(
|
263 |
examples=[
|
264 |
+
["νμΌμ μ 체 ꡬ쑰μ νΉμ§μ μμΈν μ€λͺ
ν΄μ£ΌμΈμ π"],
|
265 |
+
["νμΌμ μ£Όμ ν¨ν΄κ³Ό νΉμ±μ λΆμν΄μ£ΌμΈμ π"],
|
266 |
+
["νμΌμ νμ§κ³Ό κ°μ μ μ νκ°ν΄μ£ΌμΈμ π‘"],
|
267 |
+
["μ΄ νμΌμ μ΄λ»κ² μ€μ©μ μΌλ‘ νμ©ν μ μμκΉμ? π―"],
|
268 |
+
["μ£Όμ λ΄μ©μ μμ½νκ³ ν΅μ¬ ν΅μ°°λ ₯μ λμΆν΄μ£ΌμΈμ β¨"],
|
269 |
+
["λ μμΈν λΆμμ κ³μν΄μ£ΌμΈμ π"],
|
270 |
],
|
271 |
inputs=msg,
|
272 |
)
|