openfree commited on
Commit
a352e50
Β·
verified Β·
1 Parent(s): 0dfd273

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -75
app.py CHANGED
@@ -2,12 +2,13 @@ import gradio as gr
2
  from huggingface_hub import InferenceClient
3
  import os
4
  import pandas as pd
 
5
  from typing import List, Tuple
6
 
7
  # LLM Models Definition
8
  LLM_MODELS = {
9
  "Cohere c4ai-crp-08-2024": "CohereForAI/c4ai-command-r-plus-08-2024", # Default
10
- "Meta Llama3.3-70B": "meta-llama/Llama-3.3-70B-Instruct",
11
  "Mistral Nemo 2407": "mistralai/Mistral-Nemo-Instruct-2407",
12
  "Alibaba Qwen QwQ-32B": "Qwen/QwQ-32B-Preview"
13
  }
@@ -17,11 +18,18 @@ def get_client(model_name):
17
 
18
  def analyze_file_content(content, file_type):
19
  """Analyze file content and return structural summary"""
20
- if file_type in ['parquet', 'csv']:
21
  try:
22
- lines = content.split('\n')
 
 
 
 
 
 
 
23
  header = lines[0]
24
- columns = header.count('|') - 1
25
  rows = len(lines) - 3
26
  return f"πŸ“Š Dataset Structure: {columns} columns, {rows} data samples"
27
  except:
@@ -47,39 +55,36 @@ def read_uploaded_file(file):
47
  try:
48
  file_ext = os.path.splitext(file.name)[1].lower()
49
 
50
- if file_ext == '.parquet':
51
- df = pd.read_parquet(file.name, engine='pyarrow')
 
 
 
52
  content = df.head(10).to_markdown(index=False)
53
- return content, "parquet"
54
  elif file_ext == '.csv':
55
- encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
56
- for encoding in encodings:
57
- try:
58
- df = pd.read_csv(file.name, encoding=encoding)
59
- content = f"πŸ“Š Data Preview:\n{df.head(10).to_markdown(index=False)}\n\n"
60
- content += f"\nπŸ“ˆ Data Information:\n"
61
- content += f"- Total Rows: {len(df)}\n"
62
- content += f"- Total Columns: {len(df.columns)}\n"
63
- content += f"- Column List: {', '.join(df.columns)}\n"
64
- content += f"\nπŸ“‹ Column Data Types:\n"
65
- for col, dtype in df.dtypes.items():
66
- content += f"- {col}: {dtype}\n"
67
- null_counts = df.isnull().sum()
68
- if null_counts.any():
69
- content += f"\n⚠️ Missing Values:\n"
70
- for col, null_count in null_counts[null_counts > 0].items():
71
- content += f"- {col}: {null_count} missing\n"
72
- return content, "csv"
73
- except UnicodeDecodeError:
74
- continue
75
- raise UnicodeDecodeError(f"❌ Unable to read file with supported encodings ({', '.join(encodings)})")
76
  else:
77
  encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
78
  for encoding in encodings:
79
  try:
80
  with open(file.name, 'r', encoding=encoding) as f:
81
  content = f.read()
82
- return content, "text"
83
  except UnicodeDecodeError:
84
  continue
85
  raise UnicodeDecodeError(f"❌ Unable to read file with supported encodings ({', '.join(encodings)})")
@@ -96,45 +101,55 @@ def format_history(history):
96
 
97
  def chat(message, history, uploaded_file, model_name, system_message="", max_tokens=4000, temperature=0.7, top_p=0.9):
98
  system_prefix = """You are a file analysis expert. Analyze the uploaded file in depth from the following perspectives:
99
- 1. πŸ“‹ Overall structure and composition
100
- 2. πŸ“Š Key content and pattern analysis
101
- 3. πŸ“ˆ Data characteristics and meaning
102
- - For datasets: Column meanings, data types, value distributions
103
- - For text/code: Structural features, main patterns
104
- 4. πŸ’‘ Potential applications
105
- 5. ✨ Data quality and areas for improvement
106
- Provide detailed and structured analysis from an expert perspective, but explain in an easy-to-understand way. Format the analysis results in Markdown and include specific examples where possible."""
107
 
108
  if uploaded_file:
109
  content, file_type = read_uploaded_file(uploaded_file)
110
  if file_type == "error":
111
- yield "", history + [[message, content]]
112
- return
113
 
114
  file_summary = analyze_file_content(content, file_type)
115
 
116
- if file_type in ['parquet', 'csv']:
117
  system_message += f"\n\nFile Content:\n```markdown\n{content}\n```"
118
  else:
119
  system_message += f"\n\nFile Content:\n```\n{content}\n```"
120
 
121
  if message == "Starting file analysis...":
122
- message = f"""[Structure Analysis] {file_summary}
123
- Please provide detailed analysis from these perspectives:
124
- 1. πŸ“‹ Overall file structure and format
125
- 2. πŸ“Š Key content and component analysis
126
- 3. πŸ“ˆ Data/content characteristics and patterns
127
- 4. ⭐ Quality and completeness evaluation
128
- 5. πŸ’‘ Suggested improvements
129
- 6. 🎯 Practical applications and recommendations"""
130
 
131
  messages = [{"role": "system", "content": f"{system_prefix} {system_message}"}]
132
- messages.extend(format_history(history))
 
 
 
 
 
 
 
 
 
 
133
  messages.append({"role": "user", "content": message})
134
 
135
  try:
136
  client = get_client(model_name)
137
  partial_message = ""
 
138
 
139
  for msg in client.chat_completion(
140
  messages,
@@ -146,11 +161,19 @@ Please provide detailed analysis from these perspectives:
146
  token = msg.choices[0].delta.get('content', None)
147
  if token:
148
  partial_message += token
149
- yield "", history + [[message, partial_message]]
 
 
 
 
150
 
151
  except Exception as e:
152
  error_msg = f"❌ Inference error: {str(e)}"
153
- yield "", history + [[message, error_msg]]
 
 
 
 
154
 
155
  css = """
156
  footer {visibility: hidden}
@@ -172,38 +195,36 @@ with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css, title="EveryChat
172
  with gr.Column(scale=2):
173
  chatbot = gr.Chatbot(
174
  height=600,
175
- label="Chat Interface πŸ’¬",
176
- type="messages" # κ²½κ³  해결을 μœ„ν•΄ type 지정
177
  )
178
  msg = gr.Textbox(
179
- label="Type your message",
180
  show_label=False,
181
- placeholder="Ask me anything about the uploaded file... πŸ’­",
182
  container=False
183
  )
184
- with gr.Row():
185
- clear = gr.ClearButton([msg, chatbot]) # label 제거
186
- send = gr.Button("Send πŸ“€")
187
 
188
  with gr.Column(scale=1):
189
  model_name = gr.Radio(
190
  choices=list(LLM_MODELS.keys()),
191
  value="Cohere c4ai-crp-08-2024",
192
- label="Select LLM Model πŸ€–",
193
- info="Choose your preferred AI model"
194
  )
195
 
 
196
  file_upload = gr.File(
197
- label="Upload File πŸ“",
198
- info="Support: Text, Code, CSV, Parquet files",
199
- file_types=["text", ".csv", ".parquet"],
200
  type="filepath"
201
  )
202
 
203
- with gr.Accordion("Advanced Settings βš™οΈ", open=False):
204
- system_message = gr.Textbox(label="System Message πŸ“", value="")
205
- max_tokens = gr.Slider(minimum=1, maximum=8000, value=4000, label="Max Tokens πŸ“Š")
206
- temperature = gr.Slider(minimum=0, maximum=1, value=0.7, label="Temperature 🌑️")
207
  top_p = gr.Slider(minimum=0, maximum=1, value=0.9, label="Top P πŸ“ˆ")
208
 
209
  # Event bindings
@@ -218,7 +239,7 @@ with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css, title="EveryChat
218
  [msg]
219
  )
220
 
221
- send.click( # 전솑 λ²„νŠΌ 이벀트 μΆ”κ°€
222
  chat,
223
  inputs=[msg, chatbot, file_upload, model_name, system_message, max_tokens, temperature, top_p],
224
  outputs=[msg, chatbot],
@@ -232,7 +253,7 @@ with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css, title="EveryChat
232
  # Auto-analysis on file upload
233
  file_upload.change(
234
  chat,
235
- inputs=[gr.Textbox(value="Starting file analysis..."), chatbot, file_upload, model_name, system_message, max_tokens, temperature, top_p],
236
  outputs=[msg, chatbot],
237
  queue=True
238
  )
@@ -240,12 +261,12 @@ with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css, title="EveryChat
240
  # Example queries
241
  gr.Examples(
242
  examples=[
243
- ["Please explain the overall structure and features of the file in detail πŸ“‹"],
244
- ["Analyze the main patterns and characteristics of this file πŸ“Š"],
245
- ["Evaluate the file's quality and potential improvements πŸ’‘"],
246
- ["How can we practically utilize this file? 🎯"],
247
- ["Summarize the main content and derive key insights ✨"],
248
- ["Please continue with more detailed analysis πŸ“ˆ"],
249
  ],
250
  inputs=msg,
251
  )
 
2
  from huggingface_hub import InferenceClient
3
  import os
4
  import pandas as pd
5
+ import pdfplumber
6
  from typing import List, Tuple
7
 
8
  # LLM Models Definition
9
  LLM_MODELS = {
10
  "Cohere c4ai-crp-08-2024": "CohereForAI/c4ai-command-r-plus-08-2024", # Default
11
+ "Meta Llama3.3-70B": "meta-llama/Llama-3.3-70B-Instruct",
12
  "Mistral Nemo 2407": "mistralai/Mistral-Nemo-Instruct-2407",
13
  "Alibaba Qwen QwQ-32B": "Qwen/QwQ-32B-Preview"
14
  }
 
18
 
19
  def analyze_file_content(content, file_type):
20
  """Analyze file content and return structural summary"""
21
+ if file_type in ['parquet', 'csv', 'pdf']:
22
  try:
23
+ if file_type == 'pdf':
24
+ with pdfplumber.open(content) as pdf:
25
+ pages = pdf.pages
26
+ lines = []
27
+ for page in pages:
28
+ lines.extend(page.extract_text().split('\n'))
29
+ else:
30
+ lines = content.split('\n')
31
  header = lines[0]
32
+ columns = len(header.split('|')) - 1
33
  rows = len(lines) - 3
34
  return f"πŸ“Š Dataset Structure: {columns} columns, {rows} data samples"
35
  except:
 
55
  try:
56
  file_ext = os.path.splitext(file.name)[1].lower()
57
 
58
+ if file_ext in ['.parquet', '.pdf']:
59
+ if file_ext == '.parquet':
60
+ df = pd.read_parquet(file.name, engine='pyarrow')
61
+ else:
62
+ df = pd.read_csv(file.name, encoding='utf-8', engine='python') # Use 'python' engine to handle PDF files
63
  content = df.head(10).to_markdown(index=False)
64
+ return content, file_ext
65
  elif file_ext == '.csv':
66
+ df = pd.read_csv(file.name)
67
+ content = f"πŸ“Š Data Preview:\n{df.head(10).to_markdown(index=False)}\n\n"
68
+ content += f"\nπŸ“ˆ Data Information:\n"
69
+ content += f"- Total Rows: {len(df)}\n"
70
+ content += f"- Total Columns: {len(df.columns)}\n"
71
+ content += f"- Column List: {', '.join(df.columns)}\n"
72
+ content += f"\nπŸ“‹ Column Data Types:\n"
73
+ for col, dtype in df.dtypes.items():
74
+ content += f"- {col}: {dtype}\n"
75
+ null_counts = df.isnull().sum()
76
+ if null_counts.any():
77
+ content += f"\n⚠️ Missing Values:\n"
78
+ for col, null_count in null_counts[null_counts > 0].items():
79
+ content += f"- {col}: {null_count} missing\n"
80
+ return content, file_ext
 
 
 
 
 
 
81
  else:
82
  encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
83
  for encoding in encodings:
84
  try:
85
  with open(file.name, 'r', encoding=encoding) as f:
86
  content = f.read()
87
+ return content, file_ext
88
  except UnicodeDecodeError:
89
  continue
90
  raise UnicodeDecodeError(f"❌ Unable to read file with supported encodings ({', '.join(encodings)})")
 
101
 
102
  def chat(message, history, uploaded_file, model_name, system_message="", max_tokens=4000, temperature=0.7, top_p=0.9):
103
  system_prefix = """You are a file analysis expert. Analyze the uploaded file in depth from the following perspectives:
104
+ 1. πŸ“‹ Overall structure and composition
105
+ 2. πŸ“Š Key content and pattern analysis
106
+ 3. πŸ“ˆ Data characteristics and meaning
107
+ - For datasets: Column meanings, data types, value distributions
108
+ - For text/code: Structural features, main patterns
109
+ 4. πŸ’‘ Potential applications
110
+ 5. ✨ Data quality and areas for improvement
111
+ Provide detailed and structured analysis from an expert perspective, but explain in an easy-to-understand way. Format the analysis results in Markdown and include specific examples where possible."""
112
 
113
  if uploaded_file:
114
  content, file_type = read_uploaded_file(uploaded_file)
115
  if file_type == "error":
116
+ return "", [{"role": "user", "content": message}, {"role": "assistant", "content": content}]
 
117
 
118
  file_summary = analyze_file_content(content, file_type)
119
 
120
+ if file_type in ['parquet', 'csv', 'pdf']:
121
  system_message += f"\n\nFile Content:\n```markdown\n{content}\n```"
122
  else:
123
  system_message += f"\n\nFile Content:\n```\n{content}\n```"
124
 
125
  if message == "Starting file analysis...":
126
+ message = f"""[ꡬ쑰 뢄석] {file_summary}
127
+ μžμ„Ένžˆ λΆ„μ„ν•΄μ£Όμ„Έμš”:
128
+ 1. πŸ“‹ 전체 ꡬ쑰 및 ν˜•μ‹
129
+ 2. πŸ“Š μ£Όμš” λ‚΄μš© 및 κ΅¬μ„±μš”μ†Œ 뢄석
130
+ 3. πŸ“ˆ 데이터/λ‚΄μš©μ˜ νŠΉμ„± 및 νŒ¨ν„΄
131
+ 4. ⭐ ν’ˆμ§ˆ 및 μ™„μ „μ„± 평가
132
+ 5. πŸ’‘ μ œμ•ˆν•˜λŠ” κ°œμ„ μ 
133
+ 6. 🎯 μ‹€μš©μ μΈ ν™œμš© 및 ꢌμž₯사항"""
134
 
135
  messages = [{"role": "system", "content": f"{system_prefix} {system_message}"}]
136
+
137
+ # Convert history to message format
138
+ if history is not None:
139
+ for item in history:
140
+ if isinstance(item, dict):
141
+ messages.append(item)
142
+ elif isinstance(item, (list, tuple)) and len(item) == 2:
143
+ messages.append({"role": "user", "content": item[0]})
144
+ if item[1]:
145
+ messages.append({"role": "assistant", "content": item[1]})
146
+
147
  messages.append({"role": "user", "content": message})
148
 
149
  try:
150
  client = get_client(model_name)
151
  partial_message = ""
152
+ current_history = []
153
 
154
  for msg in client.chat_completion(
155
  messages,
 
161
  token = msg.choices[0].delta.get('content', None)
162
  if token:
163
  partial_message += token
164
+ current_history = [
165
+ {"role": "user", "content": message},
166
+ {"role": "assistant", "content": partial_message}
167
+ ]
168
+ yield "", current_history
169
 
170
  except Exception as e:
171
  error_msg = f"❌ Inference error: {str(e)}"
172
+ error_history = [
173
+ {"role": "user", "content": message},
174
+ {"role": "assistant", "content": error_msg}
175
+ ]
176
+ yield "", error_history
177
 
178
  css = """
179
  footer {visibility: hidden}
 
195
  with gr.Column(scale=2):
196
  chatbot = gr.Chatbot(
197
  height=600,
198
+ label="μ±„νŒ… μΈν„°νŽ˜μ΄μŠ€ πŸ’¬",
199
+ type="messages"
200
  )
201
  msg = gr.Textbox(
202
+ label="λ©”μ‹œμ§€λ₯Ό μž…λ ₯ν•˜μ„Έμš”",
203
  show_label=False,
204
+ placeholder="μ—…λ‘œλ“œλœ νŒŒμΌμ— λŒ€ν•΄ λ¬Όμ–΄λ³΄μ„Έμš”... πŸ’­",
205
  container=False
206
  )
207
+ send = gr.Button("전솑 πŸ“€")
 
 
208
 
209
  with gr.Column(scale=1):
210
  model_name = gr.Radio(
211
  choices=list(LLM_MODELS.keys()),
212
  value="Cohere c4ai-crp-08-2024",
213
+ label="LLM λͺ¨λΈ 선택 πŸ€–",
214
+ info="μ„ ν˜Έν•˜λŠ” AI λͺ¨λΈμ„ μ„ νƒν•˜μ„Έμš”"
215
  )
216
 
217
+ gr.Markdown("### 파일 μ—…λ‘œλ“œ πŸ“\n지원: ν…μŠ€νŠΈ, μ½”λ“œ, CSV, Parquet, PDF 파일")
218
  file_upload = gr.File(
219
+ label="파일 μ—…λ‘œλ“œ",
220
+ file_types=["text", ".csv", ".parquet", ".pdf"],
 
221
  type="filepath"
222
  )
223
 
224
+ with gr.Accordion("κ³ κΈ‰ μ„€μ • βš™οΈ", open=False):
225
+ system_message = gr.Textbox(label="μ‹œμŠ€ν…œ λ©”μ‹œμ§€ πŸ“", value="")
226
+ max_tokens = gr.Slider(minimum=1, maximum=8000, value=4000, label="μ΅œλŒ€ 토큰 πŸ“Š")
227
+ temperature = gr.Slider(minimum=0, maximum=1, value=0.7, label="μ˜¨λ„ 🌑️")
228
  top_p = gr.Slider(minimum=0, maximum=1, value=0.9, label="Top P πŸ“ˆ")
229
 
230
  # Event bindings
 
239
  [msg]
240
  )
241
 
242
+ send.click(
243
  chat,
244
  inputs=[msg, chatbot, file_upload, model_name, system_message, max_tokens, temperature, top_p],
245
  outputs=[msg, chatbot],
 
253
  # Auto-analysis on file upload
254
  file_upload.change(
255
  chat,
256
+ inputs=[gr.Textbox(value="파일 뢄석 μ‹œμž‘..."), chatbot, file_upload, model_name, system_message, max_tokens, temperature, top_p],
257
  outputs=[msg, chatbot],
258
  queue=True
259
  )
 
261
  # Example queries
262
  gr.Examples(
263
  examples=[
264
+ ["파일의 전체 ꡬ쑰와 νŠΉμ§•μ„ μžμ„Ένžˆ μ„€λͺ…ν•΄μ£Όμ„Έμš” πŸ“‹"],
265
+ ["파일의 μ£Όμš” νŒ¨ν„΄κ³Ό νŠΉμ„±μ„ λΆ„μ„ν•΄μ£Όμ„Έμš” πŸ“Š"],
266
+ ["파일의 ν’ˆμ§ˆκ³Ό κ°œμ„ μ μ„ ν‰κ°€ν•΄μ£Όμ„Έμš” πŸ’‘"],
267
+ ["이 νŒŒμΌμ„ μ–΄λ–»κ²Œ μ‹€μš©μ μœΌλ‘œ ν™œμš©ν•  수 μžˆμ„κΉŒμš”? 🎯"],
268
+ ["μ£Όμš” λ‚΄μš©μ„ μš”μ•½ν•˜κ³  핡심 톡찰λ ₯을 λ„μΆœν•΄μ£Όμ„Έμš” ✨"],
269
+ ["더 μžμ„Έν•œ 뢄석을 κ³„μ†ν•΄μ£Όμ„Έμš” πŸ“ˆ"],
270
  ],
271
  inputs=msg,
272
  )