File size: 13,877 Bytes
71a08c8
 
 
 
 
81f0d23
 
 
71a08c8
 
 
 
81f0d23
71a08c8
 
 
 
 
 
 
81f0d23
71a08c8
 
81f0d23
 
 
 
 
 
71a08c8
 
81f0d23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71a08c8
 
 
 
81f0d23
 
71a08c8
 
 
 
 
 
 
 
 
 
 
81f0d23
71a08c8
81f0d23
71a08c8
81f0d23
a61644e
81f0d23
88c17a0
81f0d23
 
 
 
 
 
 
 
 
df0d042
81f0d23
71a08c8
a61644e
81f0d23
71a08c8
 
81f0d23
71a08c8
 
 
 
 
81f0d23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71a08c8
81f0d23
71a08c8
 
 
81f0d23
 
 
 
 
 
 
 
 
 
71a08c8
81f0d23
 
 
 
71a08c8
81f0d23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
df0d042
81f0d23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
df0d042
81f0d23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71a08c8
 
 
 
 
81f0d23
71a08c8
81f0d23
71a08c8
 
 
81f0d23
 
71a08c8
 
 
 
 
 
 
 
 
81f0d23
71a08c8
81f0d23
 
 
 
 
71a08c8
 
 
 
 
 
 
 
81f0d23
 
71a08c8
 
 
 
 
a61644e
71a08c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
import gradio as gr
import os
import uuid
import threading
import pandas as pd
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Process-wide cache for the language model and its tokenizer. Both entries
# start out as None; a lock is stored next to them under "init_lock".
MODEL_CACHE = dict(
    model=None,
    tokenizer=None,
    init_lock=threading.Lock(),
)

# Make sure the directory that holds per-user data exists.
os.makedirs("user_data", exist_ok=True)

def initialize_model_once():
    """Load the Phi-4-mini model and tokenizer on first use and return them.

    Loading happens while holding ``MODEL_CACHE["init_lock"]`` and only when
    no model is cached yet; otherwise the cached objects are returned as-is.

    Returns:
        A ``(model, tokenizer)`` tuple read from ``MODEL_CACHE``.
    """
    cache = MODEL_CACHE
    checkpoint = "microsoft/Phi-4-mini-instruct"

    with cache["init_lock"]:
        needs_loading = cache["model"] is None
        if needs_loading:
            # Tokenizer first, then the model, both from the same checkpoint.
            cache["tokenizer"] = AutoTokenizer.from_pretrained(checkpoint)
            cache["model"] = AutoModelForCausalLM.from_pretrained(
                checkpoint,
                torch_dtype=torch.float16,
                device_map="auto",
            )

    return cache["model"], cache["tokenizer"]

def generate_pandas_code(prompt, max_new_tokens=512):
    """Generate Python (pandas) code for *prompt* with the Phi-4-mini model.

    Args:
        prompt: Full text prompt handed to the tokenizer and model.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The generated code as a stripped string. Markdown code fences are
        removed when they can be recognised; otherwise the raw completion is
        returned.
    """
    import re

    model, tokenizer = initialize_model_once()

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.2,
            top_p=0.9,
        )

    # For a causal LM, generate() returns the prompt tokens followed by the
    # new tokens. Slice by token count instead of by the length of the
    # re-decoded prompt: decoding does not always reproduce the prompt text
    # character for character, which would shift a string-based cut.
    prompt_token_count = inputs.input_ids.shape[-1]
    generated_text = tokenizer.decode(
        outputs[0][prompt_token_count:], skip_special_tokens=True
    )

    # When the prompt itself ends with an opening fence, the completion is
    # the code followed by the closing fence (and possibly an explanation).
    # Keep only what precedes that closing fence.
    if prompt.rstrip().endswith("```python"):
        continuation = generated_text.split("```", 1)[0].strip()
        if continuation:
            return continuation

    # Otherwise look for a complete ```python ... ``` block in the completion.
    code_match = re.search(r'```python\s*(.*?)\s*```', generated_text, re.DOTALL)
    if code_match:
        return code_match.group(1).strip()

    # Fallback: hand back the raw completion.
    return generated_text.strip()

class ChatBot:
    """Per-session helper that loads one CSV file and answers questions on it.

    Questions about basic metadata are answered directly; everything else is
    turned into pandas code by ``generate_pandas_code`` and run against the
    loaded DataFrame.
    """

    def __init__(self, session_id):
        """Initialise empty session state and create ``user_data/<session_id>``."""
        self.session_id = session_id
        # Dict with "filename", "rows", "columns" and "column_names" once a
        # CSV has been loaded; None before that.
        self.csv_info = None
        # The loaded pandas DataFrame; None until process_file() succeeds.
        self.df = None
        # List of (speaker_or_question, text) tuples.
        self.chat_history = []
        self.user_dir = f"user_data/{session_id}"
        os.makedirs(self.user_dir, exist_ok=True)

    def process_file(self, file):
        """Load an uploaded CSV into the session.

        Args:
            file: The upload as provided by the UI: an object with a ``name``
                attribute holding the path, or anything whose ``str()`` is
                the path. ``None`` means nothing was uploaded.

        Returns:
            A user-facing status message (success or error text).
        """
        if file is None:
            return "Mohon upload file CSV terlebih dahulu."

        try:
            # Handle file from Gradio
            file_path = file.name if hasattr(file, 'name') else str(file)
            file_name = os.path.basename(file_path)

            # Read and persist into a local first, so that a failure here
            # cannot leave self.df and self.csv_info out of sync.
            try:
                df = pd.read_csv(file_path)
                user_file_path = f"{self.user_dir}/uploaded.csv"
                df.to_csv(user_file_path, index=False)
            except Exception as e:
                return f"Error membaca CSV: {str(e)}"

            self.df = df
            self.csv_info = {
                "filename": file_name,
                "rows": df.shape[0],
                "columns": df.shape[1],
                "column_names": df.columns.tolist(),
            }
            print(f"CSV verified: {self.df.shape[0]} rows, {len(self.df.columns)} columns")

            # Add file info to chat history
            file_info = f"CSV berhasil dimuat: {file_name} dengan {self.df.shape[0]} baris dan {len(self.df.columns)} kolom. Kolom: {', '.join(self.df.columns.tolist())}"
            self.chat_history.append(("System", file_info))

            return f"File CSV '{file_name}' berhasil diproses! Anda dapat mulai mengajukan pertanyaan tentang data."
        except Exception as e:
            import traceback
            print(traceback.format_exc())
            return f"Error pemrosesan file: {str(e)}"

    def execute_query(self, code):
        """Run generated pandas code against the loaded DataFrame.

        The code sees ``df``, ``pd`` and ``np``. The returned value is, in
        order of preference:

        1. the variable ``result`` if the code defines it;
        2. the value of a trailing bare expression, if it is not ``None``;
        3. the last non-``None`` variable the code created;
        4. ``None`` if the code ended in an expression that produced nothing,
           otherwise the DataFrame itself.

        SECURITY NOTE: this uses ``exec``/``eval`` on model-generated code
        with full builtins, no sandbox and no timeout. It must not be treated
        as safe for untrusted users.

        Args:
            code: Python source to execute.

        Returns:
            The value selected by the rules above.

        Raises:
            Exception: If the code fails to parse or raises while running;
                the original exception is attached as ``__cause__``.
        """
        import ast

        # A single dict is used as both globals and locals. With separate
        # dicts, functions, lambdas and other nested scopes inside the
        # generated code cannot see ``df`` (they resolve names via globals).
        namespace = {"df": self.df, "pd": pd, "np": np}
        trailing_expr = None
        trailing_value = None

        try:
            tree = ast.parse(code)
            # Evaluate a final bare expression separately so its value can be
            # returned, instead of rewriting the source text.
            if tree.body and isinstance(tree.body[-1], ast.Expr):
                trailing_expr = tree.body.pop().value
            exec(compile(tree, "<generated>", "exec"), namespace)
            if trailing_expr is not None:
                trailing_value = eval(
                    compile(ast.Expression(body=trailing_expr), "<generated>", "eval"),
                    namespace,
                )
        except Exception as e:
            raise Exception(f"Gagal menjalankan kode: {str(e)}") from e

        if "result" in namespace:
            return namespace["result"]
        if trailing_value is not None:
            return trailing_value

        # No explicit result: fall back to the last variable the code created.
        last_var = None
        for var_name, var_value in namespace.items():
            if var_name not in ("df", "pd", "np", "__builtins__"):
                last_var = var_value
        if last_var is not None:
            return last_var

        # A trailing expression that yielded None (e.g. print(...)) has no
        # usable value; otherwise default to the DataFrame.
        return None if trailing_expr is not None else self.df

    @staticmethod
    def _format_result(result):
        """Render an analysis result as text, truncating long tabular output."""
        if isinstance(result, pd.DataFrame):
            if len(result) > 5:
                return result.head(5).to_string() + f"\n\n[Total {len(result)} baris]"
            return result.to_string()

        if isinstance(result, np.ndarray) and result.ndim == 0:
            # len() is undefined for 0-d arrays; treat them like scalars.
            return str(result)

        if isinstance(result, (pd.Series, np.ndarray)):
            if len(result) > 10:
                return str(result[:10]) + f"\n\n[Total {len(result)} item]"
            return str(result)

        if hasattr(result, "__len__") and not isinstance(result, (str, int, float)):
            text = str(result)
            if len(result) > 0:
                text += f"\n\n[Total {len(result)} item]"
            return text

        return str(result)

    def chat(self, message, history):
        """Answer one user question about the loaded CSV.

        Args:
            message: The user's question.
            history: Earlier chat turns; accepted for the caller's
                convenience and not used here.

        Returns:
            A user-facing response string: the answer together with the code
            that was run, or an error / guidance message.
        """
        if self.df is None:
            return "Mohon upload file CSV terlebih dahulu."

        try:
            # Handle common metadata questions directly to save resources
            message_lower = message.lower()
            if "nama file" in message_lower:
                return f"Nama file CSV adalah: {self.csv_info['filename']}"
            elif "nama kolom" in message_lower:
                return f"Kolom dalam CSV: {', '.join(self.csv_info['column_names'])}"
            elif "jumlah baris" in message_lower or "berapa baris" in message_lower:
                return f"Jumlah baris dalam CSV: {self.csv_info['rows']}"
            elif "jumlah kolom" in message_lower or "berapa kolom" in message_lower:
                return f"Jumlah kolom dalam CSV: {self.csv_info['columns']}"

            # Get sample data for context
            sample_df = self.df.head(5)
            sample_str = sample_df.to_string()
            data_types = {col: str(dtype) for col, dtype in self.df.dtypes.items()}

            # Create prompt for LLM
            prompt = f"""
            You are a data analyst that translates natural language questions into Python pandas code.

            DataFrame information:
            - Column names: {', '.join(self.csv_info['column_names'])}
            - Data types: {data_types}
            - Number of rows: {self.csv_info['rows']}
            - Sample data:
            {sample_str}

            User question: {message}

            Write a short Python code using pandas to answer the user's question. 
            The code must use the 'df' variable as the DataFrame name.
            The code should assign the final result to a variable named 'result'.
            Only return the Python code without any explanation.

            ```python
            """

            # Generate code with Phi-4. Code that does not assign ``result``
            # is handled by execute_query(), which returns the value of a
            # trailing expression, so no source rewriting is done here.
            try:
                code = generate_pandas_code(prompt)
            except Exception as e:
                print(f"Error generating code: {str(e)}")
                # Fallback for basic questions
                if "rata-rata" in message_lower or "mean" in message_lower:
                    code = "result = df.describe()"
                elif "jumlah" in message_lower or "count" in message_lower:
                    code = "result = df.count()"
                else:
                    return f"Maaf, saya tidak dapat menghasilkan kode untuk pertanyaan ini. Error: {str(e)}"

            # Execute the code and get result
            try:
                print(f"Executing code: {code}")
                result = self.execute_query(code)

                # Check if result is relevant to the question
                if result is None or (isinstance(result, pd.DataFrame) and result.empty):
                    return "Maaf, kita tidak bisa mendapatkan informasi terkait pertanyaan anda di dalam file CSV anda."

                result_str = self._format_result(result)

                # Format response
                response = f"Hasil analisis:\n\n{result_str}\n\nKode yang dijalankan:\n```python\n{code}\n```"

                self.chat_history.append((message, response))
                return response

            except Exception as e:
                return f"Error saat menganalisis data: {str(e)}\n\nKode yang dicoba:\n```python\n{code}\n```"

        except Exception as e:
            import traceback
            print(traceback.format_exc())
            return f"Error: {str(e)}"

# UI code (same as before)
def create_gradio_interface():
    """Build the Gradio Blocks UI for the CSV analyzer and return it.

    The layout has an upload column (file picker, process button, example
    questions) and a chat column (history, question box, send/clear buttons).
    Event handlers are defined first and then wired to the components.

    Returns:
        The ``gr.Blocks`` object; the caller is responsible for launching it.
    """
    with gr.Blocks(title="CSV Data Analyzer") as interface:
        # Both State components are given callables as their initial values.
        session_id = gr.State(lambda: str(uuid.uuid4()))
        chatbot_state = gr.State(lambda: None)

        gr.HTML("<h1 style='text-align: center;'>CSV Data Analyzer</h1>")
        gr.HTML("<h3 style='text-align: center;'>Ajukan pertanyaan tentang data CSV Anda</h3>")

        with gr.Row():
            # Left column: upload controls and example questions.
            with gr.Column(scale=1):
                csv_upload = gr.File(label="Upload CSV Anda", file_types=[".csv"])
                process_btn = gr.Button("Proses CSV")

                with gr.Accordion("Contoh Pertanyaan", open=False):
                    gr.Markdown("""
                    - "Berapa jumlah data yang memiliki nilai Glucose di atas 150?"
                    - "Hitung nilai rata-rata setiap kolom numerik"
                    - "Berapa banyak data untuk setiap kelompok dalam kolom Outcome?"
                    - "Berapa jumlah baris dalam dataset ini?"
                    - "Berapa jumlah kolom dalam dataset ini?"
                    """)

            # Right column: chat history and question entry.
            with gr.Column(scale=2):
                chat_view = gr.Chatbot(label="Riwayat Chat", height=400)
                question_box = gr.Textbox(
                    label="Ketik pertanyaan Anda",
                    placeholder="Contoh: Berapa jumlah data yang memiliki nilai Glucose di atas 150?",
                    lines=2,
                )
                send_btn = gr.Button("Kirim")
                clear_btn = gr.Button("Bersihkan Chat")

        # ---- Handlers -----------------------------------------------------

        def handle_process_file(file, sess_id):
            # A fresh ChatBot per processed file; the chat view is replaced
            # by a single bot message carrying the processing status.
            chatbot = ChatBot(sess_id)
            status = chatbot.process_file(file)
            return chatbot, [(None, status)]

        def user_message_submitted(message, history, chatbot, sess_id):
            # Append the question with an empty answer slot and clear the box.
            pending = history + [(message, None)]
            return pending, "", chatbot, sess_id

        def bot_response(history, chatbot, sess_id):
            # Fill the answer slot of the most recent turn.
            if chatbot is not None:
                user_message = history[-1][0]
                reply = chatbot.chat(user_message, history[:-1])
            else:
                chatbot = ChatBot(sess_id)
                user_message = history[-1][0]
                reply = "Mohon upload file CSV terlebih dahulu."
            history[-1] = (user_message, reply)
            return chatbot, history

        def handle_clear_chat(chatbot):
            # Reset the stored history (when a bot exists) and empty the view.
            if chatbot is not None:
                chatbot.chat_history = []
            return chatbot, []

        # ---- Wiring -------------------------------------------------------

        process_btn.click(
            fn=handle_process_file,
            inputs=[csv_upload, session_id],
            outputs=[chatbot_state, chat_view],
        )

        # The send button and pressing Enter in the text box run the same
        # two-step chain: record the question, then produce the answer.
        for register in (send_btn.click, question_box.submit):
            register(
                fn=user_message_submitted,
                inputs=[question_box, chat_view, chatbot_state, session_id],
                outputs=[chat_view, question_box, chatbot_state, session_id],
            ).then(
                fn=bot_response,
                inputs=[chat_view, chatbot_state, session_id],
                outputs=[chatbot_state, chat_view],
            )

        clear_btn.click(
            fn=handle_clear_chat,
            inputs=[chatbot_state],
            outputs=[chatbot_state, chat_view],
        )

    return interface

# Launch the interface when this file is run as a script (not on import).
if __name__ == "__main__":
    demo = create_gradio_interface()
    # NOTE(review): share=True is passed to launch(); per Gradio's
    # documentation this requests a publicly reachable link. Confirm that is
    # intended, since ChatBot.execute_query exec()s model-generated code.
    demo.launch(share=True)