import csv
import io
import json
import os
import traceback
from typing import Dict, Iterator, List, Optional, Tuple

import gradio as gr
import pandas as pd
from huggingface_hub import InferenceClient


# Model served through the Hugging Face Inference API for every LLM call in this file.
HF_MODEL_ID = "CohereForAI/c4ai-command-r-plus-08-2024"

# Shared inference client. The token is read from the HF_TOKEN environment
# variable; os.getenv returns None when the variable is not set.
hf_client = InferenceClient(HF_MODEL_ID, token=os.getenv("HF_TOKEN"))


def load_code(filename: str) -> str:
    """Return the UTF-8 text content of *filename*.

    This function never raises: failures are reported through the returned
    string so the result can be shown directly in the UI.

    Args:
        filename: Path of the text file to read.

    Returns:
        The file content, or a Korean error message when the file is missing
        or cannot be read.
    """
    try:
        with open(filename, "r", encoding="utf-8") as file:
            return file.read()
    except FileNotFoundError:
        return f"{filename} 파일을 찾을 수 없습니다."
    except Exception as e:
        # Best-effort boundary: callers expect a string, not an exception.
        return f"파일을 읽는 중 오류가 발생했습니다: {str(e)}"


def load_parquet(filename: str, max_rows: int = 10) -> str:
    """Render the first rows of a Parquet file as a Markdown table.

    This function never raises: failures are reported through the returned
    string so the result can be shown directly in the UI.

    Args:
        filename: Path of the Parquet file to preview.
        max_rows: Number of leading rows to include in the preview
            (defaults to 10, the size previously hard-coded here).

    Returns:
        A Markdown table without the index column, or a Korean error message
        when the file is missing or cannot be read.
    """
    try:
        df = pd.read_parquet(filename, engine="pyarrow")
        return df.head(max_rows).to_markdown(index=False)
    except FileNotFoundError:
        return f"{filename} 파일을 찾을 수 없습니다."
    except Exception as e:
        # Also covers a missing parquet engine or Markdown backend.
        return f"파일을 읽는 중 오류가 발생했습니다: {str(e)}"


def _build_system_prefix(system_message: str, parquet_data: Optional[str]) -> str:
    """Return the system part of the prompt, with a data summary when data is given."""
    if not parquet_data:
        return system_message or "너는 AI 조언자 역할이다."

    # NOTE: when uploaded data is present the fixed data-analysis instruction
    # is used and `system_message` is intentionally not consulted.
    system_prefix = (
        "반드시 한글로 답변할 것. 너는 업로드된 데이터를 기반으로 질문에 답변하는 역할을 한다. "
        "데이터를 분석하여 사용자에게 도움이 되는 정보를 제공하라. "
        "데이터를 활용하여 상세하고 정확한 답변을 제공하되, 민감한 정보나 개인 정보를 노출하지 마라."
    )
    try:
        df = pd.read_json(io.StringIO(parquet_data))
        data_summary = df.describe(include="all").to_string()
        system_prefix += f"\n\n업로드된 데이터의 요약 정보:\n{data_summary}"
    except Exception as e:
        # Keep answering even if the data cannot be parsed; tell the model so.
        print(f"데이터 로드 중 오류 발생: {str(e)}\n{traceback.format_exc()}")
        system_prefix += "\n\n데이터를 로드하는 중 오류가 발생했습니다."
    return system_prefix


def _build_prompt(system_prefix: str, history: List[Dict[str, str]], message: str) -> str:
    """Flatten the system prefix, earlier turns and the new message into one prompt."""
    parts = [system_prefix, "\n\n"]
    for chat in history:
        speaker = "사용자" if chat["role"] == "user" else "AI"
        parts.append(f"{speaker}: {chat['content']}\n")
    parts.append(f"사용자: {message}\nAI:")
    return "".join(parts)


def respond(
    message: str,
    history: List[Dict[str, str]],
    system_message: str = "",
    max_tokens: int = 4000,
    temperature: float = 0.5,
    top_p: float = 0.9,
    parquet_data: Optional[str] = None,
) -> Iterator[str]:
    """Stream the model's answer to *message* as a growing string.

    This is a generator: each yielded value is the full response accumulated
    so far, not just the newest chunk.

    Args:
        message: The new user message. It is appended to the prompt here, so
            *history* should contain only the earlier turns.
        history: Earlier turns as dicts with ``role`` and ``content`` keys;
            any role other than ``"user"`` is rendered as the AI speaker.
            ``None`` is treated as an empty history.
        system_message: System prompt used when no data is supplied.
        max_tokens: Passed to the client as ``max_new_tokens``.
        temperature: Sampling temperature passed to the client.
        top_p: Nucleus sampling value passed to the client.
        parquet_data: JSON text of the uploaded dataset, or ``None``/empty.

    Yields:
        The accumulated response text. If generation fails, a single error
        message containing the traceback is yielded instead of raising.
    """
    system_prefix = _build_system_prefix(system_message, parquet_data)
    prompt = _build_prompt(system_prefix, history or [], message)

    try:
        response = ""
        stream = hf_client.text_generation(
            prompt=prompt,
            max_new_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
        )
        for msg in stream:
            if msg:
                response += msg
                yield response
    except Exception as e:
        error_message = f"추론 중 오류가 발생했습니다: {str(e)}\n{traceback.format_exc()}"
        print(error_message)
        yield error_message


def upload_csv(file_path: str) -> Tuple[str, str]:
    """Validate a CSV file and convert it to a snappy-compressed Parquet file.

    The CSV must contain the columns ``id``, ``text``, ``label`` and
    ``metadata``. Duplicate rows are dropped and missing values replaced by
    empty strings before the columns are cast to their target dtypes. The
    Parquet file is written to the current working directory, named after the
    CSV's base name.

    Args:
        file_path: Path of the comma-separated input file.

    Returns:
        ``(status_message, parquet_filename)``. On any failure the filename
        is an empty string and the message describes the problem.
    """
    try:
        df = pd.read_csv(file_path, sep=",")

        required_columns = {"id", "text", "label", "metadata"}
        missing_columns = required_columns - set(df.columns)
        if missing_columns:
            # Sorted so the message is stable; set iteration order is not.
            missing_list = ", ".join(sorted(missing_columns))
            return f"CSV 파일에 다음 필수 컬럼이 누락되었습니다: {missing_list}", ""

        df.drop_duplicates(inplace=True)
        df.fillna("", inplace=True)

        df = df.astype(
            {"id": "int32", "text": "string", "label": "category", "metadata": "string"}
        )

        parquet_filename = os.path.splitext(os.path.basename(file_path))[0] + ".parquet"
        df.to_parquet(parquet_filename, engine="pyarrow", compression="snappy")
        return f"{parquet_filename} 파일이 성공적으로 업로드되고 변환되었습니다.", parquet_filename
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"CSV 파일 업로드 및 변환 중 오류가 발생했습니다: {str(e)}", ""


def upload_parquet(file_path: str) -> Tuple[str, str, str]:
    """Load a Parquet file and return a preview plus its full JSON content.

    Args:
        file_path: Path of the Parquet file to load.

    Returns:
        ``(status_message, markdown_preview, records_json)`` where the
        preview is a Markdown table of the first 10 rows and the JSON is the
        whole frame serialised with ``orient='records'`` (non-ASCII text kept
        as is). On failure the last two items are empty strings.
    """
    try:
        df = pd.read_parquet(file_path, engine="pyarrow")
        parquet_content = df.head(10).to_markdown(index=False)
        parquet_json = df.to_json(orient="records", force_ascii=False)
        return "Parquet 파일이 성공적으로 업로드되었습니다.", parquet_content, parquet_json
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Parquet 파일 업로드 중 오류가 발생했습니다: {str(e)}", "", ""


def text_to_parquet(text: str) -> Tuple[str, str, str]:
    """Parse header-less CSV text and save it as ``text_to_parquet.parquet``.

    Each input row is read as ``id,text,label,metadata``; fields may be
    double-quoted and use a backslash as escape character. The output file is
    written to the current working directory and overwritten on every call.

    Args:
        text: CSV rows without a header line.

    Returns:
        ``(status_message, markdown_preview, parquet_filename)``. On failure
        the message contains the traceback and the other two items are empty
        strings.
    """
    try:
        df = pd.read_csv(
            io.StringIO(text),
            sep=",",
            dtype=str,
            quoting=csv.QUOTE_ALL,
            escapechar="\\",
            engine="python",
            header=None,
            names=["id", "text", "label", "metadata"],
        )

        # Everything is read as str first; a non-numeric id fails here and is
        # reported through the except branch below.
        df = df.astype(
            {"id": "int32", "text": "string", "label": "string", "metadata": "string"}
        )

        parquet_filename = "text_to_parquet.parquet"
        df.to_parquet(parquet_filename, engine="pyarrow", compression="snappy")

        parquet_content = load_parquet(parquet_filename)
        return f"{parquet_filename} 파일이 성공적으로 변환되었습니다.", parquet_content, parquet_filename
    except Exception as e:
        error_message = f"텍스트 변환 중 오류가 발생했습니다: {str(e)}\n{traceback.format_exc()}"
        print(error_message)
        return error_message, "", ""


def preprocess_text_with_llm(input_text: str) -> str:
    """Ask the LLM to rewrite free text as ``id,text,label,metadata`` CSV rows.

    The instruction prompt (in Korean, with one worked example) is sent
    together with *input_text*; the streamed chunks are concatenated and
    stripped of surrounding whitespace.

    Args:
        input_text: Raw text to convert into dataset rows.

    Returns:
        The model's output, or an error message containing the traceback when
        the request fails. The output is returned as produced by the model
        and is not validated here.
    """
    system_prompt = """당신은 데이터 전처리 전문가입니다. 입력된 긴 텍스트를 아래와 같은 데이터셋 형식으로 전처리하세요:

- **데이터셋 형식:** `id,text,label,metadata`
- **각 행은 새로운 줄로 구분되고**, 필드는 쉼표로 구분됩니다.
- **텍스트나 다른 필드 내에 쉼표가 있을 경우**, 해당 필드를 큰따옴표(")로 감싸세요.
- **필드 내에 큰따옴표가 있을 경우**, 백슬래시(\\)로 이스케이프 처리하세요. 예: \\"
- 텍스트를 **의미 단위로 분할**하고, 각 문장에 대해 **1부터 시작하는 연속된 id**를 부여하세요.
- 각 문장에 대해 **적절한 label(카테고리)**을 지정하세요. 예: "기술", "사회", "경제"
- **metadata**에는 출처나 날짜 등의 추가 정보를 포함하세요.
- 최종 결과는 **각 행이 `id,text,label,metadata` 형식의 CSV**가 되도록 하세요.

**예시:**

입력 텍스트:

"오늘은 날씨가 좋다. 내일은 비가 올 예정이다."

전처리된 데이터셋:
1,"오늘은 날씨가 좋다.","날씨","2023-10-05"
2,"내일은 비가 올 예정이다.","날씨","2023-10-05"

**이제 아래의 입력 텍스트를 처리하세요:**

""" + input_text

    try:
        stream = hf_client.text_generation(
            prompt=system_prompt,
            max_new_tokens=2000,
            temperature=0.5,
            top_p=0.9,
            stream=True,
        )
        # Empty chunks are skipped; the rest are joined in arrival order.
        response = "".join(msg for msg in stream if msg)

        print("LLM 응답:\n", response)
        return response.strip()
    except Exception as e:
        error_message = f"전처리 중 오류가 발생했습니다: {str(e)}\n{traceback.format_exc()}"
        print(error_message)
        return error_message


# Custom stylesheet passed to gr.Blocks: hides the footer, fixes the chat
# panel height, and styles the inputs, the upload area and the intro text.
css = """
footer {
    visibility: hidden;
}
#chatbot-container, #chatbot-data-upload {
    height: 700px;
    overflow-y: scroll;
}
#chatbot-container .message, #chatbot-data-upload .message {
    font-size: 14px;
}
/* 입력창 배경색 및 글자색 변경 */
textarea, input[type="text"] {
    background-color: #ffffff; /* 흰색 배경 */
    color: #000000; /* 검정색 글자 */
}
/* 파일 업로드 영역 높이 조절 */
#parquet-upload-area {
    max-height: 150px;
    overflow-y: auto;
}
/* 초기 설명 글씨 크기 조절 */
#initial-description {
    font-size: 14px;
}
"""


# Gradio UI: four tabs (chat over an uploaded dataset, CSV -> Parquet,
# text -> Parquet, LLM-based text preprocessing). Event handlers are defined
# next to the components they serve.
with gr.Blocks(css=css) as demo:
    gr.Markdown(
        "# My RAG: LLM이 나만의 데이터로 학습한 콘텐츠 생성/답변",
        elem_id="initial-description",
    )
    gr.Markdown(
        "### 1) 나만의 데이터를 입력 또는 CSV 업로드로 Parquet 데이터셋 자동 변환 "
        "2) Parquet 데이터셋을 업로드하면, LLM이 맞춤 학습 데이터로 활용하여 응답\n"
        "### Tip) '예제'를 통해 다양한 활용 방법을 체험하고 응용해 보세요, "
        "데이터셋 업로드시 미리보기는 10건만 출력",
        elem_id="initial-description",
    )

    # ---- Tab 1: chat with the LLM, optionally grounded on an uploaded dataset ----
    with gr.Tab("My 데이터셋+LLM"):
        gr.Markdown("### LLM과 대화하기")
        chatbot_data_upload = gr.Chatbot(
            label="챗봇", type="messages", elem_id="chatbot-data-upload"
        )
        msg_data_upload = gr.Textbox(
            label="메시지 입력", placeholder="여기에 메시지를 입력하세요..."
        )
        send_data_upload = gr.Button("전송")

        with gr.Accordion("시스템 프롬프트 및 옵션 설정", open=False):
            system_message = gr.Textbox(
                label="System Message", value="너는 AI 조언자 역할이다."
            )
            max_tokens = gr.Slider(minimum=1, maximum=8000, value=1000, label="Max Tokens")
            temperature = gr.Slider(minimum=0, maximum=1, value=0.7, label="Temperature")
            top_p = gr.Slider(minimum=0, maximum=1, value=0.9, label="Top P")

        # Holds the JSON text of the uploaded Parquet data between events.
        parquet_data_state = gr.State()

        def handle_message_data_upload(
            message: str,
            history: List[Dict[str, str]],
            system_message: str,
            max_tokens: int,
            temperature: float,
            top_p: float,
            parquet_data: str,
        ):
            """Stream the assistant reply into the chat and clear the input box."""
            prior_turns = list(history or [])
            conversation = prior_turns + [{"role": "user", "content": message}]
            try:
                # respond() appends `message` to the prompt itself, so it must
                # receive only the earlier turns; passing the list that already
                # contains the new message would put it in the prompt twice.
                response_gen = respond(
                    message,
                    prior_turns,
                    system_message,
                    int(max_tokens),  # slider values may arrive as float
                    temperature,
                    top_p,
                    parquet_data,
                )
                for partial_response in response_gen:
                    yield conversation + [
                        {"role": "assistant", "content": partial_response}
                    ], ""
            except Exception as e:
                response = f"추론 중 오류가 발생했습니다: {str(e)}"
                yield conversation + [{"role": "assistant", "content": response}], ""

        send_data_upload.click(
            handle_message_data_upload,
            inputs=[
                msg_data_upload,
                chatbot_data_upload,
                system_message,
                max_tokens,
                temperature,
                top_p,
                parquet_data_state,
            ],
            outputs=[chatbot_data_upload, msg_data_upload],
            queue=True,
        )

        with gr.Accordion("예제", open=False):
            gr.Examples(
                examples=[
                    ["업로드된 데이터셋에 대해 요약 설명하라."],
                    ["업로드된 데이터셋 파일을 학습 데이터로 활용하여, 본 서비스를 SEO 최적화하여 블로그 포스트(개요, 배경 및 필요성, 기존 유사 제품/서비스와 비교하여 특장점, 활용처, 가치, 기대효과, 결론을 포함)로 4000 토큰 이상 작성하라"],
                    ["업로드된 데이터셋 파일을 학습 데이터로 활용하여, 사용 방법과 차별점, 특징, 강점을 중심으로 4000 토큰 이상 유튜브 영상 스크립트 형태로 작성하라"],
                    ["업로드된 데이터셋 파일을 학습 데이터로 활용하여, 제품 상세 페이지 형식의 내용을 4000 토큰 이상 자세히 설명하라"],
                    ["업로드된 데이터셋 파일을 학습 데이터로 활용하여, FAQ 20건을 상세하게 작성하라. 4000토큰 이상 사용하라."],
                    ["업로드된 데이터셋 파일을 학습 데이터로 활용하여, 특허 출원에 활용할 기술 및 비즈니스 모델 측면을 포함하여 특허 출원서 구성에 맞게 혁신적인 창의 발명 내용을 중심으로 4000 토큰 이상 작성하라."],
                ],
                inputs=msg_data_upload,
                label="예제 선택",
            )

        gr.Markdown("### Parquet 파일 업로드")
        with gr.Row():
            with gr.Column():
                parquet_upload = gr.File(
                    label="Parquet 파일 업로드",
                    type="filepath",
                    elem_id="parquet-upload-area",
                )
                parquet_upload_button = gr.Button("업로드")
                parquet_upload_status = gr.Textbox(label="업로드 상태", interactive=False)
                parquet_preview_chat = gr.Markdown(label="Parquet 파일 미리보기")

                def handle_parquet_upload(file_path: str):
                    """Load the uploaded Parquet file; returns status, preview, JSON state."""
                    message, parquet_content, parquet_json = upload_parquet(file_path)
                    if not parquet_json:
                        return message, "", ""
                    return message, parquet_content, parquet_json

                parquet_upload_button.click(
                    handle_parquet_upload,
                    inputs=parquet_upload,
                    outputs=[
                        parquet_upload_status,
                        parquet_preview_chat,
                        parquet_data_state,
                    ],
                )

    # ---- Tab 2: CSV file -> Parquet file ----
    with gr.Tab("CSV to My 데이터셋"):
        gr.Markdown("### CSV 파일 업로드 및 Parquet 변환")
        with gr.Row():
            with gr.Column():
                csv_file = gr.File(label="CSV 파일 업로드", type="filepath")
                upload_button = gr.Button("업로드 및 변환")
                upload_status = gr.Textbox(label="업로드 상태", interactive=False)
                parquet_preview = gr.Markdown(label="Parquet 파일 미리보기")
                download_button = gr.File(label="Parquet 파일 다운로드", interactive=False)

                def handle_csv_upload(file_path: str):
                    """Convert the CSV; returns status, preview and the file to download."""
                    message, parquet_filename = upload_csv(file_path)
                    if not parquet_filename:
                        # None leaves the download component without a file.
                        return message, "", None
                    parquet_content = load_parquet(parquet_filename)
                    return message, parquet_content, parquet_filename

                upload_button.click(
                    handle_csv_upload,
                    inputs=csv_file,
                    outputs=[upload_status, parquet_preview, download_button],
                )

    # ---- Tab 3: pasted CSV text -> Parquet file ----
    with gr.Tab("Text to My 데이터셋"):
        gr.Markdown("### 텍스트를 입력하면 CSV로 변환 후 Parquet으로 자동 전환됩니다.")
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="텍스트 입력 (각 행은 `id,text,label,metadata` 형식으로 입력)",
                    lines=10,
                    placeholder='예: 1,"이순신","장군","거북선"\n2,"원균","장군","모함"\n3,"선조","왕","시기"\n4,"도요토미 히데요시","왕","침략"',
                )
                convert_button = gr.Button("변환 및 다운로드")
                convert_status = gr.Textbox(label="변환 상태", interactive=False)
                parquet_preview_convert = gr.Markdown(label="Parquet 파일 미리보기")
                download_parquet_convert = gr.File(
                    label="Parquet 파일 다운로드", interactive=False
                )

                def handle_text_to_parquet(text: str):
                    """Convert pasted text; returns status, preview and the file to download."""
                    message, parquet_content, parquet_filename = text_to_parquet(text)
                    if not parquet_filename:
                        return message, "", None
                    return message, parquet_content, parquet_filename

                convert_button.click(
                    handle_text_to_parquet,
                    inputs=text_input,
                    outputs=[
                        convert_status,
                        parquet_preview_convert,
                        download_parquet_convert,
                    ],
                )

    # ---- Tab 4: free text -> dataset-formatted rows via the LLM ----
    with gr.Tab("Text Preprocessing with LLM"):
        gr.Markdown("### 텍스트를 입력하면 LLM이 데이터셋 형식에 맞게 전처리하여 출력합니다.")
        with gr.Row():
            with gr.Column():
                raw_text_input = gr.Textbox(
                    label="텍스트 입력",
                    lines=15,
                    placeholder="여기에 전처리할 텍스트를 입력하세요...",
                )
                preprocess_button = gr.Button("전처리 실행")
                preprocess_status = gr.Textbox(label="전처리 상태", interactive=False)
                processed_text_output = gr.Textbox(
                    label="전처리된 데이터셋 출력",
                    lines=15,
                    interactive=False,
                )

                def handle_text_preprocessing(input_text: str):
                    """Run LLM preprocessing, reporting progress through the status box."""
                    # Assigning to `component.value` inside a handler does not
                    # update the page; status changes must be emitted as outputs.
                    # The first yield shows the in-progress status (and clears any
                    # previous output), the second delivers the result.
                    yield "전처리 중입니다. 잠시만 기다려주세요...", ""
                    processed_text = preprocess_text_with_llm(input_text)
                    yield "전처리가 완료되었습니다.", processed_text

                preprocess_button.click(
                    handle_text_preprocessing,
                    inputs=raw_text_input,
                    outputs=[preprocess_status, processed_text_output],
                )

    # NOTE(review): this text looks like an e-mail obfuscation placeholder left
    # by an export step; restore the intended contact address.
    gr.Markdown("### [email protected]", elem_id="initial-description")


if __name__ == "__main__":
    # Start the Gradio server only when the file is run as a script.
    demo.launch()