word2vec

Sleeping

File size: 6,038 Bytes

770e786
aca3b1d
9fcb29b
770e786
 
 
 
aca3b1d
 
9fcb29b
 
1ed0340
 
9fcb29b
aca3b1d
770e786
 
 
 
9fcb29b
aca3b1d
 
 
 
 
 
 
9fcb29b
aca3b1d
9fcb29b
aca3b1d
770e786
 
aca3b1d
9fcb29b
 
 
 
1ed0340
 
9fcb29b
 
 
7112f5e
770e786
 
aca3b1d
770e786
 
aca3b1d
770e786
7112f5e
aca3b1d
770e786
 
 
 
 
7112f5e
aca3b1d
 
 
 
7112f5e
 
 
 
9fcb29b
aca3b1d
 
7112f5e
9fcb29b
1ed0340
770e786
 
 
 
 
 
 
 
 
7112f5e
770e786
 
 
aca3b1d
770e786
 
 
7112f5e
 
 
770e786
1ed0340
 
770e786
aca3b1d
770e786
 
 
1ed0340
aca3b1d
1ed0340
7112f5e
1ed0340
7112f5e
 
770e786
0131a54
aca3b1d
9fcb29b
 
770e786
1ed0340
8c42cdb
770e786
9fcb29b
1ed0340
 
 
 
aca3b1d
9fcb29b
 
 
aca3b1d
9fcb29b
1ed0340
 
 
aca3b1d
770e786
1ed0340
 
aca3b1d
1ed0340
 
 
0131a54
1ed0340
 
 
 
 
 
 
 
770e786
 
1ed0340

import numpy as np
import pandas as pd
import random
from gensim.models import Word2Vec
import gradio as gr
from sklearn.decomposition import PCA
import plotly.graph_objects as go


def train_word2vec(sentences, *, vector_size=50, window=4, min_count=1, sg=0, epochs=100):
    """Train a Word2Vec model on tokenised sentences.

    The keyword defaults reproduce the configuration that used to be
    hard-coded here, so ``train_word2vec(sentences)`` behaves as before.

    Args:
        sentences: Corpus passed straight to ``Word2Vec`` (an iterable of
            token lists).
        vector_size: Dimensionality of the learned embeddings.
        window: Context window size.
        min_count: Words occurring fewer times than this are dropped.
        sg: Training algorithm selector forwarded to ``Word2Vec``
            (0 = CBOW, 1 = skip-gram).
        epochs: Number of passes over the corpus.

    Returns:
        The trained ``Word2Vec`` model.
    """
    return Word2Vec(
        sentences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        sg=sg,
        epochs=epochs,
    )

def apply_pca(word_vectors):
    """Project the embedding matrix onto its first three principal components.

    Args:
        word_vectors: Array of embedding vectors, one row per word.

    Returns:
        The result of ``PCA(n_components=3).fit_transform(word_vectors)``.
    """
    reducer = PCA(n_components=3)
    projected = reducer.fit_transform(word_vectors)
    return projected


def get_unique(model):
    """Return the model's vocabulary as a fresh list.

    Args:
        model: Object exposing ``model.wv.index_to_key``.

    Returns:
        A new list with the entries of ``model.wv.index_to_key`` in the
        same order; the model's own list is not returned.
    """
    return list(model.wv.index_to_key)

def train_model(sentence):
    """Train a Word2Vec model and collect its vocabulary.

    Args:
        sentence: The training corpus; it is handed to ``train_word2vec``
            unchanged (no preprocessing is applied here).

    Returns:
        A ``(model, unique_words)`` tuple: the trained model and the list
        of words returned by ``get_unique`` for it.
    """
    trained = train_word2vec(sentence)
    vocabulary = get_unique(trained)
    return trained, vocabulary

def process_model(target_word):
    """Build the 3D embedding plot and neighbour lists for ``target_word``.

    The saved model is loaded from ``word2vec.model``, every vocabulary
    vector is reduced to three dimensions with ``apply_pca``, and the
    points are coloured as follows:

    * the target word: pink,
    * its 10 most similar words: orange,
    * its 10 least similar words: purple (applied last, so it wins if a
      word falls in both groups).

    Args:
        target_word: Word typed by the user. ``None`` is treated as an
            empty string; the value is stripped and lower-cased before
            lookup.

    Returns:
        A ``(fig, similar_words_text, dissimilar_words_text)`` tuple. The
        two texts hold one ``"word: score"`` line per neighbour and are
        empty strings when the word is not in the model's vocabulary.
    """
    # Normalise defensively: a missing value must not crash on .lower().
    target_word = (target_word or "").strip().lower()

    # Load the trained Word2Vec model from disk.
    model = Word2Vec.load("word2vec.model")
    unique_words = get_unique(model)

    # Word -> row position, so neighbour lookups avoid repeated list scans.
    position_of = {word: idx for idx, word in enumerate(unique_words)}

    # Extract each word's embedding vector and reduce it to 3D with PCA.
    word_vectors = np.array([model.wv[word] for word in unique_words])
    word_vectors_3d = apply_pca(word_vectors)

    # Base colours (with transparency); the target word is highlighted.
    colors = [
        'rgba(255, 255, 255, 0.15)' if word != target_word else 'rgba(255, 20, 147, 0.9)'
        for word in unique_words
    ]

    # Both lists stay empty when the word is unknown, which makes the
    # returned texts empty strings instead of leaving a name unbound.
    similar_words = []
    dissimilar_words = []
    if target_word in model.wv:
        # The 10 closest words, highlighted in orange.
        similar_words = model.wv.most_similar(target_word, topn=10)
        for word, _ in similar_words:
            colors[position_of[word]] = 'rgba(255, 165, 0, 1)'

        # The 10 words with the lowest similarity, highlighted in purple.
        dissimilar_words = sorted(
            [
                (word, model.wv.similarity(target_word, word))
                for word in model.wv.index_to_key
                if word != target_word
            ],
            key=lambda pair: pair[1],
        )[:10]
        for word, _ in dissimilar_words:
            colors[position_of[word]] = 'rgba(138, 43, 226, 0.8)'

    # 3D scatter plot of all words, built with Plotly.
    fig = go.Figure(data=[go.Scatter3d(
        x=word_vectors_3d[:, 0],
        y=word_vectors_3d[:, 1],
        z=word_vectors_3d[:, 2],
        mode='markers+text',
        text=unique_words,
        textposition="top center",
        marker=dict(
            size=4,
            color=colors,
        )
    )])

    fig.update_layout(
        title="Word Embeddings 3D Visualization",
        scene=dict(
            xaxis_title="X",
            yaxis_title="Y",
            zaxis_title="Z"
        ),
        width=1100,
        height=900
    )

    # Text listings for the two side panels.
    similar_words_text = "\n".join(f"{word}: {score:.4f}" for word, score in similar_words)
    dissimilar_words_text = "\n".join(f"{word}: {score:.4f}" for word, score in dissimilar_words)

    return fig, similar_words_text, dissimilar_words_text



# Gradio interface: a word input, a usage notice, the 3D plot and two
# side panels listing the nearest and farthest words.
with gr.Blocks(css=".plot-box {width: 70%; height: 500px;}") as iface:
    gr.Markdown("# Word Embedding 3D 시각화")
    gr.Markdown("<Inside Out 2> 단어 의미 지도(임베딩 벡터) 3D 시각화 도구")

    with gr.Row():
        # Input box and submit button share one column so they stand out.
        with gr.Column():
            word_input = gr.Textbox(label="**단어 입력**", elem_id="input-box", placeholder="ex. emotion, puberty, hockey, friend, anxiety, memory, ...", lines=1)
            submit_btn = gr.Button("제출", elem_id="submit-btn")
        bulletin = gr.Textbox(label="사용법 안내", interactive=False, lines=4,  value="1. 소설에 나온 단어를 입력하고 [제출] 버튼이나 [Enter]를 누르세요 \n2.  입력 단어는 빨간색, 가까운 단어들은 주황색, 먼 단어들은 보라색으로 강조됩니다. \n3.  <Error>가 나타나는 경우, 다른 단어를 입력해보세요.\n4.  마우스 드래그 및 스크롤을 활용하여 3D 화면을 살펴보세요. \n5.  단어 입력창에 다른 단어들도 입력해보세요.")

    with gr.Row():
        # NOTE(review): the CSS passed to gr.Blocks targets the class
        # ".plot-box", while this component sets elem_id="plot-box", so the
        # rule does not match it as written - confirm the intended selector.
        plot_output = gr.Plot(label="Word Embedding 3D 시각화", elem_id="plot-box")

        with gr.Column(scale=0.3):  # low scale value to keep this column narrow
            similar_words_output = gr.Textbox(label="가장 가까운 단어 10개", interactive=False, lines=5)
            dissimilar_words_output = gr.Textbox(label="가장 먼 단어 10개", interactive=False, lines=5)
            gr.Image(value="https://compote.slate.com/images/8324cd2e-21f5-4b20-84d5-f08ece97ac38.jpeg?crop=1560%2C1040%2Cx0%2Cy0&width=1280", label="URL 이미지", interactive=False)

    # Clicking the button runs the visualisation. The handler must be
    # process_model; lower-casing of the input happens inside it.
    submit_btn.click(
        fn=process_model,
        inputs=[word_input],
        outputs=[plot_output, similar_words_output, dissimilar_words_output],
    )

    # Pressing Enter in the textbox triggers the same handler.
    word_input.submit(
        fn=process_model,
        inputs=[word_input],
        outputs=[plot_output, similar_words_output, dissimilar_words_output],
    )

if __name__ == "__main__":
    iface.launch()