# Hugging Face Space: rapacious/NLTK (status: Running)
# File size: 9,525 bytes

import gradio as gr
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords, wordnet, brown, movie_reviews
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag, ne_chunk, ngrams
from nltk.collocations import BigramCollocationFinder
from nltk.classify import NaiveBayesClassifier
import random

# Download the required NLTK resources, in this fixed order
_NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',  # extra tagger resource, added later
    'maxent_ne_chunker',
    'words',
    'stopwords',
    'wordnet',
    'brown',
    'movie_reviews',
)
for _resource_name in _NLTK_RESOURCES:
    nltk.download(_resource_name)

# Initialise the shared tools used by process_text
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Classifier training
def train_classifier():
    """Train a Naive Bayes sentiment classifier on a slice of movie_reviews.

    The first 50 file ids of the 'pos' category and the first 50 of the
    'neg' category are each turned into a bag-of-words feature dict
    (``{word: True}``) labelled 'positive' or 'negative'. The labelled
    examples are shuffled and handed to ``NaiveBayesClassifier.train``.

    Returns:
        The trained classifier.
    """
    labelled_reviews = []
    # Positive reviews first, then negative ones, before shuffling.
    for category, label in (('pos', 'positive'), ('neg', 'negative')):
        for fileid in movie_reviews.fileids(category)[:50]:
            features = dict.fromkeys(movie_reviews.words(fileid), True)
            labelled_reviews.append((features, label))
    random.shuffle(labelled_reviews)
    return NaiveBayesClassifier.train(labelled_reviews)

# Train once when the module is loaded; process_text reuses this
# module-level model for its "Text Classification" function.
classifier = train_classifier()

# Text processing for each selectable function
def _wordnet_pos(treebank_tag):
    """Translate a Penn Treebank tag into the POS letter WordNet expects.

    Adjective (J*), verb (V*) and adverb (R*) tags map to 'a', 'v' and 'r';
    anything else falls back to 'n' (noun).
    """
    return {'J': 'a', 'V': 'v', 'R': 'r'}.get(treebank_tag[:1], 'n')


def process_text(input_text, function):
    """Run the NLTK operation named by ``function`` on ``input_text``.

    Args:
        input_text: Text typed by the user. May be empty or None; it is
            ignored by "Sample from Brown Corpus".
        function: Name of the operation, as listed in the UI dropdowns
            (e.g. "Word Tokenization", "Stemming", "Collocations").

    Returns:
        A display string: the operation's result (usually one item per
        line), a prompt asking for text when the input is blank, or a
        "not implemented" message for an unknown function name.
    """
    # The corpus sample does not depend on the input, so serve it before
    # validating the text; otherwise a blank box is wrongly rejected.
    if function == "Sample from Brown Corpus":
        return " ".join(brown.words()[:50])

    # Treat whitespace-only text like empty text.
    if not input_text or not input_text.strip():
        return "Vui lòng nhập văn bản!"

    if function == "Sentence Tokenization":
        return "\n".join(sent_tokenize(input_text))
    if function == "Word Tokenization":
        return "\n".join(word_tokenize(input_text))
    if function == "Part-of-Speech Tagging":
        tagged = pos_tag(word_tokenize(input_text))
        return "\n".join(f"{word}: {tag}" for word, tag in tagged)
    if function == "Stemming":
        return "\n".join(stemmer.stem(word) for word in word_tokenize(input_text))
    if function == "Lemmatization":
        # Without a POS hint the lemmatizer treats every token as a noun,
        # so adjectives and verbs (e.g. 'better') are left untouched.
        tagged = pos_tag(word_tokenize(input_text))
        return "\n".join(
            lemmatizer.lemmatize(word, _wordnet_pos(tag)) for word, tag in tagged
        )
    if function == "Remove Stop Words":
        return "\n".join(
            word for word in word_tokenize(input_text)
            if word.lower() not in stop_words
        )
    if function == "Named Entity Recognition":
        tagged = pos_tag(word_tokenize(input_text))
        return str(ne_chunk(tagged))
    if function == "Text Classification":
        # The movie_reviews training text is lowercase, so fold case here
        # to make capitalised input match the trained features.
        features = {word.lower(): True for word in word_tokenize(input_text)}
        return f"Sentiment: {classifier.classify(features)}"
    if function == "N-grams (Bigrams)":
        bigrams = ngrams(word_tokenize(input_text), 2)
        return "\n".join(f"{w1} - {w2}" for w1, w2 in bigrams)
    if function == "Collocations":
        finder = BigramCollocationFinder.from_words(word_tokenize(input_text))
        # Keep the five bigrams with the highest PMI score.
        collocations = finder.nbest(nltk.collocations.BigramAssocMeasures().pmi, 5)
        return "\n".join(f"{w1} - {w2}" for w1, w2 in collocations)
    if function == "WordNet Synsets":
        words = word_tokenize(input_text)
        # Only the first token is looked up.
        first_word = words[0] if words else ""
        synsets = wordnet.synsets(first_word)
        if synsets:
            return f"Definition: {synsets[0].definition()}\nExamples: {synsets[0].examples()}"
        return "Không tìm thấy từ trong WordNet!"
    return "Chức năng chưa được triển khai!"

# Build the Gradio interface, one tab per group of functions
def _build_tab(title, placeholder, choices, guide, input_label="Nhập văn bản"):
    """Create one tab wired to process_text and return its components.

    Call it inside the ``gr.Tabs()`` block. The tab holds a row with the
    input textbox and the function dropdown, then the read-only output
    textbox, the submit button and a Markdown usage guide.

    Args:
        title: Tab caption.
        placeholder: Placeholder text shown in the input textbox.
        choices: Function names offered by the dropdown; the first one is
            preselected.
        guide: Markdown text displayed under the button.
        input_label: Label of the input textbox.

    Returns:
        Tuple ``(input_box, dropdown, output_box, button)``.
    """
    with gr.TabItem(title):
        with gr.Row():
            input_box = gr.Textbox(label=input_label, placeholder=placeholder, lines=5)
            dropdown = gr.Dropdown(
                label="Chọn chức năng",
                choices=choices,
                value=choices[0],
            )
        output_box = gr.Textbox(label="Kết quả", lines=10, interactive=False)
        button = gr.Button("Xử lý", variant="primary")
        gr.Markdown(guide)
        button.click(fn=process_text, inputs=[input_box, dropdown], outputs=output_box)
    return input_box, dropdown, output_box, button


with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # Công cụ xử lý ngôn ngữ tự nhiên với NLTK
        Chọn tab và nhập văn bản để khám phá các tính năng!
        """
    )

    with gr.Tabs():
        # Tab 1: Tokenization
        token_input, token_dropdown, token_output, token_btn = _build_tab(
            title="Tokenization",
            placeholder="Ví dụ: I love coding.",
            choices=["Sentence Tokenization", "Word Tokenization"],
            guide="""
                ### Hướng dẫn:
                - **Sentence Tokenization:** Tách văn bản thành các câu riêng biệt.
                - **Word Tokenization:** Tách văn bản thành các từ riêng lẻ.
                """,
        )

        # Tab 2: Morphology
        morph_input, morph_dropdown, morph_output, morph_btn = _build_tab(
            title="Morphology",
            placeholder="Ví dụ: Running is fun.",
            choices=["Stemming", "Lemmatization", "Remove Stop Words"],
            guide="""
                ### Hướng dẫn:
                - **Stemming:** Rút gọn từ về dạng gốc thô (VD: 'running' → 'run').
                - **Lemmatization:** Rút gọn từ về dạng gốc có nghĩa (VD: 'better' → 'good').
                - **Remove Stop Words:** Loại bỏ từ dừng như 'the', 'is' (chỉ hỗ trợ tiếng Anh).
                """,
        )

        # Tab 3: Syntax & Semantics
        syntax_input, syntax_dropdown, syntax_output, syntax_btn = _build_tab(
            title="Syntax & Semantics",
            placeholder="Ví dụ: Apple is in California.",
            choices=["Part-of-Speech Tagging", "Named Entity Recognition", "WordNet Synsets"],
            guide="""
                ### Hướng dẫn:
                - **Part-of-Speech Tagging:** Gắn nhãn từ loại (VD: danh từ, động từ).
                - **Named Entity Recognition:** Nhận diện thực thể (VD: tên người, địa điểm).
                - **WordNet Synsets:** Tra cứu định nghĩa và ví dụ từ WordNet (cho từ đầu tiên).
                """,
        )

        # Tab 4: Text Analysis
        analysis_input, analysis_dropdown, analysis_output, analysis_btn = _build_tab(
            title="Text Analysis",
            placeholder="Ví dụ: I love this movie!",
            choices=["Text Classification", "N-grams (Bigrams)", "Collocations"],
            guide="""
                ### Hướng dẫn:
                - **Text Classification:** Phân loại cảm xúc (tích cực/tiêu cực).
                - **N-grams (Bigrams):** Tạo các cặp từ liên tiếp.
                - **Collocations:** Tìm các cặp từ thường xuất hiện cùng nhau.
                """,
        )

        # Tab 5: Corpus (the input box is optional here, hence its own label)
        corpus_input, corpus_dropdown, corpus_output, corpus_btn = _build_tab(
            title="Corpus",
            placeholder="Để trống cũng được",
            choices=["Sample from Brown Corpus"],
            guide="""
                ### Hướng dẫn:
                - **Sample from Brown Corpus:** Lấy mẫu 50 từ từ Brown Corpus bất kể văn bản nhập vào.
                """,
            input_label="Nhập văn bản (không cần thiết)",
        )

# Launch the interface (runs whenever this module is executed)
demo.launch()