import gradio as gr

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords, wordnet, brown, movie_reviews
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag, ne_chunk, ngrams
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
from nltk.classify import NaiveBayesClassifier
import random

# Download the NLTK data packages used below (no-ops if already cached).
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')  # required by NLTK >= 3.9
nltk.download('maxent_ne_chunker')
nltk.download('maxent_ne_chunker_tab')  # required by NLTK >= 3.9
nltk.download('words')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('brown')
nltk.download('movie_reviews')

# Shared text-processing resources
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def extract_features(words):
    # Simple bag-of-words features: mark each word as present.
    return {word: True for word in words}


def train_classifier():
    # Train a small Naive Bayes sentiment classifier on 100 positive and
    # 100 negative documents from the movie_reviews corpus.
    pos_reviews = [(extract_features(movie_reviews.words(fileid)), 'positive')
                   for fileid in movie_reviews.fileids('pos')[:100]]
    neg_reviews = [(extract_features(movie_reviews.words(fileid)), 'negative')
                   for fileid in movie_reviews.fileids('neg')[:100]]
    train_set = pos_reviews + neg_reviews
    random.shuffle(train_set)
    return NaiveBayesClassifier.train(train_set)


classifier = train_classifier()
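# Note: 100 documents per class is a deliberately tiny training set, so the
# demo's sentiment predictions are illustrative rather than accurate.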


def nlp_tool(input_text, function):
    if not input_text:
        return "Please enter some text!"

    if function == "Sentence Tokenization":
        return "\n".join(sent_tokenize(input_text))

    elif function == "Word Tokenization":
        return "\n".join(word_tokenize(input_text))

    elif function == "Part-of-Speech Tagging":
        words = word_tokenize(input_text)
        return "\n".join([f"{word}: {tag}" for word, tag in pos_tag(words)])

    elif function == "Stemming":
        words = word_tokenize(input_text)
        return "\n".join([stemmer.stem(word) for word in words])

    elif function == "Lemmatization":
        words = word_tokenize(input_text)
        return "\n".join([lemmatizer.lemmatize(word) for word in words])

    elif function == "Remove Stop Words":
        words = word_tokenize(input_text)
        return "\n".join([word for word in words if word.lower() not in stop_words])

    elif function == "Named Entity Recognition":
        words = word_tokenize(input_text)
        pos_tags = pos_tag(words)
        entities = ne_chunk(pos_tags)  # returns an nltk.Tree of chunks
        return str(entities)

    elif function == "Text Classification":
        words = word_tokenize(input_text)
        # Use the same feature extractor as at training time.
        result = classifier.classify(extract_features(words))
        return f"Sentiment: {result}"

    elif function == "N-grams (Bigrams)":
        words = word_tokenize(input_text)
        bigrams = list(ngrams(words, 2))
        return "\n".join([f"{w1} - {w2}" for w1, w2 in bigrams])

    elif function == "Collocations":
        words = word_tokenize(input_text)
        finder = BigramCollocationFinder.from_words(words)
        # Rank bigrams by pointwise mutual information; keep the top 5.
        collocations = finder.nbest(BigramAssocMeasures().pmi, 5)
        return "\n".join([f"{w1} - {w2}" for w1, w2 in collocations])

    elif function == "WordNet Synsets":
        words = word_tokenize(input_text)
        first_word = words[0] if words else ""
        synsets = wordnet.synsets(first_word)
        if synsets:
            return f"Definition: {synsets[0].definition()}\nExamples: {synsets[0].examples()}"
        return "Word not found in WordNet!"

    elif function == "Sample from Brown Corpus":
        return " ".join(brown.words()[:50])

    return "Function not implemented yet!"


with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # Natural Language Processing Toolkit with NLTK
        Enter some text and choose a function to explore NLTK's capabilities!
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            input_text = gr.Textbox(
                label="Input text",
                placeholder="Example: I love coding in Python.",
                lines=5
            )
            function_dropdown = gr.Dropdown(
                label="Select a function",
                choices=[
                    "Sentence Tokenization",
                    "Word Tokenization",
                    "Part-of-Speech Tagging",
                    "Stemming",
                    "Lemmatization",
                    "Remove Stop Words",
                    "Named Entity Recognition",
                    "Text Classification",
                    "N-grams (Bigrams)",
                    "Collocations",
                    "WordNet Synsets",
                    "Sample from Brown Corpus"
                ],
                value="Sentence Tokenization"
            )
            submit_btn = gr.Button("Process", variant="primary")

        with gr.Column(scale=2):
            output_text = gr.Textbox(
                label="Result",
                lines=10,
                interactive=False
            )

    submit_btn.click(
        fn=nlp_tool,
        inputs=[input_text, function_dropdown],
        outputs=output_text
    )


demo.launch()
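# By default Gradio serves the app locally (http://127.0.0.1:7860);
# passing share=True to demo.launch() would also create a temporary public link.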