import gradio as gr
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords, wordnet, brown
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag, ne_chunk, ngrams
from nltk.collocations import BigramCollocationFinder
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
import random
# Download the required NLTK resources
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')  # required by pos_tag on newer NLTK releases
nltk.download('maxent_ne_chunker')
nltk.download('maxent_ne_chunker_tab')  # required by ne_chunk on newer NLTK releases
nltk.download('words')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('brown')
nltk.download('movie_reviews')
# Initialise the shared NLP tools
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
# Train a simple Naive Bayes sentiment classifier on a small slice of movie_reviews
def train_classifier():
    # Use one bag-of-words feature per word (categories='pos'/'neg' selects the labelled reviews),
    # so training and classification share the same feature format
    pos_reviews = [({word: True}, 'positive') for word in movie_reviews.words(categories='pos')[:100]]
    neg_reviews = [({word: True}, 'negative') for word in movie_reviews.words(categories='neg')[:100]]
    train_set = pos_reviews + neg_reviews
    random.shuffle(train_set)
    return NaiveBayesClassifier.train(train_set)
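# Train the toy classifier once at startup; with only 100 words per class it is a
# demonstration of the NLTK API rather than an accurate sentiment model.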
classifier = train_classifier()
# Main dispatcher: run the selected NLTK feature on the input text
def nlp_tool(input_text, function):
    if not input_text:
        return "Please enter some text!"
    if function == "Sentence Tokenization":
        return "\n".join(sent_tokenize(input_text))
    elif function == "Word Tokenization":
        return "\n".join(word_tokenize(input_text))
    elif function == "Part-of-Speech Tagging":
        words = word_tokenize(input_text)
        return "\n".join([f"{word}: {tag}" for word, tag in pos_tag(words)])
    elif function == "Stemming":
        words = word_tokenize(input_text)
        return "\n".join([stemmer.stem(word) for word in words])
    elif function == "Lemmatization":
        words = word_tokenize(input_text)
        return "\n".join([lemmatizer.lemmatize(word) for word in words])
    elif function == "Remove Stop Words":
        words = word_tokenize(input_text)
        return "\n".join([word for word in words if word.lower() not in stop_words])
    elif function == "Named Entity Recognition":
        words = word_tokenize(input_text)
        pos_tags = pos_tag(words)
        entities = ne_chunk(pos_tags)
        return str(entities)
    elif function == "Text Classification":
        words = word_tokenize(input_text)
        result = classifier.classify({word: True for word in words})
        return f"Sentiment: {result}"
    elif function == "N-grams (Bigrams)":
        words = word_tokenize(input_text)
        bigrams = list(ngrams(words, 2))
        return "\n".join([f"{w1} - {w2}" for w1, w2 in bigrams])
    elif function == "Collocations":
        words = word_tokenize(input_text)
        finder = BigramCollocationFinder.from_words(words)
        collocations = finder.nbest(nltk.collocations.BigramAssocMeasures().pmi, 5)
        return "\n".join([f"{w1} - {w2}" for w1, w2 in collocations])
elif function == "WordNet Synsets":
words = word_tokenize(input_text)
first_word = words[0] if words else ""
synsets = wordnet.synsets(first_word)
if synsets:
return f"Definition: {synsets[0].definition()}\nExamples: {synsets[0].examples()}"
return "Không tìm thấy từ trong WordNet!"
elif function == "Sample from Brown Corpus":
return " ".join(brown.words()[:50])
return "Chức năng chưa được triển khai!"
# Build the Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # Natural Language Processing Tool with NLTK
        Enter some text and pick a function to explore what NLTK can do!
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            input_text = gr.Textbox(
                label="Input text",
                placeholder="Example: I love coding in Python.",
                lines=5
            )
            function_dropdown = gr.Dropdown(
                label="Select a function",
                choices=[
                    "Sentence Tokenization",
                    "Word Tokenization",
                    "Part-of-Speech Tagging",
                    "Stemming",
                    "Lemmatization",
                    "Remove Stop Words",
                    "Named Entity Recognition",
                    "Text Classification",
                    "N-grams (Bigrams)",
                    "Collocations",
                    "WordNet Synsets",
                    "Sample from Brown Corpus"
                ],
                value="Sentence Tokenization"
            )
            submit_btn = gr.Button("Process", variant="primary")
        with gr.Column(scale=2):
            output_text = gr.Textbox(
                label="Result",
                lines=10,
                interactive=False
            )
    # Wire the button to the processing function
    submit_btn.click(
        fn=nlp_tool,
        inputs=[input_text, function_dropdown],
        outputs=output_text
    )
# Launch the interface
demo.launch()