Spaces:

rapacious
/

NLTK

Running

App Files Files Community

NLTK / app.py

rapacious

Update app.py

486d40d verified 4 months ago

raw

history blame contribute delete

9.53 kB

	import gradio as gr
	import nltk
	from nltk.tokenize import sent_tokenize, word_tokenize
	from nltk.corpus import stopwords, wordnet, brown, movie_reviews
	from nltk.stem import PorterStemmer, WordNetLemmatizer
	from nltk import pos_tag, ne_chunk, ngrams
	from nltk.collocations import BigramCollocationFinder
	from nltk.classify import NaiveBayesClassifier
	import random

	# Download the NLTK resources this app needs.
	# Both the legacy names and the newer "*_tab" / "*_eng" packages are
	# requested so the app works across NLTK releases. 'maxent_ne_chunker_tab'
	# is the counterpart of 'punkt_tab' / 'averaged_perceptron_tagger_eng' for
	# ne_chunk; without it Named Entity Recognition raises LookupError on NLTK
	# versions that load the "*_tab" data.
	for resource in (
	    'punkt',
	    'punkt_tab',
	    'averaged_perceptron_tagger',
	    'averaged_perceptron_tagger_eng',
	    'maxent_ne_chunker',
	    'maxent_ne_chunker_tab',
	    'words',
	    'stopwords',
	    'wordnet',
	    'brown',
	    'movie_reviews',
	):
	    nltk.download(resource)

	# Shared tools, created once at import time and reused by process_text.
	# English stop words are kept in a set for fast membership tests.
	stop_words = {*stopwords.words("english")}
	stemmer = PorterStemmer()
	lemmatizer = WordNetLemmatizer()

	# Train the sentiment classifier
	def train_classifier():
	    """Train a Naive Bayes sentiment classifier on a slice of movie_reviews.

	    The first 50 file ids of the 'pos' category and the first 50 of the
	    'neg' category are used. Each review becomes a bag-of-words feature
	    dict mapping every word in the file to True, labelled 'positive' or
	    'negative'.

	    Returns:
	        The classifier produced by NaiveBayesClassifier.train on the
	        shuffled training set.
	    """
	    labelled_reviews = []
	    # Positive reviews first, then negative, before shuffling.
	    for category, label in (("pos", "positive"), ("neg", "negative")):
	        for fileid in movie_reviews.fileids(category)[:50]:
	            bag_of_words = dict.fromkeys(movie_reviews.words(fileid), True)
	            labelled_reviews.append((bag_of_words, label))
	    random.shuffle(labelled_reviews)
	    return NaiveBayesClassifier.train(labelled_reviews)

	# Train once at import time; process_text reads this module-level model
	# for the "Text Classification" feature.
	classifier = train_classifier()

	# Dispatch a UI request to the selected NLTK feature
	def process_text(input_text, function):
	    """Apply the selected NLTK feature to ``input_text`` and return display text.

	    Args:
	        input_text: Raw text from the UI textbox. Ignored by
	            "Sample from Brown Corpus"; every other feature needs
	            non-blank text.
	        function: Feature name exactly as listed in the dropdowns, e.g.
	            "Word Tokenization" or "Named Entity Recognition".

	    Returns:
	        A string for the result textbox: one item per line for the
	        token-style features, a formatted message for classification and
	        WordNet lookups, or a Vietnamese prompt/error message when the
	        input is blank or the feature name is not recognised.
	    """
	    # The corpus sample does not depend on the input, so it must be served
	    # before the blank-input guard (the UI tells users the box may be empty).
	    if function == "Sample from Brown Corpus":
	        return " ".join(brown.words()[:50])

	    # Treat None, "" and whitespace-only text alike: nothing to process.
	    if not input_text or not input_text.strip():
	        return "Vui lòng nhập văn bản!"

	    if function == "Sentence Tokenization":
	        return "\n".join(sent_tokenize(input_text))

	    # Every remaining feature operates on word tokens, so tokenise once.
	    words = word_tokenize(input_text)

	    if function == "Word Tokenization":
	        return "\n".join(words)

	    if function == "Part-of-Speech Tagging":
	        return "\n".join(f"{word}: {tag}" for word, tag in pos_tag(words))

	    if function == "Stemming":
	        return "\n".join(stemmer.stem(word) for word in words)

	    if function == "Lemmatization":
	        return "\n".join(lemmatizer.lemmatize(word) for word in words)

	    if function == "Remove Stop Words":
	        return "\n".join(word for word in words if word.lower() not in stop_words)

	    if function == "Named Entity Recognition":
	        # ne_chunk expects POS-tagged tokens; the tree is shown via str().
	        return str(ne_chunk(pos_tag(words)))

	    if function == "Text Classification":
	        # The classifier's features are movie_reviews tokens, which are
	        # distributed lower-cased; lower-case the input so that e.g.
	        # "Love" matches the trained feature "love".
	        features = {word.lower(): True for word in words}
	        result = classifier.classify(features)
	        return f"Sentiment: {result}"

	    if function == "N-grams (Bigrams)":
	        return "\n".join(f"{w1} - {w2}" for w1, w2 in ngrams(words, 2))

	    if function == "Collocations":
	        finder = BigramCollocationFinder.from_words(words)
	        # Top five bigrams ranked by pointwise mutual information.
	        collocations = finder.nbest(nltk.collocations.BigramAssocMeasures().pmi, 5)
	        return "\n".join(f"{w1} - {w2}" for w1, w2 in collocations)

	    if function == "WordNet Synsets":
	        # Only the first token is looked up.
	        first_word = words[0] if words else ""
	        synsets = wordnet.synsets(first_word)
	        if synsets:
	            return f"Definition: {synsets[0].definition()}\nExamples: {synsets[0].examples()}"
	        return "Không tìm thấy từ trong WordNet!"

	    # Unknown feature name.
	    return "Chức năng chưa được triển khai!"

	# Build the Gradio interface: one tab per feature group. Every tab has an
	# input textbox, a feature dropdown, a read-only result textbox, a button
	# and a short help text; each button calls process_text with that tab's
	# textbox and dropdown values.
	# NOTE(review): indentation was lost in this copy of the file, so the exact
	# nesting of components under Blocks/Tabs/TabItem/Row must be confirmed
	# against the original before running.
	with gr.Blocks(theme=gr.themes.Soft()) as demo:
	gr.Markdown(
	"""
	# Công cụ xử lý ngôn ngữ tự nhiên với NLTK
	Chọn tab và nhập văn bản để khám phá các tính năng!
	"""
	)

	with gr.Tabs():
	# Tab 1: Tokenization
	with gr.TabItem("Tokenization"):
	with gr.Row():
	token_input = gr.Textbox(label="Nhập văn bản", placeholder="Ví dụ: I love coding.", lines=5)
	token_dropdown = gr.Dropdown(
	label="Chọn chức năng",
	choices=["Sentence Tokenization", "Word Tokenization"],
	value="Sentence Tokenization"
	)
	token_output = gr.Textbox(label="Kết quả", lines=10, interactive=False)
	token_btn = gr.Button("Xử lý", variant="primary")
	gr.Markdown(
	"""
	### Hướng dẫn:
	- Sentence Tokenization: Tách văn bản thành các câu riêng biệt.
	- Word Tokenization: Tách văn bản thành các từ riêng lẻ.
	"""
	)
	# Run the selected tokenization feature and show the result.
	token_btn.click(fn=process_text, inputs=[token_input, token_dropdown], outputs=token_output)

	# Tab 2: Morphology
	with gr.TabItem("Morphology"):
	with gr.Row():
	morph_input = gr.Textbox(label="Nhập văn bản", placeholder="Ví dụ: Running is fun.", lines=5)
	morph_dropdown = gr.Dropdown(
	label="Chọn chức năng",
	choices=["Stemming", "Lemmatization", "Remove Stop Words"],
	value="Stemming"
	)
	morph_output = gr.Textbox(label="Kết quả", lines=10, interactive=False)
	morph_btn = gr.Button("Xử lý", variant="primary")
	gr.Markdown(
	"""
	### Hướng dẫn:
	- Stemming: Rút gọn từ về dạng gốc thô (VD: 'running' → 'run').
	- Lemmatization: Rút gọn từ về dạng gốc có nghĩa (VD: 'better' → 'good').
	- Remove Stop Words: Loại bỏ từ dừng như 'the', 'is' (chỉ hỗ trợ tiếng Anh).
	"""
	)
	# Run the selected morphology feature and show the result.
	morph_btn.click(fn=process_text, inputs=[morph_input, morph_dropdown], outputs=morph_output)

	# Tab 3: Syntax & Semantics
	with gr.TabItem("Syntax & Semantics"):
	with gr.Row():
	syntax_input = gr.Textbox(label="Nhập văn bản", placeholder="Ví dụ: Apple is in California.", lines=5)
	syntax_dropdown = gr.Dropdown(
	label="Chọn chức năng",
	choices=["Part-of-Speech Tagging", "Named Entity Recognition", "WordNet Synsets"],
	value="Part-of-Speech Tagging"
	)
	syntax_output = gr.Textbox(label="Kết quả", lines=10, interactive=False)
	syntax_btn = gr.Button("Xử lý", variant="primary")
	gr.Markdown(
	"""
	### Hướng dẫn:
	- Part-of-Speech Tagging: Gắn nhãn từ loại (VD: danh từ, động từ).
	- Named Entity Recognition: Nhận diện thực thể (VD: tên người, địa điểm).
	- WordNet Synsets: Tra cứu định nghĩa và ví dụ từ WordNet (cho từ đầu tiên).
	"""
	)
	# Run the selected syntax/semantics feature and show the result.
	syntax_btn.click(fn=process_text, inputs=[syntax_input, syntax_dropdown], outputs=syntax_output)

	# Tab 4: Text Analysis
	with gr.TabItem("Text Analysis"):
	with gr.Row():
	analysis_input = gr.Textbox(label="Nhập văn bản", placeholder="Ví dụ: I love this movie!", lines=5)
	analysis_dropdown = gr.Dropdown(
	label="Chọn chức năng",
	choices=["Text Classification", "N-grams (Bigrams)", "Collocations"],
	value="Text Classification"
	)
	analysis_output = gr.Textbox(label="Kết quả", lines=10, interactive=False)
	analysis_btn = gr.Button("Xử lý", variant="primary")
	gr.Markdown(
	"""
	### Hướng dẫn:
	- Text Classification: Phân loại cảm xúc (tích cực/tiêu cực).
	- N-grams (Bigrams): Tạo các cặp từ liên tiếp.
	- Collocations: Tìm các cặp từ thường xuất hiện cùng nhau.
	"""
	)
	# Run the selected text-analysis feature and show the result.
	analysis_btn.click(fn=process_text, inputs=[analysis_input, analysis_dropdown], outputs=analysis_output)

	# Tab 5: Corpus
	with gr.TabItem("Corpus"):
	with gr.Row():
	corpus_input = gr.Textbox(label="Nhập văn bản (không cần thiết)", placeholder="Để trống cũng được", lines=5)
	corpus_dropdown = gr.Dropdown(
	label="Chọn chức năng",
	choices=["Sample from Brown Corpus"],
	value="Sample from Brown Corpus"
	)
	corpus_output = gr.Textbox(label="Kết quả", lines=10, interactive=False)
	corpus_btn = gr.Button("Xử lý", variant="primary")
	gr.Markdown(
	"""
	### Hướng dẫn:
	- Sample from Brown Corpus: Lấy mẫu 50 từ từ Brown Corpus bất kể văn bản nhập vào.
	"""
	)
	# The textbox is labelled as optional for this tab; its value is still
	# passed to process_text together with the dropdown choice.
	corpus_btn.click(fn=process_text, inputs=[corpus_input, corpus_dropdown], outputs=corpus_output)

	# Launch the interface
	demo.launch()