rapacious committed on
Commit
7701918
·
verified ·
1 Parent(s): 9abb0fc

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +151 -0
app.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import nltk
3
+ from nltk.tokenize import sent_tokenize, word_tokenize
4
+ from nltk.corpus import stopwords, wordnet, brown
5
+ from nltk.stem import PorterStemmer, WordNetLemmatizer
6
+ from nltk import pos_tag, ne_chunk, ngrams
7
+ from nltk.collocations import BigramCollocationFinder
8
+ from nltk.classify import NaiveBayesClassifier
9
+ from nltk.corpus import movie_reviews
10
+ import random
11
+
12
# Download the NLTK resources this app relies on.
# NOTE(review): recent NLTK releases (3.9+) appear to look up the POS tagger
# and the NE chunker under the "_eng" / "_tab" resource names, in the same way
# that "punkt_tab" replaced "punkt". Both the old and the new names are fetched
# so the app works on either side of that change; confirm against the
# installed NLTK version.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker',
    'maxent_ne_chunker_tab',
    'words',
    'stopwords',
    'wordnet',
    'brown',
    'movie_reviews',
)
for _resource in NLTK_RESOURCES:
    nltk.download(_resource)

# Initialise the shared tools (must run after the downloads above, because
# the stop-word list is read from the downloaded corpus).
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
27
+
28
# Train a simple sentiment classifier
def train_classifier(docs_per_label=100):
    """Train a Naive Bayes sentiment classifier on the movie_reviews corpus.

    Every training example is one review, represented as a bag-of-words
    feature dict ``{word: True}``. This is the same feature shape that
    ``nlp_tool`` builds before calling ``classifier.classify``, so the
    features seen at prediction time can actually match trained features.

    Args:
        docs_per_label: Maximum number of reviews taken from each of the
            'pos' and 'neg' categories. ``None`` uses every review.

    Returns:
        A trained ``NaiveBayesClassifier`` with the labels 'positive' and
        'negative'.
    """
    labelled_categories = (('pos', 'positive'), ('neg', 'negative'))
    train_set = []
    for category, label in labelled_categories:
        # 'pos' / 'neg' are corpus categories, not file ids, so they have to
        # be passed through the ``categories`` keyword.
        fileids = movie_reviews.fileids(categories=category)[:docs_per_label]
        for fileid in fileids:
            features = {word: True for word in movie_reviews.words(fileid)}
            train_set.append((features, label))
    random.shuffle(train_set)
    return NaiveBayesClassifier.train(train_set)
35
+
36
# Module-level classifier: trained once when the module is loaded and then
# reused by nlp_tool for every "Text Classification" request.
classifier = train_classifier()
37
+
38
# Main entry point plus the private per-feature helpers it dispatches to
_EMPTY_INPUT_MESSAGE = "Vui lòng nhập văn bản!"
_UNKNOWN_FUNCTION_MESSAGE = "Chức năng chưa được triển khai!"
_BROWN_SAMPLE_FUNCTION = "Sample from Brown Corpus"


def _sentence_tokenization(text):
    """Return one sentence per line."""
    return "\n".join(sent_tokenize(text))


def _word_tokenization(text):
    """Return one token per line."""
    return "\n".join(word_tokenize(text))


def _pos_tagging(text):
    """Return ``token: tag`` lines for every token."""
    tagged = pos_tag(word_tokenize(text))
    return "\n".join(f"{word}: {tag}" for word, tag in tagged)


def _stemming(text):
    """Return the Porter stem of every token, one per line."""
    return "\n".join(stemmer.stem(word) for word in word_tokenize(text))


def _lemmatization(text):
    """Return the WordNet lemma of every token, one per line."""
    return "\n".join(lemmatizer.lemmatize(word) for word in word_tokenize(text))


def _remove_stop_words(text):
    """Return the tokens that are not English stop words, one per line."""
    kept = [word for word in word_tokenize(text) if word.lower() not in stop_words]
    return "\n".join(kept)


def _named_entities(text):
    """Return the string form of the named-entity chunk tree."""
    return str(ne_chunk(pos_tag(word_tokenize(text))))


def _text_classification(text):
    """Return the sentiment label predicted by the module-level classifier."""
    # Tokens are lower-cased before lookup.
    # NOTE(review): presumably the movie_reviews training text is lower-case,
    # so "Great" would otherwise never match a trained feature; confirm.
    features = {word.lower(): True for word in word_tokenize(text)}
    return f"Sentiment: {classifier.classify(features)}"


def _bigrams(text):
    """Return every adjacent token pair as ``w1 - w2`` lines."""
    pairs = ngrams(word_tokenize(text), 2)
    return "\n".join(f"{w1} - {w2}" for w1, w2 in pairs)


def _collocations(text):
    """Return up to five bigrams ranked by PMI as ``w1 - w2`` lines."""
    finder = BigramCollocationFinder.from_words(word_tokenize(text))
    best = finder.nbest(nltk.collocations.BigramAssocMeasures().pmi, 5)
    return "\n".join(f"{w1} - {w2}" for w1, w2 in best)


def _wordnet_synsets(text):
    """Return the definition and examples of the first token's first synset."""
    words = word_tokenize(text)
    first_word = words[0] if words else ""
    synsets = wordnet.synsets(first_word)
    if synsets:
        return f"Definition: {synsets[0].definition()}\nExamples: {synsets[0].examples()}"
    return "Không tìm thấy từ trong WordNet!"


# Maps the dropdown label to the helper that implements it. Only features
# that consume the input text are listed here.
_TEXT_HANDLERS = {
    "Sentence Tokenization": _sentence_tokenization,
    "Word Tokenization": _word_tokenization,
    "Part-of-Speech Tagging": _pos_tagging,
    "Stemming": _stemming,
    "Lemmatization": _lemmatization,
    "Remove Stop Words": _remove_stop_words,
    "Named Entity Recognition": _named_entities,
    "Text Classification": _text_classification,
    "N-grams (Bigrams)": _bigrams,
    "Collocations": _collocations,
    "WordNet Synsets": _wordnet_synsets,
}


def nlp_tool(input_text, function):
    """Run the selected NLTK feature on ``input_text`` and return text output.

    Args:
        input_text: Text typed by the user. Ignored by
            "Sample from Brown Corpus".
        function: Label of the feature to run, as shown in the dropdown.

    Returns:
        The feature's result as a string. A prompt message is returned when
        a text-based feature gets empty or whitespace-only input, and a
        "not implemented" message when ``function`` is not recognised.
    """
    # The corpus sample never reads the input, so it must not be blocked by
    # the empty-input check below.
    if function == _BROWN_SAMPLE_FUNCTION:
        return " ".join(brown.words()[:50])

    # Whitespace-only text has nothing to tokenize; treat it like empty input.
    if not input_text or not input_text.strip():
        return _EMPTY_INPUT_MESSAGE

    # isinstance guard keeps unhashable values from raising in dict lookup.
    handler = _TEXT_HANDLERS.get(function) if isinstance(function, str) else None
    if handler is None:
        return _UNKNOWN_FUNCTION_MESSAGE
    return handler(input_text)
99
+
100
# Build the Gradio interface
# Labels offered in the dropdown; each one is matched by name in nlp_tool.
FUNCTION_CHOICES = [
    "Sentence Tokenization",
    "Word Tokenization",
    "Part-of-Speech Tagging",
    "Stemming",
    "Lemmatization",
    "Remove Stop Words",
    "Named Entity Recognition",
    "Text Classification",
    "N-grams (Bigrams)",
    "Collocations",
    "WordNet Synsets",
    "Sample from Brown Corpus"
]

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # Công cụ xử lý ngôn ngữ tự nhiên với NLTK
        Nhập văn bản và chọn chức năng để khám phá các khả năng của NLTK!
        """
    )

    with gr.Row():
        # Left column: input text, feature selector and submit button
        with gr.Column(scale=1):
            input_text = gr.Textbox(
                label="Nhập văn bản",
                placeholder="Ví dụ: I love coding in Python.",
                lines=5
            )
            function_dropdown = gr.Dropdown(
                label="Chọn chức năng",
                choices=FUNCTION_CHOICES,
                value="Sentence Tokenization"
            )
            submit_btn = gr.Button("Xử lý", variant="primary")

        # Right column: read-only result box
        with gr.Column(scale=2):
            output_text = gr.Textbox(
                label="Kết quả",
                lines=10,
                interactive=False
            )

    # Wire the button to the processing function
    submit_btn.click(
        fn=nlp_tool,
        inputs=[input_text, function_dropdown],
        outputs=output_text
    )

# Launch the interface
demo.launch()