ducdatit2002 committed on
Commit
d1e6bcf
·
verified ·
1 Parent(s): 0033618

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -205
app.py DELETED
@@ -1,205 +0,0 @@
1
- # demo_phobert_gradio.py
2
- # -*- coding: utf-8 -*-
3
-
4
- import gradio as gr
5
- import torch
6
- import re
7
- import json
8
- import emoji
9
- import numpy as np
10
- from underthesea import word_tokenize
11
-
12
- from transformers import (
13
- AutoConfig,
14
- AutoTokenizer,
15
- AutoModelForSequenceClassification
16
- )
17
-
18
- ###############################################################################
19
- # LOAD EMOJI MAPPING - COPIED VERBATIM FROM THE TRAINING FILE
20
- ###############################################################################
21
# Emoji -> emotion-tag table consumed by replace_emojis().
# Every key is a single Unicode code point, because replace_emojis() walks the
# sentence one character at a time. The keys were restored from a mis-encoded
# copy in which several distinct emojis had collapsed into the same garbled
# string (duplicate dict keys silently overwrite each other).
# NOTE(review): the original header says this table is copied verbatim from
# the training script -- confirm against that file before changing any entry.
emoji_mapping = {
    "😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
    "🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
    # "\u263a" is WHITE SMILING FACE without a variation selector.
    "🤩": "[love]", "😘": "[love]", "😗": "[love]", "\u263a": "[love]", "😚": "[love]", "😙": "[love]",
    "😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
    "🤑": "[satisfaction]",
    "🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
    "😏": "[sarcasm]",
    "😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
    "😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
    "😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
    "🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
    "🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
    # "\u2639" is WHITE FROWNING FACE without a variation selector.
    "😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "\u2639": "[confused]",
    "😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
    "😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
    "😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
    "😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]",
}
40
-
41
- ###############################################################################
42
- # PROCESSING FUNCTIONS (COPIED FROM THE TRAINING FILE)
43
- ###############################################################################
44
def replace_emojis(sentence, emoji_mapping):
    """Swap mapped emojis for their tags and drop every other emoji.

    The sentence is scanned one character at a time:
    * a character that is a key of ``emoji_mapping`` is replaced by its value;
    * any other character for which ``emoji.is_emoji`` is true is removed;
    * everything else is kept unchanged.

    Returns the rebuilt string.
    """
    kept = (
        emoji_mapping[symbol] if symbol in emoji_mapping else symbol
        for symbol in sentence
        # The mapping is consulted first, so is_emoji only runs for unmapped characters.
        if symbol in emoji_mapping or not emoji.is_emoji(symbol)
    )
    return ''.join(kept)
52
-
53
def remove_profanity(sentence):
    """Drop whitespace-separated tokens that appear in the profanity list.

    Matching is done on the lower-cased token, so it is case-insensitive.
    The surviving tokens are re-joined with single spaces.
    """
    # Entries restored from a mis-encoded copy ("đm", "đù mé", "vãi").
    # NOTE(review): "đù mé" contains a space, so it can never equal a single
    # token produced by split(); it is kept only to mirror the original list.
    profane_words = {
        "loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi",
    }
    return ' '.join(w for w in sentence.split() if w.lower() not in profane_words)
58
-
59
def remove_special_characters(sentence):
    """Return ``sentence`` with the characters ^ * @ # & $ % < > ~ { } | \\ removed."""
    special_chars = re.compile(r"[\^\*@#&$%<>~{}|\\]")
    return special_chars.sub("", sentence)
61
-
62
def normalize_whitespace(sentence):
    """Collapse every whitespace run to one space and trim both ends."""
    chunks = sentence.split()
    return ' '.join(chunks)
64
-
65
def remove_repeated_characters(sentence):
    """Shrink any run of three or more identical characters to a single one.

    Runs of exactly two characters are left alone.
    """
    repeated_run = re.compile(r"(.)\1{2,}")
    return repeated_run.sub(r"\1", sentence)
67
-
68
def replace_numbers(sentence):
    """Replace each run of digits with the literal token ``[number]``."""
    digit_run = re.compile(r"\d+")
    return digit_run.sub("[number]", sentence)
70
-
71
def tokenize_underthesea(sentence):
    """Tokenize with underthesea's ``word_tokenize`` and join the tokens with spaces."""
    return " ".join(word_tokenize(sentence))
74
-
75
# Load the optional abbreviation table from abbreviations.json; when the file
# cannot be used, fall back to an empty table so no abbreviation is expanded.
try:
    with open("abbreviations.json", "r", encoding="utf-8") as f:
        abbreviations = json.load(f)
except (OSError, ValueError):
    # OSError: file missing or unreadable.
    # ValueError: invalid JSON (json.JSONDecodeError) or invalid UTF-8.
    # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
    abbreviations = {}
81
-
82
def preprocess_sentence(sentence):
    """Run the full text-cleaning pipeline on one sentence and return the result.

    Order of operations: lowercase, emoji replacement, profanity removal,
    special-character removal, whitespace normalization, abbreviation
    expansion, repeated-character squeezing, number masking, and finally
    Vietnamese word tokenization.
    """
    # Lowercase first, then turn emojis into tags.
    text = replace_emojis(sentence.lower(), emoji_mapping)

    # Profanity -> special characters -> whitespace, in that order.
    for step in (remove_profanity, remove_special_characters, normalize_whitespace):
        text = step(text)

    # Expand abbreviations token by token.
    # NOTE(review): the value is passed through " ".join(), which assumes each
    # abbreviation maps to a list of words; a plain string value would come out
    # as space-separated characters -- confirm against abbreviations.json.
    text = " ".join(
        " ".join(abbreviations[token]) if token in abbreviations else token
        for token in text.split()
    )

    # Squeeze repeated characters, mask numbers, then tokenize Vietnamese words.
    for step in (remove_repeated_characters, replace_numbers, tokenize_underthesea):
        text = step(text)
    return text
109
-
110
- ###############################################################################
111
- # LOAD CHECKPOINT
112
- ###############################################################################
113
# The checkpoint folder is expected next to this script.
checkpoint_dir = "./checkpoint"

# Use the GPU when torch reports CUDA as available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print("Loading config...")
config = AutoConfig.from_pretrained(checkpoint_dir)
118
-
119
# Fallback id -> label mapping, in the class order supplied by the author.
custom_id2label = dict(enumerate((
    'Anger',
    'Disgust',
    'Enjoyment',
    'Fear',
    'Other',
    'Sadness',
    'Surprise',
)))

# Start from the fallback mapping and switch to the checkpoint's own id2label
# only when it exists, is non-empty, and holds at least one label that is not
# a generic "LABEL_x" placeholder.
id2label = custom_id2label
if getattr(config, "id2label", None):
    if any(not label.startswith("LABEL_") for label in config.id2label.values()):
        # Keys are coerced to int so they compare equal to the predicted class index.
        id2label = {int(idx): name for idx, name in config.id2label.items()}

print("id2label loaded:", id2label)
141
-
142
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)

print("Loading model...")
# Load the classifier with the config read above, move it to the selected
# device and switch it to evaluation mode; .to() and .eval() both return the
# module itself, so the calls can be chained.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_dir, config=config
).to(device).eval()
149
-
150
- ###############################################################################
151
- # PREDICT FUNCTION
152
- ###############################################################################
153
# Predicted label -> Vietnamese message shown to the user with the prediction.
# The texts were restored from a mis-encoded copy; the keys match the label
# names in custom_id2label.
label2message = {
    'Anger': 'Hãy bình tĩnh và giải quyết vấn đề một cách bình thản.',
    'Disgust': 'Hãy tránh xa những thứ khiến bạn không thích.',
    'Enjoyment': 'Chúc mừng bạn có một ngày tuyệt vời!',
    'Fear': 'Hãy đối mặt với nỗi sợ để vượt qua chúng.',
    'Other': 'Cảm xúc của bạn hiện tại không được phân loại rõ ràng.',
    'Sadness': 'Hãy tìm kiếm sự hỗ trợ khi cần thiết.',
    'Surprise': 'Thật bất ngờ! Hãy tận hưởng khoảnh khắc này.',
}
163
-
164
def predict_text(text: str) -> str:
    """Preprocess, tokenize and classify ``text``; return a Vietnamese result line.

    The sentence goes through ``preprocess_sentence``, is tokenized (truncated
    to 256 tokens) and fed to the model without gradient tracking. The argmax
    of the logits is looked up in ``id2label``.

    Returns:
        "Dự đoán cảm xúc: <label>. <message>" when ``label2message`` has a
        message for the label, "Dự đoán cảm xúc: <label>." when it has none,
        or "Nhãn không xác định (id=<n>)" when the predicted id is not in
        ``id2label``.
    """
    text_proc = preprocess_sentence(text)
    inputs = tokenizer(
        [text_proc],
        padding=True,
        truncation=True,
        max_length=256,
        return_tensors="pt"
    ).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)
    pred_id = outputs.logits.argmax(dim=-1).item()

    if pred_id not in id2label:
        return f"Nhãn không xác định (id={pred_id})"

    label = id2label[pred_id]
    message = label2message.get(label, "")
    if message:
        return f"Dự đoán cảm xúc: {label}. {message}"
    return f"Dự đoán cảm xúc: {label}."
188
-
189
- ###############################################################################
190
- # GRADIO APP
191
- ###############################################################################
192
def run_demo(input_text):
    """Gradio callback: return the prediction string for ``input_text``."""
    return predict_text(input_text)
195
-
196
# One-textbox-in, one-textbox-out Gradio UI around run_demo().
# The Vietnamese labels were restored from a mis-encoded copy.
demo = gr.Interface(
    fn=run_demo,
    inputs=gr.Textbox(lines=3, label="Nhập câu tiếng Việt"),
    outputs=gr.Textbox(label="Kết quả"),
    title="PhoBERT Emotion Classification",
    description="Nhập vào 1 câu tiếng Việt để dự đoán cảm xúc.",
)

if __name__ == "__main__":
    # NOTE(review): share=True presumably requests a public Gradio share link
    # in addition to the local server -- confirm this is wanted for deployment.
    demo.launch(share=True)