parkerjj committed
Commit 3fdaf75 · 1 Parent(s): fcfffd7

Refactor preprocess.py: add support for two sentiment-analysis models, rework the sentiment-score calculation logic, and improve error handling and logging.

Files changed (1):
  preprocess.py  +87 -18
preprocess.py CHANGED
@@ -1,6 +1,9 @@
 import re
 import sys
 import os
+import trace
+import traceback
+from typing import final
 import numpy as np
 from collections import defaultdict
 import pandas as pd
@@ -19,7 +22,7 @@ import akshare as ak
 
 from gensim.models import Word2Vec
 from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
-
+from transformers import BertTokenizer, BertForSequenceClassification
 
 
 
@@ -47,13 +50,12 @@ print("Is NPL GPU used Preprocessing.py:", spacy.prefer_gpu())
 
 
 # Use a suitable model and tokenizer
-model_name = "ProsusAI/finbert"  # choose a suitable pre-trained model
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-sa_model = AutoModelForSequenceClassification.from_pretrained(model_name)
+tokenizer_one = AutoTokenizer.from_pretrained("ProsusAI/finbert")
+sa_model_one = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
 
-# Initialize the sentiment analyzer
-sentiment_analyzer = pipeline('sentiment-analysis', model=sa_model, tokenizer=tokenizer)
+tokenizer_two = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
+sa_model_two = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
 
 
 
 
@@ -177,16 +179,28 @@ def process_entities(entities):
 def process_pos_tags(pos_tags):
     pos_counts = defaultdict(int)
     try:
+        # Make sure pos_tags is non-empty and a valid sequence of tags
+        if not pos_tags or not isinstance(pos_tags, (list, tuple)):
+            print(f"Invalid POS tags: {pos_tags}")
+            return np.zeros(1), []
+
+        # Handle each POS tag defensively
         for pos in pos_tags:
-            pos_counts[pos[1]] += 1  # use the POS tag (e.g. NN, VB)
+            if isinstance(pos, str) and pos:  # non-empty string tag
+                pos_counts[pos] += 1
+            elif isinstance(pos, (list, tuple)) and len(pos) > 1:  # (token, tag) pair: take the tag
+                pos_counts[pos[1]] += 1
 
         # Convert the dict into an ordered array
         pos_types = sorted(pos_counts.keys())
+        if not pos_types:  # no valid tags found, return a zero vector
+            print(f"No valid POS tags found: {pos_tags}")
+            return np.zeros(1), []
+
         counts = np.array([pos_counts[pos] for pos in pos_types])
     except Exception as e:
-        print(f"Error in process_pos_tags: {str(e)}")
-        counts = np.zeros(len(pos_tags))
-        pos_types = []
+        print(f"Error in process_pos_tags: {str(e)} for POS tags: {pos_tags}")
+        return np.zeros(1), []
 
     return counts, pos_types
 
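For reference, a short usage sketch of the hardened process_pos_tags, showing the input shapes it now accepts; the example inputs are made up for illustration:

# Plain tag strings (e.g. spaCy's token.pos_ values)
counts, pos_types = process_pos_tags(["NN", "VB", "NN"])
# -> counts == array([2, 1]), pos_types == ['NN', 'VB']

# (token, tag) pairs (e.g. NLTK's pos_tag output): the tag is taken from position 1
counts, pos_types = process_pos_tags([("stock", "NN"), ("rose", "VBD")])
# -> counts == array([1, 1]), pos_types == ['NN', 'VBD']

# Invalid input no longer raises; it logs and returns a zero vector
counts, pos_types = process_pos_tags(None)
# -> counts == array([0.]), pos_types == []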
 
@@ -211,14 +225,69 @@ def get_document_vector(words, model = word2vec_model):
 # Function: get the sentiment score
 def get_sentiment_score(text):
     try:
-        # Pass the raw text directly to sentiment_analyzer; it handles tokenization automatically
-        result = sentiment_analyzer(text, truncation=True, max_length=512)[0]
-        score = result['score'] if result['label'] == 'positive' else -result['score']
+        import torch
+
+        # Result from the first model (ProsusAI/finbert)
+        # result_one = sentiment_analyzer_one(text, truncation=True, max_length=512)[0]
+        # Get the raw model outputs
+        with torch.no_grad():
+            outputs_one = sa_model_one(**tokenizer_one(text, return_tensors="pt", truncation=False))
+            predictions_one = torch.nn.functional.softmax(outputs_one.logits, dim=-1)
+
+
+            outputs_two = sa_model_two(**tokenizer_two(text, return_tensors="pt", truncation=False))
+            predictions_two = torch.nn.functional.softmax(outputs_two.logits, dim=-1)
+
+        # Probabilities for all labels
+        scores_one = predictions_one[0].tolist()
+        scores_two = predictions_two[0].tolist()
+
+        # Label mapping
+        # labels_one = sa_model_one.config.id2label
+        # labels_two = sa_model_two.config.id2label
+
+        # Per-label probabilities (ProsusAI/finbert order: positive, negative, neutral)
+        score_one_positive = scores_one[0]
+        score_one_negative = scores_one[1]
+        score_one_neutral = scores_one[2]
+
+
+        final_score_one = 0.0
+        final_score_one += score_one_positive
+        final_score_one -= score_one_negative
+        if score_one_positive > score_one_negative:
+            final_score_one += score_one_neutral
+        else:
+            final_score_one -= score_one_neutral
+
+        final_score_one = max(-1.0, min(1.0, final_score_one))
+
+        score_two_neutral = scores_two[0]
+        score_two_positive = scores_two[1]
+        score_two_negative = scores_two[2]
+
+        final_score_two = 0.0
+        final_score_two += score_two_positive
+        final_score_two -= score_two_negative
+        if score_two_positive > score_two_negative:
+            final_score_two += score_two_neutral
+        else:
+            final_score_two -= score_two_neutral
+
+
+        # Combine the two models' scores (weighted average)
+        final_score = np.average([final_score_one, final_score_two], weights=[0.3, 0.7])
+
+        # Make sure the final score stays within [-1, 1]
+        final_score = np.clip(final_score, -1.0, 1.0)
+
+        return final_score
+
     except Exception as e:
         print(f"Error in get_sentiment_score for text: {text[:50]}... Error: {str(e)}")
-        score = 0.0
-
-    return score
+        traceback.print_exc()
+        return 0.0
+
 
 
 
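The scoring rule above collapses each model's three probabilities into a single value in [-1, 1]: positive minus negative, with the neutral mass pushed toward whichever of the two dominates, followed by a 0.3/0.7 weighted average of the two models. A worked example with made-up probabilities:

# ProsusAI/finbert (hypothetical output): P(pos)=0.70, P(neg)=0.10, P(neu)=0.20
#   final_score_one = 0.70 - 0.10 + 0.20 = 0.80   (neutral added because pos > neg)
# finbert-tone (hypothetical output):     P(neu)=0.30, P(pos)=0.55, P(neg)=0.15
#   final_score_two = 0.55 - 0.15 + 0.30 = 0.70
# Weighted average: 0.3 * 0.80 + 0.7 * 0.70 = 0.73, already inside [-1, 1]

score = get_sentiment_score("The company reported record quarterly earnings.")
print(score)  # float in [-1, 1]; positive values indicate positive tone

Note that both tokenizer calls pass truncation=False, so inputs longer than the 512-token BERT limit will raise inside the try block and fall through to the except handler, which now prints a traceback and returns 0.0.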
 
@@ -386,7 +455,7 @@ def lemmatized_entry(entry):
 nlp = spacy.load("en_core_web_md")
 
 # Check whether the GPU is being used
-print("Is NPL GPU used Lemmatized:", spacy.prefer_gpu())
+# print("Is NPL GPU used Lemmatized:", spacy.prefer_gpu())
 
 
 
@@ -521,7 +590,7 @@ def lemmatize_text(text):
 nlp = spacy.load("en_core_web_md")
 
 # Check whether the GPU is being used
-print("Is NPL GPU used Enchance_text.py:", spacy.prefer_gpu())
+# print("Is NPL GPU used Enchance_text.py:", spacy.prefer_gpu())
 
 
 
 