Shchushch committed
Commit 50872cb · 1 Parent(s): e2a5546
Files changed (1):
  1. find.py +5 -1
find.py CHANGED
@@ -3,7 +3,7 @@ import pandas as pd
 from transformers import AutoTokenizer, AutoModel,BertTokenizer,BertModel
 import numpy as np
 import pickle
-# import sklearn
+import nltk
 from nltk.stem import WordNetLemmatizer
 from nltk.tag import pos_tag
 from nltk.corpus import stopwords
@@ -14,9 +14,13 @@ import faiss
 from tqdm import tqdm
 tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
 model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
+
+nltk.download('stopwords')
+
 eng_stop_words = stopwords.words('english')
 with open('russian.txt', 'r') as f:
     ru_stop_words = f.read()
+
 ru_stop_words=ru_stop_words.split('\n')
 allow="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдеёжзийклмнопрстуфхцчшщъыьэюя0123456789-' \n\t"
 # Set up the stemmer
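The substance of the commit: the dead `# import sklearn` comment is replaced with `import nltk`, and `nltk.download('stopwords')` now runs at module load, so the later `stopwords.words('english')` call finds the corpus on a fresh environment (the Russian list is read from the local `russian.txt` instead). An unconditional `nltk.download` works, but it contacts the NLTK index on every start of find.py. A minimal sketch of an idempotent alternative, assuming the standard `corpora/stopwords` resource path that `nltk.data.find` looks up:

```python
import nltk

# Download the stopwords corpus only if it is missing from the local
# nltk_data directory; nltk.data.find raises LookupError when the
# resource is absent, so repeated runs skip the network entirely.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
```

This guard changes nothing on the first run, but on subsequent runs it avoids the index round-trip that `nltk.download` performs even when the corpus is already present.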