aliasgerovs committed
Commit: 8ad69ed
Parent(s): ff03afa

Update utils.py

Files changed (1)
  1. utils.py  +2 -286
utils.py CHANGED
@@ -34,6 +34,7 @@ def remove_special_characters(text):
  text = remove_accents(text)
  pattern = r'[^\w\s\d.,!?\'"()-;]+'
  text = re.sub(pattern, "", text)
+ text = text.replace("<s>", "").replace("</s>", "")
  return text


@@ -76,289 +77,4 @@ def extract_text_from_pdf(pdf_path):


  WORD = re.compile(r"\w+")
- model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
-
-
- # returns cosine similarity of two vectors
- # input: two vectors
- # output: integer between 0 and 1.
- # def get_cosine(vec1, vec2):
- # intersection = set(vec1.keys()) & set(vec2.keys())
-
- # # calculating numerator
- # numerator = sum([vec1[x] * vec2[x] for x in intersection])
-
- # # calculating denominator
- # sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
- # sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
- # denominator = math.sqrt(sum1) * math.sqrt(sum2)
-
- # # checking for divide by zero
- # if denominator == 0:
- # return 0.0
- # else:
- # return float(numerator) / denominator
-
-
- # # converts given text into a vector
- # def text_to_vector(text):
- # # uses the Regular expression above and gets all words
- # words = WORD.findall(text)
- # # returns a counter of all the words (count of number of occurences)
- # return Counter(words)
-
-
- # # returns cosine similarity of two words
- # # uses: text_to_vector(text) and get_cosine(v1,v2)
- # def cosineSim(text1, text2):
- # vector1 = text_to_vector(text1)
- # vector2 = text_to_vector(text2)
- # # print vector1,vector2
- # cosine = get_cosine(vector1, vector2)
- # return cosine
-
-
- # def cos_sim_torch(embedding_1, embedding_2):
- # return util.pytorch_cos_sim(embedding_1, embedding_2).item()
-
-
- # def embed_text(text):
- # return model.encode(text, convert_to_tensor=True)
-
-
- # def sentence_similarity(text1, text2):
- # embedding_1 = model.encode(text1, convert_to_tensor=True)
- # embedding_2 = model.encode(text2, convert_to_tensor=True)
-
- # o = util.pytorch_cos_sim(embedding_1, embedding_2)
- # return o.item()
-
-
- # def get_soup_requests(url):
- # page = requests.get(url)
- # if page.status_code == 200:
- # soup = BeautifulSoup(page.content, "html.parser")
- # return soup
- # print("HTML soup failed")
- # return None
-
-
- # def get_soup_httpx(url):
- # client = httpx.Client(timeout=30)
- # try:
- # page = client.get(url)
- # if page.status_code == httpx.codes.OK:
- # soup = BeautifulSoup(page.content, "html.parser")
- # return soup
- # except:
- # print("HTTPx soup failed")
- # return None
-
-
- # def getSentences(text):
- # from nltk.tokenize import sent_tokenize
-
- # sents = sent_tokenize(text)
- # two_sents = []
- # for i in range(len(sents)):
- # if (i % 2) == 0:
- # two_sents.append(sents[i])
- # else:
- # two_sents[len(two_sents) - 1] += " " + sents[i]
- # return two_sents
-
-
- # def googleSearch(
- # plag_option,
- # sentences,
- # urlCount,
- # scoreArray,
- # urlList,
- # sorted_date,
- # domains_to_skip,
- # api_key,
- # cse_id,
- # **kwargs,
- # ):
- # service = build("customsearch", "v1", developerKey=api_key)
- # for i, sentence in enumerate(sentences):
- # results = (
- # service.cse()
- # .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
- # .execute()
- # )
- # if "items" in results and len(results["items"]) > 0:
- # for count, link in enumerate(results["items"]):
- # # stop after 3 pages
- # if count >= 3:
- # break
- # # skip user selected domains
- # if any(
- # ("." + domain) in link["link"] for domain in domains_to_skip
- # ):
- # continue
- # # clean up snippet of '...'
- # snippet = link["snippet"]
- # ind = snippet.find("...")
- # if ind < 20 and ind > 9:
- # snippet = snippet[ind + len("... ") :]
- # ind = snippet.find("...")
- # if ind > len(snippet) - 5:
- # snippet = snippet[:ind]
-
- # # update cosine similarity between snippet and given text
- # url = link["link"]
- # if url not in urlList:
- # urlList.append(url)
- # scoreArray.append([0] * len(sentences))
- # urlCount[url] = urlCount[url] + 1 if url in urlCount else 1
- # if plag_option == "Standard":
- # scoreArray[urlList.index(url)][i] = cosineSim(
- # sentence, snippet
- # )
- # else:
- # scoreArray[urlList.index(url)][i] = sentence_similarity(
- # sentence, snippet
- # )
- # else:
- # print("Google Search failed")
- # return urlCount, scoreArray
-
-
- # def getQueries(text, n):
- # # return n-grams of size n
- # words = text.split()
- # return [words[i : i + n] for i in range(len(words) - n + 1)]
-
-
- # def print2D(array):
- # print(np.array(array))
-
-
- # def removePunc(text):
- # res = re.sub(r"[^\w\s]", "", text)
- # return res
-
-
- # async def get_url_data(url, client):
- # try:
- # r = await client.get(url)
- # # print(r.status_code)
- # if r.status_code == 200:
- # # print("in")
- # soup = BeautifulSoup(r.content, "html.parser")
- # return soup
- # except Exception:
- # print("HTTPx parallel soup failed")
- # return None
-
-
- # async def parallel_scrap(urls):
- # async with httpx.AsyncClient(timeout=30) as client:
- # tasks = []
- # for url in urls:
- # tasks.append(get_url_data(url=url, client=client))
- # results = await asyncio.gather(*tasks, return_exceptions=True)
- # return results
-
-
- # class TimeoutError(Exception):
- # pass
-
-
- # def matchingScore(sentence, content):
- # if sentence in content:
- # return 1
- # sentence = removePunc(sentence)
- # content = removePunc(content)
- # if sentence in content:
- # return 1
- # else:
- # n = 5
- # ngrams = getQueries(sentence, n)
- # if len(ngrams) == 0:
- # return 0
- # matched = [x for x in ngrams if " ".join(x) in content]
- # return len(matched) / len(ngrams)
-
-
- # # def matchingScoreWithTimeout(sentence, content):
- # # def timeout_handler():
- # # raise TimeoutError("Function timed out")
-
- # # timer = threading.Timer(10, timeout_handler) # Set a timer for 2 seconds
- # # timer.start()
- # # try:
- # # score = sentence_similarity(sentence, content)
- # # # score = matchingScore(sentence, content)
- # # timer.cancel() # Cancel the timer if calculation completes before timeout
- # # return score
- # # except TimeoutError:
- # # return 0
-
-
- # # async def matchingScoreAsync(sentences, content, content_idx, ScoreArray):
- # # content = removePunc(content)
- # # for j, sentence in enumerate(sentences):
- # # sentence = removePunc(sentence)
- # # if sentence in content:
- # # ScoreArray[content_idx][j] = 1
- # # else:
- # # n = 5
- # # ngrams = getQueries(sentence, n)
- # # if len(ngrams) == 0:
- # # return 0
- # # matched = [x for x in ngrams if " ".join(x) in content]
- # # ScoreArray[content_idx][j] = len(matched) / len(ngrams)
- # # print(
- # # f"Analyzed {content_idx+1} of soups (SOUP SUCCEEDED)........................"
- # # )
- # # return ScoreArray
-
-
- # async def matchingScoreAsync(
- # sentences, content, content_idx, ScoreArray, model, util
- # ):
- # content = removePunc(content)
- # for j, sentence in enumerate(sentences):
- # sentence = removePunc(sentence)
- # similarity_score = sentence_similarity(sentence, content, model, util)
- # ScoreArray[content_idx][j] = similarity_score
- # print(
- # f"Analyzed {content_idx+1} of contents (CONTENT ANALYZED)........................"
- # )
- # return ScoreArray
-
-
- # async def parallel_analyze(soups, sentences, ScoreArray):
- # tasks = []
- # for i, soup in enumerate(soups):
- # if soup:
- # page_content = soup.text
- # tasks.append(
- # matchingScoreAsync(sentences, page_content, i, ScoreArray)
- # )
- # else:
- # print(
- # f"Analyzed {i+1} of soups (SOUP FAILED)........................"
- # )
- # ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
- # return ScoreArray
-
-
- # async def parallel_analyze_2(soups, sentences, ScoreArray):
- # tasks = [[0] * len(ScoreArray[0]) for i in range(len(ScoreArray))]
- # for i, soup in enumerate(soups):
- # if soup:
- # page_content = soup.text
- # for j, sent in enumerate(sentences):
- # print(
- # f"Analyzing {i+1} of {len(soups)} soups with {j+1} of {len(sentences)} sentences........................"
- # )
- # tasks[i][j] = sentence_similarity(sent, page_content)
- # else:
- # print(
- # f"Analyzed {i+1} of soups (SOUP FAILED)........................"
- # )
- # ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
- # return ScoreArray
+ model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
 
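For quick reference, a small standalone sketch (not part of the commit) exercising the two lines this diff adds: the <s>/</s> token stripping appended to remove_special_characters, and the module-level SentenceTransformer model kept at the bottom of utils.py. The similarity call mirrors the commented-out sentence_similarity helper that this commit deletes; the example strings are made up.

from sentence_transformers import SentenceTransformer, util

# Token-stripping line added to remove_special_characters, shown in isolation:
text = "<s>Sample sentence one.</s> <s>Sample sentence two.</s>"
text = text.replace("<s>", "").replace("</s>", "")
print(text)  # Sample sentence one. Sample sentence two.

# Module-level model kept at the bottom of utils.py, used the same way the
# removed (commented-out) sentence_similarity helper did:
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
emb_1 = model.encode("Sample sentence one.", convert_to_tensor=True)
emb_2 = model.encode("Sample sentence two.", convert_to_tensor=True)
print(util.pytorch_cos_sim(emb_1, emb_2).item())  # cosine similarity score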