Spaces:
Running
Running
aliasgerovs
committed on
Commit
•
67f4199
1
Parent(s):
c9c6240
Update app.py
Browse files
app.py
CHANGED
@@ -23,6 +23,8 @@ import nltk
|
|
23 |
from unidecode import unidecode
|
24 |
import time
|
25 |
from utils import cos_sim_torch, embed_text
|
|
|
|
|
26 |
|
27 |
nltk.download('punkt')
|
28 |
|
@@ -113,15 +115,37 @@ def plagiarism_check(
|
|
113 |
source_embeddings.append(None)
|
114 |
|
115 |
# Populate matching scores for scraped pages
|
116 |
-
for i, soup in enumerate(soups):
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
|
126 |
print(f"Time for matching score: {time.perf_counter()-time1}")
|
127 |
time1 = time.perf_counter()
|
|
|
23 |
from unidecode import unidecode
|
24 |
import time
|
25 |
from utils import cos_sim_torch, embed_text
|
26 |
+
import multiprocessing
|
27 |
+
from functools import partial
|
28 |
|
29 |
nltk.download('punkt')
|
30 |
|
|
|
115 |
source_embeddings.append(None)
|
116 |
|
117 |
# Populate matching scores for scraped pages
|
118 |
+
# for i, soup in enumerate(soups):
|
119 |
+
# print(f"Analyzing {i+1} of {len(soups)} soups........................")
|
120 |
+
# if soup:
|
121 |
+
# page_content = soup.text
|
122 |
+
# for j, sent in enumerate(sentences):
|
123 |
+
# # score = matchingScore(sent, page_content)
|
124 |
+
# # score = matchingScoreWithTimeout(sent, page_content)
|
125 |
+
# score = cos_sim_torch(embed_text(sent), source_embeddings[i])
|
126 |
+
# ScoreArray[i][j] = score
|
127 |
+
|
128 |
+
def calculate_score(params):
    """Return the similarity between one sentence and one source embedding.

    ``params`` is a 3-tuple ``(index, sentence, source_embedding)``. The
    index is carried along by the caller but is not used here. The sentence
    is embedded with ``embed_text`` and compared to ``source_embedding``
    with ``cos_sim_torch``; that call's result is returned unchanged.
    """
    _, sentence, reference_embedding = params
    sentence_embedding = embed_text(sentence)
    return cos_sim_torch(sentence_embedding, reference_embedding)
|
131 |
+
|
132 |
+
|
133 |
+
def worker(soups, sentences, source_embeddings):
    """Score every sentence against every scraped page using a process pool.

    One task is queued per (page, sentence) pair for each truthy entry of
    ``soups``. Tasks are evaluated by ``calculate_score`` in a
    ``multiprocessing.Pool`` sized to the CPU count, and each result is
    written to ``ScoreArray[i][j]``, where ``i`` is the page index in
    ``soups`` and ``j`` is the sentence index in ``sentences``. Rows that
    belong to falsy soups are left untouched.

    Args:
        soups: Sequence of parsed pages; falsy entries are skipped.
        sentences: Sequence of sentences to score against each page.
        source_embeddings: Per-page embeddings, indexed like ``soups``.

    Returns:
        None. Results are stored in ``ScoreArray`` from the enclosing scope.

    NOTE(review): ``pool.map`` has to pickle ``calculate_score``. If these
    helpers are defined inside another function, that pickling fails;
    confirm they are reachable at module level.
    """
    params_list = []
    # Position of each task in ScoreArray, kept in step with params_list.
    # Recovering it from the flat task index with divmod() is only correct
    # when no soup is skipped; after a skipped page, every later page would
    # be written one row too early.
    positions = []
    for i, soup in enumerate(soups):
        if not soup:
            continue
        for j, sent in enumerate(sentences):
            params_list.append((i, sent, source_embeddings[i]))
            positions.append((i, j))

    # Nothing to score: avoid spawning a pool of idle processes.
    if not params_list:
        return

    num_processes = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(processes=num_processes)
    try:
        scores = pool.map(calculate_score, params_list)
    finally:
        # Always release the worker processes, even if a task raised.
        pool.close()
        pool.join()

    for (i, j), score in zip(positions, scores):
        ScoreArray[i][j] = score
|
147 |
+
|
148 |
+
worker(soups, sentences, source_embeddings)
|
149 |
|
150 |
print(f"Time for matching score: {time.perf_counter()-time1}")
|
151 |
time1 = time.perf_counter()
|