aliasgerovs committed on
Commit
67f4199
1 Parent(s): c9c6240

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -9
app.py CHANGED
@@ -23,6 +23,8 @@ import nltk
23
  from unidecode import unidecode
24
  import time
25
  from utils import cos_sim_torch, embed_text
 
 
26
 
27
  nltk.download('punkt')
28
 
@@ -113,15 +115,37 @@ def plagiarism_check(
113
  source_embeddings.append(None)
114
 
115
  # Populate matching scores for scraped pages
116
- for i, soup in enumerate(soups):
117
- print(f"Analyzing {i+1} of {len(soups)} soups........................")
118
- if soup:
119
- page_content = soup.text
120
- for j, sent in enumerate(sentences):
121
- # score = matchingScore(sent, page_content)
122
- # score = matchingScoreWithTimeout(sent, page_content)
123
- score = cos_sim_torch(embed_text(sent), source_embeddings[i])
124
- ScoreArray[i][j] = score
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
  print(f"Time for matching score: {time.perf_counter()-time1}")
127
  time1 = time.perf_counter()
 
23
  from unidecode import unidecode
24
  import time
25
  from utils import cos_sim_torch, embed_text
26
+ import multiprocessing
27
+ from functools import partial
28
 
29
  nltk.download('punkt')
30
 
 
115
  source_embeddings.append(None)
116
 
117
  # Populate matching scores for scraped pages
118
+ # for i, soup in enumerate(soups):
119
+ # print(f"Analyzing {i+1} of {len(soups)} soups........................")
120
+ # if soup:
121
+ # page_content = soup.text
122
+ # for j, sent in enumerate(sentences):
123
+ # # score = matchingScore(sent, page_content)
124
+ # # score = matchingScoreWithTimeout(sent, page_content)
125
+ # score = cos_sim_torch(embed_text(sent), source_embeddings[i])
126
+ # ScoreArray[i][j] = score
127
+
128
def calculate_score(params):
    """Score one sentence against one source embedding.

    ``params`` is an ``(i, sent, source_embedding)`` tuple. The leading
    index is carried along by the caller and is not used here. Returns
    whatever ``cos_sim_torch`` yields for the embedded sentence and the
    given source embedding.
    """
    _page_index, sentence, page_embedding = params
    sentence_embedding = embed_text(sentence)
    return cos_sim_torch(sentence_embedding, page_embedding)
131
+
132
+
133
def worker(soups, sentences, source_embeddings):
    """Fill ``ScoreArray`` with sentence-vs-page scores using a process pool.

    For every truthy entry ``soups[i]`` and every ``sentences[j]``, a task
    ``(i, sent, source_embeddings[i])`` is handed to ``calculate_score`` and
    the result is stored in ``ScoreArray[i][j]``. Falsy soups are skipped,
    so their rows of ``ScoreArray`` are left untouched.

    Args:
        soups: Sequence of parsed pages; falsy entries are skipped.
        sentences: Sequence of sentences to score against each page.
        source_embeddings: Per-page embeddings, indexed like ``soups``.
    """
    # Remember the (page, sentence) position of every task explicitly.
    # Deriving it afterwards with divmod(k, len(sentences)) is only correct
    # when no page is skipped; one falsy soup shifts every later score into
    # the wrong row.
    positions = []
    params_list = []
    for i, soup in enumerate(soups):
        if not soup:
            continue
        for j, sent in enumerate(sentences):
            positions.append((i, j))
            params_list.append((i, sent, source_embeddings[i]))

    # Nothing to score: avoid spawning worker processes for no work.
    if not params_list:
        return

    # NOTE(review): Pool.map pickles the callable it is given, so
    # calculate_score presumably has to live at module level - confirm
    # where it is defined.
    # The context manager releases the pool even if map() raises; map()
    # blocks until every result is available, so none are lost on exit.
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        scores = pool.map(calculate_score, params_list)

    for (i, j), score in zip(positions, scores):
        ScoreArray[i][j] = score
147
+
148
+ worker(soups, sentences, source_embeddings)
149
 
150
  print(f"Time for matching score: {time.perf_counter()-time1}")
151
  time1 = time.perf_counter()