Spaces:

polygraf-ai
/

copyright_checker

Runtime error

App Files Community

Ali Asgarov committed on Mar 27, 2024

Commit

8ad69ed

1 Parent(s): ff03afa

Update utils.py

Browse files

Files changed (1) hide show

utils.py +2 -286

utils.py CHANGED Viewed

@@ -34,6 +34,7 @@ def remove_special_characters(text):
     text = remove_accents(text)
     pattern = r'[^\w\s\d.,!?\'"()-;]+'
     text = re.sub(pattern, "", text)
     return text
@@ -76,289 +77,4 @@ def extract_text_from_pdf(pdf_path):
 WORD = re.compile(r"\w+")
-model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
-# returns cosine similarity of two vectors
-# input: two vectors
-# output: integer between 0 and 1.
-# def get_cosine(vec1, vec2):
-#     intersection = set(vec1.keys()) & set(vec2.keys())
-#     # calculating numerator
-#     numerator = sum([vec1[x] * vec2[x] for x in intersection])
-#     # calculating denominator
-#     sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
-#     sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
-#     denominator = math.sqrt(sum1) * math.sqrt(sum2)
-#     # checking for divide by zero
-#     if denominator == 0:
-#         return 0.0
-#     else:
-#         return float(numerator) / denominator
-# # converts given text into a vector
-# def text_to_vector(text):
-#     # uses the Regular expression above and gets all words
-#     words = WORD.findall(text)
-#     # returns a counter of all the words (count of number of occurences)
-#     return Counter(words)
-# # returns cosine similarity of two words
-# # uses: text_to_vector(text) and get_cosine(v1,v2)
-# def cosineSim(text1, text2):
-#     vector1 = text_to_vector(text1)
-#     vector2 = text_to_vector(text2)
-#     # print vector1,vector2
-#     cosine = get_cosine(vector1, vector2)
-#     return cosine
-# def cos_sim_torch(embedding_1, embedding_2):
-#     return util.pytorch_cos_sim(embedding_1, embedding_2).item()
-# def embed_text(text):
-#     return model.encode(text, convert_to_tensor=True)
-# def sentence_similarity(text1, text2):
-#     embedding_1 = model.encode(text1, convert_to_tensor=True)
-#     embedding_2 = model.encode(text2, convert_to_tensor=True)
-#     o = util.pytorch_cos_sim(embedding_1, embedding_2)
-#     return o.item()
-# def get_soup_requests(url):
-#     page = requests.get(url)
-#     if page.status_code == 200:
-#         soup = BeautifulSoup(page.content, "html.parser")
-#         return soup
-#     print("HTML soup failed")
-#     return None
-# def get_soup_httpx(url):
-#     client = httpx.Client(timeout=30)
-#     try:
-#         page = client.get(url)
-#         if page.status_code == httpx.codes.OK:
-#             soup = BeautifulSoup(page.content, "html.parser")
-#             return soup
-#     except:
-#         print("HTTPx soup failed")
-#         return None
-# def getSentences(text):
-#     from nltk.tokenize import sent_tokenize
-#     sents = sent_tokenize(text)
-#     two_sents = []
-#     for i in range(len(sents)):
-#         if (i % 2) == 0:
-#             two_sents.append(sents[i])
-#         else:
-#             two_sents[len(two_sents) - 1] += " " + sents[i]
-#     return two_sents
-# def googleSearch(
-#     plag_option,
-#     sentences,
-#     urlCount,
-#     scoreArray,
-#     urlList,
-#     sorted_date,
-#     domains_to_skip,
-#     api_key,
-#     cse_id,
-#     **kwargs,
-# ):
-#     service = build("customsearch", "v1", developerKey=api_key)
-#     for i, sentence in enumerate(sentences):
-#         results = (
-#             service.cse()
-#             .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
-#             .execute()
-#         )
-#         if "items" in results and len(results["items"]) > 0:
-#             for count, link in enumerate(results["items"]):
-#                 # stop after 3 pages
-#                 if count >= 3:
-#                     break
-#                 # skip user selected domains
-#                 if any(
-#                     ("." + domain) in link["link"] for domain in domains_to_skip
-#                 ):
-#                     continue
-#                 # clean up snippet of '...'
-#                 snippet = link["snippet"]
-#                 ind = snippet.find("...")
-#                 if ind < 20 and ind > 9:
-#                     snippet = snippet[ind + len("... ") :]
-#                 ind = snippet.find("...")
-#                 if ind > len(snippet) - 5:
-#                     snippet = snippet[:ind]
-#                 # update cosine similarity between snippet and given text
-#                 url = link["link"]
-#                 if url not in urlList:
-#                     urlList.append(url)
-#                     scoreArray.append([0] * len(sentences))
-#                 urlCount[url] = urlCount[url] + 1 if url in urlCount else 1
-#                 if plag_option == "Standard":
-#                     scoreArray[urlList.index(url)][i] = cosineSim(
-#                         sentence, snippet
-#                     )
-#                 else:
-#                     scoreArray[urlList.index(url)][i] = sentence_similarity(
-#                         sentence, snippet
-#                     )
-#         else:
-#             print("Google Search failed")
-#     return urlCount, scoreArray
-# def getQueries(text, n):
-#     # return n-grams of size n
-#     words = text.split()
-#     return [words[i : i + n] for i in range(len(words) - n + 1)]
-# def print2D(array):
-#     print(np.array(array))
-# def removePunc(text):
-#     res = re.sub(r"[^\w\s]", "", text)
-#     return res
-# async def get_url_data(url, client):
-#     try:
-#         r = await client.get(url)
-#         # print(r.status_code)
-#         if r.status_code == 200:
-#             # print("in")
-#             soup = BeautifulSoup(r.content, "html.parser")
-#             return soup
-#     except Exception:
-#         print("HTTPx parallel soup failed")
-#         return None
-# async def parallel_scrap(urls):
-#     async with httpx.AsyncClient(timeout=30) as client:
-#         tasks = []
-#         for url in urls:
-#             tasks.append(get_url_data(url=url, client=client))
-#         results = await asyncio.gather(*tasks, return_exceptions=True)
-#     return results
-# class TimeoutError(Exception):
-#     pass
-# def matchingScore(sentence, content):
-#     if sentence in content:
-#         return 1
-#     sentence = removePunc(sentence)
-#     content = removePunc(content)
-#     if sentence in content:
-#         return 1
-#     else:
-#         n = 5
-#         ngrams = getQueries(sentence, n)
-#         if len(ngrams) == 0:
-#             return 0
-#         matched = [x for x in ngrams if " ".join(x) in content]
-#     return len(matched) / len(ngrams)
-# # def matchingScoreWithTimeout(sentence, content):
-# #     def timeout_handler():
-# #         raise TimeoutError("Function timed out")
-# #     timer = threading.Timer(10, timeout_handler)  # Set a timer for 2 seconds
-# #     timer.start()
-# #     try:
-# #         score = sentence_similarity(sentence, content)
-# #         # score = matchingScore(sentence, content)
-# #         timer.cancel()  # Cancel the timer if calculation completes before timeout
-# #         return score
-# #     except TimeoutError:
-# #         return 0
-# # async def matchingScoreAsync(sentences, content, content_idx, ScoreArray):
-# #     content = removePunc(content)
-# #     for j, sentence in enumerate(sentences):
-# #         sentence = removePunc(sentence)
-# #         if sentence in content:
-# #             ScoreArray[content_idx][j] = 1
-# #         else:
-# #             n = 5
-# #             ngrams = getQueries(sentence, n)
-# #             if len(ngrams) == 0:
-# #                 return 0
-# #             matched = [x for x in ngrams if " ".join(x) in content]
-# #             ScoreArray[content_idx][j] = len(matched) / len(ngrams)
-# #     print(
-# #         f"Analyzed {content_idx+1} of soups (SOUP SUCCEEDED)........................"
-# #     )
-# #     return ScoreArray
-# async def matchingScoreAsync(
-#     sentences, content, content_idx, ScoreArray, model, util
-# ):
-#     content = removePunc(content)
-#     for j, sentence in enumerate(sentences):
-#         sentence = removePunc(sentence)
-#         similarity_score = sentence_similarity(sentence, content, model, util)
-#         ScoreArray[content_idx][j] = similarity_score
-#     print(
-#         f"Analyzed {content_idx+1} of contents (CONTENT ANALYZED)........................"
-#     )
-#     return ScoreArray
-# async def parallel_analyze(soups, sentences, ScoreArray):
-#     tasks = []
-#     for i, soup in enumerate(soups):
-#         if soup:
-#             page_content = soup.text
-#             tasks.append(
-#                 matchingScoreAsync(sentences, page_content, i, ScoreArray)
-#             )
-#         else:
-#             print(
-#                 f"Analyzed {i+1} of soups (SOUP FAILED)........................"
-#             )
-#     ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
-#     return ScoreArray
-# async def parallel_analyze_2(soups, sentences, ScoreArray):
-#     tasks = [[0] * len(ScoreArray[0]) for i in range(len(ScoreArray))]
-#     for i, soup in enumerate(soups):
-#         if soup:
-#             page_content = soup.text
-#             for j, sent in enumerate(sentences):
-#                 print(
-#                     f"Analyzing {i+1} of {len(soups)} soups with {j+1} of {len(sentences)} sentences........................"
-#                 )
-#                 tasks[i][j] = sentence_similarity(sent, page_content)
-#         else:
-#             print(
-#                 f"Analyzed {i+1} of soups (SOUP FAILED)........................"
-#             )
-#     ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
-#     return ScoreArray

     text = remove_accents(text)
     pattern = r'[^\w\s\d.,!?\'"()-;]+'
     text = re.sub(pattern, "", text)
+    text = text.replace("<s>", "").replace("</s>", "")
     return text
 WORD = re.compile(r"\w+")
+model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")