import time from nltk.tokenize import sent_tokenize from googleapiclient.discovery import build from collections import Counter import re, math from sentence_transformers import SentenceTransformer, util import asyncio import httpx from bs4 import BeautifulSoup import numpy as np import concurrent from multiprocessing import Pool from const import url_types from collections import defaultdict WORD = re.compile(r"\w+") model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") months = { "January": "01", "February": "02", "March": "03", "April": "04", "May": "05", "June": "06", "July": "07", "August": "08", "September": "09", "October": "10", "November": "11", "December": "12", } color_map = [ "#cf2323", "#d65129", "#d66329", "#d67129", "#eb9d59", "#c2ad36", "#d6ae29", "#d6b929", "#e1ed72", "#c2db76", "#a2db76", ] def text_to_vector(text): words = WORD.findall(text) return Counter(words) def cosineSim(text1, text2): vector1 = text_to_vector(text1) vector2 = text_to_vector(text2) # print vector1,vector2 cosine = get_cosine(vector1, vector2) return cosine def get_cosine(vec1, vec2): intersection = set(vec1.keys()) & set(vec2.keys()) numerator = sum([vec1[x] * vec2[x] for x in intersection]) sum1 = sum([vec1[x] ** 2 for x in vec1.keys()]) sum2 = sum([vec2[x] ** 2 for x in vec2.keys()]) denominator = math.sqrt(sum1) * math.sqrt(sum2) if denominator == 0: return 0.0 else: return float(numerator) / denominator def split_sentence_blocks(text, size): if size == "Paragraph": blocks = text.strip().split("\n") return blocks else: sents = sent_tokenize(text.strip()) return sents def build_date(year=2024, month="March", day=1): return f"{year}{months[month]}{day}" def split_ngrams(text, n): words = text.split() return [tuple(words[i : i + n]) for i in range(len(words) - n + 1)] def sentence_similarity(text1, text2): embedding_1 = model.encode(text1, convert_to_tensor=True) embedding_2 = model.encode(text2, convert_to_tensor=True) o = util.pytorch_cos_sim(embedding_1, embedding_2) return o.item() async def get_url_data(url, client): try: r = await client.get(url) if r.status_code == 200: soup = BeautifulSoup(r.content, "html.parser") return soup except Exception: return None async def parallel_scrap(urls): async with httpx.AsyncClient(timeout=30) as client: tasks = [] for url in urls: tasks.append(get_url_data(url=url, client=client)) results = await asyncio.gather(*tasks, return_exceptions=True) return results def merge_ngrams_into_sentence(ngrams): if ngrams == None: return "" if len(ngrams) > 20: ngrams = ngrams[:20] merged_sentence = [] i = 0 for ngram in ngrams: overlap = len(set(ngram) & set(merged_sentence[-len(ngram) :])) if overlap == 0: merged_sentence.extend(ngram) elif overlap < len(ngram): merged_sentence.extend(ngram[overlap:]) return " ".join(merged_sentence) def remove_ngrams_after(ngrams, target_ngram): try: index = ngrams.index(target_ngram) return ngrams[: index + 1] except ValueError: return None def matching_score(sentence_content_tuple): sentence, content, score = sentence_content_tuple if sentence in content: return 1, sentence # if score > 0.9: # return score else: n = 5 # ngrams = split_ngrams(sentence, n) # if len(ngrams) == 0: # return 0 # matched = [x for x in ngrams if " ".join(x) in content] # return len(matched) / len(ngrams) # list comprehension matching # ngrams_sentence = split_ngrams(sentence, n) # ngrams_content = [tuple(ngram) for ngram in split_ngrams(content, n)] # if len(ngrams_sentence) == 0: # return 0, "" # matched_ngrams = [ # 1 for ngram in ngrams_sentence if tuple(ngram) in ngrams_content # ] # matched_count = sum(matched_ngrams) # set intersection matching ngrams_sentence = set(split_ngrams(sentence, n)) ngrams_content = set(split_ngrams(content, n)) if len(ngrams_sentence) == 0: return 0, "" matched_ngrams = ngrams_sentence.intersection(ngrams_content) matched_count = len(matched_ngrams) # matched content matched_content_ngrams = [] found = False last_found = None for ngram in ngrams_sentence: for ngram_content in ngrams_content: if tuple(ngram) == ngram_content: found = True last_found = ngram_content if found: matched_content_ngrams.append(ngram_content) matched_content_ngrams = remove_ngrams_after( matched_content_ngrams, last_found ) matched_content = merge_ngrams_into_sentence(matched_content_ngrams) return matched_count / len(ngrams_sentence), matched_content def process_with_multiprocessing(input_data): with Pool(processes=8) as pool: scores = pool.map(matching_score, input_data) return scores def map_sentence_url(sentences, score_array): sentenceToMaxURL = [-1] * len(sentences) for j in range(len(sentences)): if j > 0: maxScore = score_array[sentenceToMaxURL[j - 1]][j] sentenceToMaxURL[j] = sentenceToMaxURL[j - 1] else: maxScore = -1 for i in range(len(score_array)): margin = ( 0.05 if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1]) else 0 ) if score_array[i][j] - maxScore > margin: maxScore = score_array[i][j] sentenceToMaxURL[j] = i return sentenceToMaxURL def check_url_category(url): for category, urls in url_types.items(): for u in urls: if u in url: return category return "Internet Source" def google_search( plag_option, sentences, url_count, score_array, url_list, snippets, sorted_date, domains_to_skip, api_key, cse_id, **kwargs, ): service = build("customsearch", "v1", developerKey=api_key) num_pages = 1 for i, sentence in enumerate(sentences): results = ( service.cse() .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs) .execute() ) if "items" in results and len(results["items"]) > 0: for count, link in enumerate(results["items"]): if count >= num_pages: break # skip user selected domains if (domains_to_skip is not None) and any( ("." + domain) in link["link"] for domain in domains_to_skip ): continue # clean up snippet of '...' snippet = link["snippet"] ind = snippet.find("...") if ind < 20 and ind > 9: snippet = snippet[ind + len("... ") :] ind = snippet.find("...") if ind > len(snippet) - 5: snippet = snippet[:ind] # update cosine similarity between snippet and given text url = link["link"] if url not in url_list: url_list.append(url) score_array.append([0] * len(sentences)) snippets.append([""] * len(sentences)) url_count[url] = url_count[url] + 1 if url in url_count else 1 snippets[url_list.index(url)][i] = snippet if plag_option == "Standard": score_array[url_list.index(url)][i] = cosineSim( sentence, snippet ) else: score_array[url_list.index(url)][i] = sentence_similarity( sentence, snippet ) return url_count, score_array def plagiarism_check( plag_option, input, year_from, month_from, day_from, year_to, month_to, day_to, domains_to_skip, source_block_size, ): # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g" # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8" api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g" # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE" # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk" # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg" # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8" cse_id = "851813e81162b4ed4" url_scores = [] sentence_scores = [] sentences = split_sentence_blocks(input, source_block_size) url_count = {} score_array = [] url_list = [] snippets = [] date_from = build_date(year_from, month_from, day_from) date_to = build_date(year_to, month_to, day_to) sort_date = f"date:r:{date_from}:{date_to}" # get list of URLS to check start_time = time.perf_counter() url_count, score_array = google_search( plag_option, sentences, url_count, score_array, url_list, snippets, sort_date, domains_to_skip, api_key, cse_id, ) print("GOOGLE SEARCH PROCESSING TIME: ", time.perf_counter() - start_time) # Scrape URLs in list start_time = time.perf_counter() soups = asyncio.run(parallel_scrap(url_list)) print("SCRAPING PROCESSING TIME: ", time.perf_counter() - start_time) input_data = [] for i, soup in enumerate(soups): if soup: page_content = soup.text for j, sent in enumerate(sentences): input_data.append((sent, page_content, score_array[i][j])) start_time = time.perf_counter() scores = process_with_multiprocessing(input_data) print("MATCHING SCORE PROCESSING TIME: ", time.perf_counter() - start_time) matched_sentence_array = [ ["" for _ in range(len(score_array[0]))] for _ in range(len(score_array)) ] k = 0 # Update score array for each (soup, sentence) for i, soup in enumerate(soups): if soup: for j, _ in enumerate(sentences): score_array[i][j] = scores[k][0] matched_sentence_array[i][j] = scores[k][1] k += 1 sentenceToMaxURL = map_sentence_url(sentences, score_array) index = np.unique(sentenceToMaxURL) url_source = {} for url in index: s = [ score_array[url][sen] for sen in range(len(sentences)) if sentenceToMaxURL[sen] == url ] url_source[url] = sum(s) / len(s) index_descending = sorted(url_source, key=url_source.get, reverse=True) urlMap = {} for count, i in enumerate(index_descending): urlMap[i] = count + 1 # build results for i, sent in enumerate(sentences): ind = sentenceToMaxURL[i] if url_source[ind] > 0.1: sentence_scores.append( [ sent, round(url_source[ind] * 100, 2), url_list[ind], urlMap[ind], ] ) else: sentence_scores.append([sent, None, url_list[ind], -1]) print("SNIPPETS: ", snippets) snippets = [[item for item in sublist if item] for sublist in snippets] for ind in index_descending: if url_source[ind] > 0.1: matched_sentence_array = [ [item for item in sublist if item] for sublist in matched_sentence_array ] matched_sentence = "...".join( [sent for sent in matched_sentence_array[ind]] ) if matched_sentence == "": matched_sentence = "...".join([sent for sent in snippets[ind]]) url_scores.append( [ url_list[ind], round(url_source[ind] * 100, 2), urlMap[ind], matched_sentence, ] ) return sentence_scores, url_scores def html_highlight( plag_option, input, year_from, month_from, day_from, year_to, month_to, day_to, domains_to_skip, source_block_size, ): start_time = time.perf_counter() sentence_scores, url_scores = plagiarism_check( plag_option, input, year_from, month_from, day_from, year_to, month_to, day_to, domains_to_skip, source_block_size, ) html_content = """
{combined_sentence} {index_part}
' html_content += formatted_sentence combined_sentence = "" combined_sentence += " " + sentence prev_idx = idx print(category_scores) total_average_score = round(total_score / total_count, 2) category_averages = { category: round((sum(scores) / len(scores)), 2) for category, scores in category_scores.items() } if combined_sentence: color = color_map[prev_idx - 1] index_part = "" if prev_idx != -1: index_part = f"[{prev_idx}]" formatted_sentence = f'{combined_sentence} {index_part}
' html_content += formatted_sentence html_content += "{total_average_score}%
{score}%
[{idx}] {url}
{url_category}
--- Matching Score: {score}%
--- Original Source Content: {sentence}
""" # formatted_url = f""" # #--- Matching Score: {score}%
#--- Original Source Content: {sentence}
#