minko186 committed
Commit c0a6bc9
1 Parent(s): e3b9187

change highlight from gradio to html

Files changed (2):
  1. app.py +34 -14
  2. plagiarism.py +135 -58
app.py CHANGED
@@ -4,7 +4,7 @@ from datetime import date
 from predictors import predict_bc_scores, predict_mc_scores, predict_1on1_scores
 from analysis import depth_analysis
 from predictors import predict_quillbot
-from plagiarism import plagiarism_check, build_date
+from plagiarism import plagiarism_check, build_date, html_highlight
 from highlighter import analyze_and_highlight
 from utils import extract_text_from_pdf, len_validator
 import yaml
@@ -20,7 +20,9 @@ model_list = params["MC_OUTPUT_LABELS"]
 
 
 analyze_and_highlight_bc = partial(analyze_and_highlight, model_type="bc")
-analyze_and_highlight_quillbot = partial(analyze_and_highlight, model_type="quillbot")
+analyze_and_highlight_quillbot = partial(
+    analyze_and_highlight, model_type="quillbot"
+)
 
 
 def ai_generated_test(option, input, models):
@@ -46,7 +48,18 @@ def main(
     domains_to_skip,
 ):
 
-    formatted_tokens = plagiarism_check(
+    # formatted_tokens = plagiarism_check(
+    #     plag_option,
+    #     input,
+    #     year_from,
+    #     month_from,
+    #     day_from,
+    #     year_to,
+    #     month_to,
+    #     day_to,
+    #     domains_to_skip,
+    # )
+    formatted_tokens = html_highlight(
         plag_option,
         input,
         year_from,
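Editor's note: the practical effect of this hunk is a change in the shape of formatted_tokens. A minimal illustration with hypothetical values (the sentences and scores are invented; the two formats come from the tuple-based code commented out above and from html_highlight in plagiarism.py below):

# Hypothetical values; only the two shapes are taken from the commit.

# Before: a list of (text, label) tuples for gr.HighlightedText.
formatted_tokens = [
    ("The quick brown fox jumps over the lazy dog.", "[1]"),
    ("Colorless green ideas sleep furiously.", "[2]"),
]

# After: a single pre-rendered HTML string for gr.HTML.
formatted_tokens = (
    "<div style='font-family: Roboto; border: 2px solid black; "
    "background-color: #333333; padding: 10px; color: #FFFFFF;'>"
    '<p style="background-color: #e06b63; padding: 5px;">'
    "The quick brown fox jumps over the lazy dog. [1]</p>"
    "</div>"
)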
@@ -211,15 +224,19 @@ with gr.Blocks() as demo:
 
     with gr.Row():
         with gr.Column():
-            sentenceBreakdown = gr.HighlightedText(
+            # sentenceBreakdown = gr.HighlightedText(
+            #     label="Source Detection Sentence Breakdown",
+            #     combine_adjacent=True,
+            #     color_map={
+            #         "[1]": "red",
+            #         "[2]": "orange",
+            #         "[3]": "yellow",
+            #         "[4]": "green",
+            #     },
+            # )
+            sentenceBreakdown = gr.HTML(
                 label="Source Detection Sentence Breakdown",
-                combine_adjacent=True,
-                color_map={
-                    "[1]": "red",
-                    "[2]": "orange",
-                    "[3]": "yellow",
-                    "[4]": "green",
-                },
+                value="Source Detection Sentence Breakdown",
             )
 
     with gr.Row():
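Editor's note: for readers unfamiliar with the two components, a standalone sketch of the swap (a hypothetical demo, assuming a gradio version close to what this Space uses; not code from the commit):

import gradio as gr

with gr.Blocks() as sketch:
    # gr.HighlightedText styles (text, label) tuples itself via color_map.
    # gr.HTML renders whatever markup the backend hands it, so styling
    # moves out of the component and into the generated HTML string.
    gr.HTML(
        value="<p style='background-color: #e06b63; padding: 5px;'>"
        "A flagged sentence. [1]</p>"
    )

sketch.launch()

The trade-off is flexibility over integration: gr.HTML gives the app full control of colors and layout, at the cost of the built-in legend and the combine_adjacent behavior that HighlightedText provided.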
@@ -268,7 +285,8 @@ with gr.Blocks() as demo:
     )
 
     only_plagiarism_btn.click(
-        fn=plagiarism_check,
+        # fn=plagiarism_check,
+        fn=html_highlight,
         inputs=[
             plag_option,
             input_text,
@@ -311,5 +329,7 @@ with gr.Blocks() as demo:
         date_to = ""
 
 
-if __name__ == "__main__":
-    demo.launch(share=True, server_name="0.0.0.0", auth=("polygraf-admin", "test@aisd"))
+if __name__ == "__main__":
+    demo.launch(
+        share=True, server_name="0.0.0.0", auth=("polygraf-admin", "test@aisd")
+    )
plagiarism.py CHANGED
@@ -20,6 +20,7 @@ model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
 # input: two vectors
 # output: integer between 0 and 1.
 
+
 def get_cosine(vec1, vec2):
     intersection = set(vec1.keys()) & set(vec2.keys())
 
@@ -75,9 +76,9 @@ def sentence_similarity(text1, text2):
 def google_search(
     plag_option,
     sentences,
-    urlCount,
-    scoreArray,
-    urlList,
+    url_count,
+    score_array,
+    url_list,
     sorted_date,
     domains_to_skip,
     api_key,
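Editor's note: the three renamed parameters are accumulators threaded through the search loop. A hypothetical snapshot of their state after a few results, to make the shapes concrete (URLs and numbers are invented; the structure follows the append logic in the next hunk):

sentences = ["Sentence one.", "Sentence two.", "Sentence three."]

# One entry per distinct URL seen in the search results.
url_list = ["https://example.com/a", "https://example.org/b"]

# One row per URL, one column per sentence, as created by
# score_array.append([0] * len(sentences)); cells are filled in with
# cosineSim or sentence_similarity values as snippets come back.
score_array = [
    [0.82, 0.10, 0.00],  # example.com/a against each sentence
    [0.05, 0.67, 0.71],  # example.org/b against each sentence
]

# How many result snippets pointed at each URL.
url_count = {"https://example.com/a": 1, "https://example.org/b": 2}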
@@ -112,19 +113,19 @@
 
             # update cosine similarity between snippet and given text
             url = link["link"]
-            if url not in urlList:
-                urlList.append(url)
-                scoreArray.append([0] * len(sentences))
-            urlCount[url] = urlCount[url] + 1 if url in urlCount else 1
+            if url not in url_list:
+                url_list.append(url)
+                score_array.append([0] * len(sentences))
+            url_count[url] = url_count[url] + 1 if url in url_count else 1
             if plag_option == "Standard":
-                scoreArray[urlList.index(url)][i] = cosineSim(
+                score_array[url_list.index(url)][i] = cosineSim(
                     sentence, snippet
                 )
             else:
-                scoreArray[urlList.index(url)][i] = sentence_similarity(
+                score_array[url_list.index(url)][i] = sentence_similarity(
                     sentence, snippet
                 )
-    return urlCount, scoreArray
+    return url_count, score_array
 
 
 def split_sentence_blocks(text):
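Editor's note: the "Standard" branch scores a snippet with cosineSim, which get_cosine (whose body sits mostly outside these hunks) evidently computes over term-count vectors; the "integer between 0 and 1" comment above it presumably means a float in [0, 1]. A self-contained sketch of that kind of cosine, sharing the one visible intersection line (the rest is the standard recipe, not code from this repo):

import math
from collections import Counter

def cosine_from_counts(vec1, vec2):
    # Same first step as get_cosine in the diff: shared terms only.
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum(vec1[x] * vec2[x] for x in intersection)
    denominator = math.sqrt(sum(v * v for v in vec1.values())) * math.sqrt(
        sum(v * v for v in vec2.values())
    )
    return numerator / denominator if denominator else 0.0

vec1 = Counter("a b a".split())  # {'a': 2, 'b': 1}
vec2 = Counter("a b c".split())  # {'a': 1, 'b': 1, 'c': 1}
print(cosine_from_counts(vec1, vec2))  # 3 / sqrt(5 * 3) ~= 0.775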
@@ -191,7 +192,6 @@ async def parallel_scrap(urls):
     return results
 
 
-
 def matching_score(sentence_content_tuple):
     sentence, content = sentence_content_tuple
     if sentence in content:
@@ -204,11 +204,65 @@ def matching_score(sentence_content_tuple):
     matched = [x for x in ngrams if " ".join(x) in content]
     return len(matched) / len(ngrams)
 
+
 def process_with_multiprocessing(input_data):
-    with Pool(processes=4) as pool:
+    with Pool(processes=1) as pool:
         scores = pool.map(matching_score, input_data)
     return scores
-
+
+
+def print2d(array):
+    for row in array:
+        print(row)
+
+
+def html_highlight(
+    plag_option,
+    input,
+    year_from,
+    month_from,
+    day_from,
+    year_to,
+    month_to,
+    day_to,
+    domains_to_skip,
+):
+    sentence_scores, url_scores = plagiarism_check(
+        plag_option,
+        input,
+        year_from,
+        month_from,
+        day_from,
+        year_to,
+        month_to,
+        day_to,
+        domains_to_skip,
+    )
+    color_map = [
+        "#e06b63",
+        "#eb9d59",
+        "#c2ad36",
+        "#e1ed72",
+        "#c2db76",
+        "#a2db76",
+    ]
+    html_content = "<div style='font-family: Roboto; border: 2px solid black; background-color: #333333; padding: 10px; color: #FFFFFF;'>"
+    for sentence, _, _, idx in sentence_scores:
+        color = color_map[idx - 1]
+        formatted_sentence = f'<p style="background-color: {color}; padding: 5px;">{sentence} [{idx}]</p>'
+        html_content += formatted_sentence
+
+    html_content += "<hr>"
+    for url, score, idx in url_scores:
+        color = color_map[idx - 1]
+        formatted_name = f'<p style="background-color: {color}; padding: 5px;">({idx}) {url} --- Matching Score:{score}</p>'
+        html_content += formatted_name
+
+    html_content += "</div>"
+
+    return html_content
+
+
 def plagiarism_check(
     plag_option,
     input,
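Editor's note: two things happen in this hunk. The worker pool shrinks from four processes to one, which keeps the Pool.map interface but effectively serializes the scoring (a common workaround when multiprocessing misbehaves in a hosted Space), and html_highlight is introduced as a wrapper that turns plagiarism_check's new return values into markup. A runnable sketch of the scoring path; the word-trigram choice and the 1.0 shortcut value are assumptions, since the hunk only shows matching_score's exact-match test and final ratio:

from multiprocessing import Pool

def matching_score_sketch(pair):
    sentence, content = pair
    if sentence in content:  # exact-match shortcut visible in the diff
        return 1.0
    words = sentence.split()
    # Word trigrams; the n-gram size is assumed, not shown in the hunk.
    ngrams = [tuple(words[i:i + 3]) for i in range(len(words) - 2)]
    if not ngrams:
        return 0.0
    matched = [x for x in ngrams if " ".join(x) in content]
    return len(matched) / len(ngrams)

if __name__ == "__main__":
    pairs = [("the cat sat on the mat", "yesterday the cat sat on a mat")]
    with Pool(processes=1) as pool:  # serial, as in the commit
        print(pool.map(matching_score_sketch, pairs))  # [0.5]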
@@ -227,41 +281,44 @@ def plagiarism_check(
     api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
     cse_id = "851813e81162b4ed4"
 
+    url_scores = []
+    sentence_scores = []
+    # for input in input.split("\n\n"):
+    print(input)
     sentences = split_sentence_blocks(input)
-    urlCount = {}
-    ScoreArray = []
-    urlList = []
+    url_count = {}
+    score_array = []
+    url_list = []
     date_from = build_date(year_from, month_from, day_from)
     date_to = build_date(year_to, month_to, day_to)
     sort_date = f"date:r:{date_from}:{date_to}"
     # get list of URLS to check
-    urlCount, ScoreArray = google_search(
+    url_count, score_array = google_search(
         plag_option,
         sentences,
-        urlCount,
-        ScoreArray,
-        urlList,
+        url_count,
+        score_array,
+        url_list,
         sort_date,
         domains_to_skip,
         api_key,
         cse_id,
     )
-
     # Scrape URLs in list
     formatted_tokens = []
-    soups = asyncio.run(parallel_scrap(urlList))
-
+    soups = asyncio.run(parallel_scrap(url_list))
+
     # # Populate matching scores for scrapped pages
     # for i, soup in enumerate(soups):
     #     print(f"Analyzing {i+1} of {len(soups)} soups........................")
     #     if soup:
     #         page_content = soup.text
-
+
     #         for j, sent in enumerate(sentences):
     #             args_list = (sent, page_content)
    #             score = matching_score(args_list)
     #             # score = cos_sim_torch(embed_text(sent), source_embeddings[i])
-    #             ScoreArray[i][j] = score
+    #             score_array[i][j] = score
 
     input_data = []
     for i, soup in enumerate(soups):
@@ -269,69 +326,89 @@ def plagiarism_check(
             page_content = soup.text
             for j, sent in enumerate(sentences):
                 input_data.append((sent, page_content))
-
     scores = process_with_multiprocessing(input_data)
-    k = 0
+
+    k = 0
+    # Update score array for each (soup, sentence)
     for i, soup in enumerate(soups):
         if soup:
             for j, _ in enumerate(sentences):
-                ScoreArray[i][j] = scores[k]
-                k += 1
-
+                score_array[i][j] = scores[k]
+                k += 1
+
+    # Map sentence with max URL with small margin to keep consider same URL
+    # for consecutive sentences
     sentenceToMaxURL = [-1] * len(sentences)
-
     for j in range(len(sentences)):
         if j > 0:
-            maxScore = ScoreArray[sentenceToMaxURL[j - 1]][j]
+            maxScore = score_array[sentenceToMaxURL[j - 1]][j]
             sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
         else:
            maxScore = -1
 
-        for i in range(len(ScoreArray)):
+        for i in range(len(score_array)):
             margin = (
-                0.1
+                0.05
                 if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
                 else 0
             )
-            if ScoreArray[i][j] - maxScore > margin:
-                maxScore = ScoreArray[i][j]
+            if score_array[i][j] - maxScore > margin:
+                maxScore = score_array[i][j]
                 sentenceToMaxURL[j] = i
+            # if score_array[i][j] > maxScore:
+            #     maxScore = score_array[i][j]
+            #     sentenceToMaxURL[j] = i
 
     index = np.unique(sentenceToMaxURL)
 
-    urlScore = {}
+    url_source = {}
     for url in index:
         s = [
-            ScoreArray[url][sen]
+            score_array[url][sen]
             for sen in range(len(sentences))
             if sentenceToMaxURL[sen] == url
         ]
-        urlScore[url] = sum(s) / len(s)
+        url_source[url] = sum(s) / len(s)
 
-    index_descending = sorted(urlScore, key=urlScore.get, reverse=True)
+    index_descending = sorted(url_source, key=url_source.get, reverse=True)
 
     urlMap = {}
     for count, i in enumerate(index_descending):
         urlMap[i] = count + 1
-
+
     for i, sent in enumerate(sentences):
-        formatted_tokens.append(
-            (sent, "[" + str(urlMap[sentenceToMaxURL[i]]) + "]")
-        )
-
-    formatted_tokens.append(("\n", None))
-    formatted_tokens.append(("\n", None))
-    formatted_tokens.append(("\n", None))
-
-    for ind in index_descending:
-        formatted_tokens.append(
-            (
-                urlList[ind]
-                + " --- Matching Score: "
-                + f"{str(round(urlScore[ind] * 100, 2))}%",
-                "[" + str(urlMap[ind]) + "]",
-            )
-        )
-    formatted_tokens.append(("\n", None))
+        ind = sentenceToMaxURL[i]
+        if url_source[ind] > 0.1:
+            sentence_scores.append(
+                [sent, url_source[ind], url_list[ind], urlMap[ind]]
+            )
+        else:
+            sentence_scores.append([sent, None, url_list[ind], urlMap[ind]])
+    for ind in index_descending:
+        url_scores.append(
+            [url_list[ind], round(url_source[ind] * 100, 2), urlMap[ind]]
+        )
 
-    return formatted_tokens
+    return sentence_scores, url_scores
+
+    # for i, sent in enumerate(sentences):
+    #     formatted_tokens.append(
+    #         (sent, "[" + str(urlMap[sentenceToMaxURL[i]]) + "]")
+    #     )
+
+    # formatted_tokens.append(("\n", None))
+    # formatted_tokens.append(("\n", None))
+    # formatted_tokens.append(("\n", None))
+
+    # for ind in index_descending:
+    #     formatted_tokens.append(
+    #         (
+    #             url_list[ind]
+    #             + " --- Matching Score: "
+    #             + f"{str(round(url_source[ind] * 100, 2))}%",
+    #             "[" + str(urlMap[ind]) + "]",
+    #         )
+    #     )
+    #     formatted_tokens.append(("\n", None))
+
+    # return formatted_tokens
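Editor's note: the sentence-to-source assignment above deserves a worked example, since the margin change (0.1 to 0.05) is easy to miss. A sentence inherits its predecessor's URL unless some other URL beats the inherited score by more than the margin, which keeps runs of consecutive sentences attributed to the same source (values hypothetical):

# score_array rows are URLs, columns are sentences.
score_array = [
    [0.60, 0.58],  # URL 0
    [0.10, 0.61],  # URL 1
]
# Sentence 0: no predecessor, so the best raw score wins -> URL 0 (0.60).
# Sentence 1: inherits URL 0 at 0.58; URL 1 scores 0.61, but
# 0.61 - 0.58 = 0.03 <= 0.05, so sentence 1 stays on URL 0. The smaller
# margin simply makes it easier for a genuinely better source to break
# such a streak than the old 0.1 did.

plagiarism_check then averages each source's per-sentence scores into url_source, ranks sources by that average, and returns sentence_scores entries of the form [sentence, score or None when the average is at most 0.1, url, rank] alongside url_scores entries of [url, percentage, rank], which html_highlight renders into the HTML shown earlier.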