Spaces:

polygraf-ai
/

copyright_checker

Runtime error

App Files Files Community

aliasgerovs committed on Jun 12, 2024

Commit

0eaca07

1 Parent(s): 173f4a0

Updated

Browse files

Files changed (6) hide show

.env +1 -0
README.md +1 -1
app.py +35 -1
const.py +97 -0
nohup.out +0 -0
plagiarism.py +197 -27

.env ADDED Viewed

	@@ -0,0 +1 @@


1	+ ASSEMBLYAI_API_KEY = 'f9d0fe8c23304ae193d694294b615dcc'

README.md CHANGED Viewed

@@ -4,7 +4,7 @@ emoji: ©
 colorFrom: gray
 colorTo: pink
 sdk: gradio
-sdk_version: 4.17.0
 app_file: app.py
 pinned: false
 license: mit

 colorFrom: gray
 colorTo: pink
 sdk: gradio
+sdk_version: 4.36.0
 app_file: app.py
 pinned: false
 license: mit

app.py CHANGED Viewed

@@ -13,6 +13,9 @@ from functools import partial
 from audio import assemblyai_transcribe
 import yt_dlp
 import os
 np.set_printoptions(suppress=True)
@@ -28,6 +31,21 @@ analyze_and_highlight_quillbot = partial(
 )
 def ai_generated_test(option, bias_buster_selected, input):
     if bias_buster_selected:
         input = update(input)
@@ -118,6 +136,14 @@ with gr.Blocks() as demo:
         )
     with gr.Row():
         url_input = gr.Textbox(
             label="Input Page URL to check", lines=1, placeholder="")
@@ -128,7 +154,15 @@ with gr.Blocks() as demo:
         audio_url_input.change(
             fn=assemblyai_transcribe, inputs=audio_url_input, outputs=input_text
         )
     char_count = gr.Textbox(label="Minumum Character Limit Check")
     input_text.change(fn=len_validator, inputs=input_text, outputs=char_count)

 from audio import assemblyai_transcribe
 import yt_dlp
 import os
+import pandas as pd
+from const import plag_script
+from datasets import load_dataset, Dataset
 np.set_printoptions(suppress=True)
 )
def save_request(email, video_url):
    """Queue a video-processing request in the requests dataset on the Hub.

    Loads the ``train`` split of the dataset named by the module-level
    ``requests_repo``, appends one row with status ``"pending"`` and pushes
    the result back to the same repo.

    Args:
        email: Contact address typed by the user. ``None``, empty and
            whitespace-only values are rejected.
        video_url: URL of the video to process; stored as given.

    Returns:
        A ``(message, update)`` pair: the status text, plus a ``gr.update``
        that keeps the request form visible on rejection and hides it once
        the request has been stored.
    """
    # NOTE(review): `requests_repo` is not defined in the visible part of
    # this change -- confirm it exists at module level, otherwise the first
    # submission raises NameError.
    if email is None or not email.strip():
        return "Please enter your email.", gr.update(visible=True)
    email = email.strip()

    requests_df = load_dataset(requests_repo)["train"].to_pandas()
    # Every column is given as a one-element list so the new frame has
    # exactly one row regardless of pandas' scalar broadcasting rules.
    new_row = pd.DataFrame(
        {"email": [email], "video_url": [video_url], "status": ["pending"]}
    )
    requests_df = pd.concat([requests_df, new_row], ignore_index=True)
    Dataset.from_pandas(requests_df).push_to_hub(requests_repo)
    return "Your request has been saved.", gr.update(visible=False)
 def ai_generated_test(option, bias_buster_selected, input):
     if bias_buster_selected:
         input = update(input)
         )
+    with gr.Column(visible=False) as request_row:
+        with gr.Row():
+            email_input = gr.Textbox(label="Email")
+            youtube_url_input = gr.Textbox(label="YouTube Video URL")
+        with gr.Row():
+            video_submit_btn = gr.Button("Submit Video Request")
     with gr.Row():
         url_input = gr.Textbox(
             label="Input Page URL to check", lines=1, placeholder="")
         audio_url_input.change(
             fn=assemblyai_transcribe, inputs=audio_url_input, outputs=input_text
         )
+    video_submit_btn.click(
+        fn=save_request,
+        inputs=[email_input, youtube_url_input],
+        outputs=[input_text, request_row],
+        api_name="video_request",
+    )
     char_count = gr.Textbox(label="Minumum Character Limit Check")
     input_text.change(fn=len_validator, inputs=input_text, outputs=char_count)

const.py ADDED Viewed

	@@ -0,0 +1,97 @@

+url_types = {
+    "Student Publications": [
+        "studentpulse.com",
+        "undergraduateresearch.ucsd.edu",
+        "jmurj.jmu.edu",
+        "cur.org",
+        "urj.ucf.edu",
+        "ugresearch.umn.edu",
+        "undergraduateresearch.wustl.edu",
+        "ugresearch.ucla.edu",
+        "jur.byu.edu",
+        "undergradresearch.ncsu.edu",
+    ],
+    "Wikihost": [
+        "fandom.com",
+        "wikidot.com",
+        "wikia.org",
+        "wikispaces.com",
+        "gamepedia.com",
+        "wikibooks.org",
+        "wikiversity.org",
+        "wikitravel.org",
+        "wikinews.org",
+        "wiktionary.org",
+        "wikipedia.org",
+    ],
+    "Official News": [
+        "bbc.com",
+        "cnn.com",
+        "nytimes.com",
+        "reuters.com",
+        "theguardian.com",
+        "washingtonpost.com",
+        "foxnews.com",
+        "aljazeera.com",
+        "bloomberg.com",
+        "npr.org",
+    ],
+    "Online Learning": [
+        "coursera.org",
+        "edx.org",
+        "udacity.com",
+        "udemy.com",
+        "khanacademy.org",
+        "futurelearn.com",
+        "skillshare.com",
+        "linkedin.com/learning",
+        "pluralsight.com",
+        "codecademy.com",
+    ],
+    "Government Official": [
+        "usa.gov",
+        "gov.uk",
+        "europa.eu",
+        "canada.ca",
+        "australia.gov.au",
+        "india.gov.in",
+        "japan.go.jp",
+        "korea.go.kr",
+        "gov.sg",
+        "nz.govt.nz",
+        "defense.gov",
+        ".gov",
+    ],
+    "Publications": [
+        "scholar.google.com",
+        "pubmed.ncbi.nlm.nih.gov",
+        "researchgate.net",
+        "jstor.org",
+        "ieeexplore.ieee.org",
+        "sciencedirect.com",
+        "arxiv.org",
+        "link.springer.com",
+        "onlinelibrary.wiley.com",
+        "doaj.org",
+        "journals.plos.org/plosone",
+        "journals.sagepub.com",
+        "dl.acm.org",
+        "biorxiv.org",
+        "tandfonline.com",
+    ],
+}
+plag_script = """
+async () => {
+    globalThis.toggleDetails = (event) => {
+        event.preventDefault(); // Prevent the default link behavior
+        let detailsContainer = document.getElementById("detailsContainer");
+        if (detailsContainer.style.display === "none") {
+            detailsContainer.style.display = "block";
+        } else {
+            detailsContainer.style.display = "none";
+        }
+    }
+}
+"""

nohup.out CHANGED Viewed

The diff for this file is too large to render. See raw diff

plagiarism.py CHANGED Viewed

@@ -10,7 +10,8 @@ from bs4 import BeautifulSoup
 import numpy as np
 import concurrent
 from multiprocessing import Pool
 WORD = re.compile(r"\w+")
 model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
@@ -73,10 +74,10 @@ def get_cosine(vec1, vec2):
 def split_sentence_blocks(text, size):
     if size == "Paragraph":
-        blocks = text.split("\n")
         return blocks
     else:
-        sents = sent_tokenize(text)
         return sents
@@ -115,12 +116,36 @@ async def parallel_scrap(urls):
     return results
 def matching_score(sentence_content_tuple):
     sentence, content, score = sentence_content_tuple
     if sentence in content:
-        return 1
-    if score > 0.9:
-        return score
     else:
         n = 5
@@ -132,12 +157,28 @@ def matching_score(sentence_content_tuple):
         ngrams_sentence = split_ngrams(sentence, n)
         if len(ngrams_sentence) == 0:
-            return 0
-        ngrams_content = set(tuple(ngram) for ngram in split_ngrams(content, n))
-        matched_count = sum(
-            1 for ngram in ngrams_sentence if tuple(ngram) in ngrams_content
         )
-        return matched_count / len(ngrams_sentence)
 def process_with_multiprocessing(input_data):
@@ -166,12 +207,21 @@ def map_sentence_url(sentences, score_array):
     return sentenceToMaxURL
 def google_search(
     plag_option,
     sentences,
     url_count,
     score_array,
     url_list,
     sorted_date,
     domains_to_skip,
     api_key,
@@ -209,7 +259,9 @@ def google_search(
                 if url not in url_list:
                     url_list.append(url)
                     score_array.append([0] * len(sentences))
                 url_count[url] = url_count[url] + 1 if url in url_count else 1
                 if plag_option == "Standard":
                     score_array[url_list.index(url)][i] = cosineSim(
                         sentence, snippet
@@ -234,21 +286,22 @@ def plagiarism_check(
     source_block_size,
 ):
     # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
-    api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
     # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
     # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
     # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
     # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
     # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
     cse_id = "851813e81162b4ed4"
     url_scores = []
     sentence_scores = []
     sentences = split_sentence_blocks(input, source_block_size)
-    print(sentences)
     url_count = {}
     score_array = []
     url_list = []
     date_from = build_date(year_from, month_from, day_from)
     date_to = build_date(year_to, month_to, day_to)
     sort_date = f"date:r:{date_from}:{date_to}"
@@ -259,6 +312,7 @@ def plagiarism_check(
         url_count,
         score_array,
         url_list,
         sort_date,
         domains_to_skip,
         api_key,
@@ -273,13 +327,18 @@ def plagiarism_check(
             for j, sent in enumerate(sentences):
                 input_data.append((sent, page_content, score_array[i][j]))
     scores = process_with_multiprocessing(input_data)
     k = 0
     # Update score array for each (soup, sentence)
     for i, soup in enumerate(soups):
         if soup:
             for j, _ in enumerate(sentences):
-                score_array[i][j] = scores[k]
                 k += 1
     sentenceToMaxURL = map_sentence_url(sentences, score_array)
@@ -303,14 +362,35 @@ def plagiarism_check(
         ind = sentenceToMaxURL[i]
         if url_source[ind] > 0.1:
             sentence_scores.append(
-                [sent, url_source[ind], url_list[ind], urlMap[ind]]
             )
         else:
             sentence_scores.append([sent, None, url_list[ind], -1])
     for ind in index_descending:
-        if url_source[ind] > 0.1:
             url_scores.append(
-                [url_list[ind], round(url_source[ind] * 100, 2), urlMap[ind]]
             )
     return sentence_scores, url_scores
@@ -342,33 +422,123 @@ def html_highlight(
         source_block_size,
     )
-    html_content = "<link href='https://fonts.googleapis.com/css?family=Roboto' rel='stylesheet'>\n<div style='font-family: {font}; border: 2px solid black; padding: 10px; color: #FFFFFF;'>"
     prev_idx = None
     combined_sentence = ""
-    for sentence, _, _, idx in sentence_scores:
         if idx != prev_idx and prev_idx is not None:
             color = color_map[prev_idx - 1]
-            index_part = f'<span style="background-color: {color}; padding: 2px;">[{prev_idx}]</span>'
-            formatted_sentence = f"<p>{combined_sentence} {index_part}</p>"
             html_content += formatted_sentence
             combined_sentence = ""
         combined_sentence += " " + sentence
         prev_idx = idx
     if combined_sentence:
         color = color_map[prev_idx - 1]
-        index_part = f'<span style="background-color: {color}; padding: 2px;">[{prev_idx}]</span>'
-        formatted_sentence = f"<p>{combined_sentence} {index_part}</p>"
         html_content += formatted_sentence
     html_content += "<hr>"
-    for url, score, idx in url_scores:
         color = color_map[idx - 1]
-        formatted_url = f'<p style="background-color: {color}; padding: 5px;">({idx}) <b>{url}</b></p><p> --- Matching Score: {score}%</p>'
         html_content += formatted_url
-    html_content += "</div>"
     print("PLAGIARISM PROCESSING TIME: ", time.perf_counter() - start_time)
-    return html_content

 import numpy as np
 import concurrent
 from multiprocessing import Pool
+from const import url_types
+from collections import defaultdict
 WORD = re.compile(r"\w+")
 model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
 def split_sentence_blocks(text, size):
     """Split *text* into the blocks that are checked one by one.

     Args:
         text: Raw input text.
         size: ``"Paragraph"`` splits on line breaks; any other value splits
             into sentences with ``sent_tokenize``.

     Returns:
         A list of blocks. In paragraph mode, blank and whitespace-only
         lines are left out.
     """
     if size == "Paragraph":
         # A blank line between paragraphs would otherwise become an empty
         # block, and an empty string is a substring of every page, so it
         # would be scored as a full match.
         return [block for block in text.strip().split("\n") if block.strip()]
     return sent_tokenize(text.strip())
     return results
def merge_ngrams_into_sentence(ngrams):
    """Stitch a run of overlapping word n-grams back into one string.

    Each n-gram is appended to the words collected so far, skipping the
    leading words that repeat the current tail (the longest suffix of the
    collected words that equals a prefix of the n-gram).

    Args:
        ngrams: Sequence of word sequences (lists or tuples of strings), or
            ``None``. Only the first 20 n-grams are used.

    Returns:
        The merged words joined by single spaces; ``""`` when *ngrams* is
        ``None`` or empty.
    """
    if ngrams is None:
        return ""

    merged = []
    for ngram in ngrams[:20]:
        words = list(ngram)
        # Overlap is positional: counting shared words as a set would drop
        # leading words whenever an n-gram merely shares a word with the tail.
        overlap = 0
        for size in range(min(len(words), len(merged)), 0, -1):
            if merged[-size:] == words[:size]:
                overlap = size
                break
        merged.extend(words[overlap:])
    return " ".join(merged)
def remove_ngrams_after(ngrams, target_ngram):
    """Cut *ngrams* off right after the first occurrence of *target_ngram*.

    Returns:
        The prefix of *ngrams* up to and including that occurrence, or
        ``None`` when *target_ngram* does not occur.
    """
    try:
        cut = ngrams.index(target_ngram)
    except ValueError:
        # Target is absent: signal "nothing to keep" instead of raising.
        return None
    else:
        return ngrams[: cut + 1]
 def matching_score(sentence_content_tuple):
     sentence, content, score = sentence_content_tuple
     if sentence in content:
+        return 1, sentence
+    # if score > 0.9:
+    #     return score
     else:
         n = 5
         ngrams_sentence = split_ngrams(sentence, n)
         if len(ngrams_sentence) == 0:
+            return 0, ""
+        ngrams_content = [tuple(ngram) for ngram in split_ngrams(content, n)]
+        matched_content_ngrams = []
+        found = False
+        last_found = None
+        for ngram in ngrams_sentence:
+            for ngram_content in ngrams_content:
+                if tuple(ngram) == ngram_content:
+                    found = True
+                    last_found = ngram_content
+                if found:
+                    matched_content_ngrams.append(ngram_content)
+        matched_content_ngrams = remove_ngrams_after(
+            matched_content_ngrams, last_found
         )
+        matched_content = merge_ngrams_into_sentence(matched_content_ngrams)
+        matched_ngrams = [
+            1 for ngram in ngrams_sentence if tuple(ngram) in ngrams_content
+        ]
+        matched_count = sum(matched_ngrams)
+        return matched_count / len(ngrams_sentence), matched_content
 def process_with_multiprocessing(input_data):
     return sentenceToMaxURL
def check_url_category(url):
    """Return the ``url_types`` category whose marker appears in *url*.

    Categories are tried in ``url_types`` order; the first one holding a
    marker that is a substring of *url* wins. ``"Internet Source"`` is
    returned when no marker matches.
    """
    return next(
        (
            label
            for label, markers in url_types.items()
            if any(marker in url for marker in markers)
        ),
        "Internet Source",
    )
 def google_search(
     plag_option,
     sentences,
     url_count,
     score_array,
     url_list,
+    snippets,
     sorted_date,
     domains_to_skip,
     api_key,
                 if url not in url_list:
                     url_list.append(url)
                     score_array.append([0] * len(sentences))
+                    snippets.append([""] * len(sentences))
                 url_count[url] = url_count[url] + 1 if url in url_count else 1
+                snippets[url_list.index(url)][i] = snippet
                 if plag_option == "Standard":
                     score_array[url_list.index(url)][i] = cosineSim(
                         sentence, snippet
     source_block_size,
 ):
     # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
+    # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
     # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
     # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
     # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
     # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
     # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
+    api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
     cse_id = "851813e81162b4ed4"
     url_scores = []
     sentence_scores = []
     sentences = split_sentence_blocks(input, source_block_size)
     url_count = {}
     score_array = []
     url_list = []
+    snippets = []
     date_from = build_date(year_from, month_from, day_from)
     date_to = build_date(year_to, month_to, day_to)
     sort_date = f"date:r:{date_from}:{date_to}"
         url_count,
         score_array,
         url_list,
+        snippets,
         sort_date,
         domains_to_skip,
         api_key,
             for j, sent in enumerate(sentences):
                 input_data.append((sent, page_content, score_array[i][j]))
     scores = process_with_multiprocessing(input_data)
+    matched_sentence_array = [
+        ["" for _ in range(len(score_array[0]))]
+        for _ in range(len(score_array))
+    ]
     k = 0
     # Update score array for each (soup, sentence)
     for i, soup in enumerate(soups):
         if soup:
             for j, _ in enumerate(sentences):
+                score_array[i][j] = scores[k][0]
+                matched_sentence_array[i][j] = scores[k][1]
                 k += 1
     sentenceToMaxURL = map_sentence_url(sentences, score_array)
         ind = sentenceToMaxURL[i]
         if url_source[ind] > 0.1:
             sentence_scores.append(
+                [
+                    sent,
+                    round(url_source[ind] * 100, 2),
+                    url_list[ind],
+                    urlMap[ind],
+                ]
             )
         else:
             sentence_scores.append([sent, None, url_list[ind], -1])
+    print("SNIPPETS: ", snippets)
+    snippets = [[item for item in sublist if item] for sublist in snippets]
     for ind in index_descending:
+        if url_source[ind] > 0.35:
+            matched_sentence_array = [
+                [item for item in sublist if item]
+                for sublist in matched_sentence_array
+            ]
+            matched_sentence = "...".join(
+                [sent for sent in matched_sentence_array[ind]]
+            )
+            if matched_sentence == "":
+                matched_sentence = "...".join([sent for sent in snippets[ind]])
             url_scores.append(
+                [
+                    url_list[ind],
+                    round(url_source[ind] * 100, 2),
+                    urlMap[ind],
+                    matched_sentence,
+                ]
             )
     return sentence_scores, url_scores
         source_block_size,
     )
+    html_content = """
+        <link href='https://fonts.googleapis.com/css?family=Roboto' rel='stylesheet'>
+        <div style='font-family: {font}; border: 2px solid black; padding: 10px; color: #FFFFFF;'>
+        <html>
+        <head>
+            <title>Toggle Details</title>
+            <style>
+                .score-container {
+                    display: flex;
+                    justify-content: space-around;
+                    align-items: left;
+                    padding: 20px;
+                }
+                .score-item {
+                    text-align: center;
+                    padding: 10px;
+                    background-color: #636362;
+                    border-radius: 5px;
+                    flex-grow: 1;
+                    margin: 0 5px;
+                }
+                .details {
+                    display: none;
+                    padding: 10px;
+                }
+                .url-link {
+                    font-size: 1.2em;
+                }
+                .url-link span {
+                    margin-right: 10px;
+                }
+                .toggle-button {
+                    color: #333;
+                    border: none;
+                    padding: 5px 10px;
+                    text-align: center;
+                    text-decoration: none;
+                    display: inline-block;
+                    cursor: pointer;
+                }
+            </style>
+        </head>
+    """
     prev_idx = None
     combined_sentence = ""
+    total_score = 0
+    total_count = 0
+    category_scores = defaultdict(list)
+    for sentence, score, url, idx in sentence_scores:
+        category = check_url_category(url)
+        if score is None:
+            total_score += 0
+        else:
+            total_score += score
+        total_count += 1
+        category_scores[category].append(score)
         if idx != prev_idx and prev_idx is not None:
             color = color_map[prev_idx - 1]
+            index_part = f"<span>[{prev_idx}]</span>"
+            formatted_sentence = f'<p style="background-color: {color}; padding: 2px;">{combined_sentence} {index_part}</p>'
             html_content += formatted_sentence
             combined_sentence = ""
         combined_sentence += " " + sentence
         prev_idx = idx
+    total_average_score = round(total_score / total_count, 2)
+    category_averages = {
+        category: round((sum(scores) / len(scores)), 2)
+        for category, scores in category_scores.items()
+    }
     if combined_sentence:
         color = color_map[prev_idx - 1]
+        index_part = ""
+        if prev_idx != -1:
+            index_part = f"<span>[{prev_idx}]</span>"
+        formatted_sentence = f'<p style="background-color: {color}; padding: 2px;">{combined_sentence} {index_part}</p>'
         html_content += formatted_sentence
     html_content += "<hr>"
+    html_content += f"""
+        <div class="score-container">
+        <div class="score-item">
+            <h3>Overall Similarity</h3>
+            <p>{total_average_score}%</p>
+        </div>
+    """
+    for category, score in category_averages.items():
+        html_content += f"""
+            <div class="score-item"><h3>{category}</h3><p>{score}%</p></div>
+        """
+    html_content += "</div>"
+    for url, score, idx, sentence in url_scores:
+        url_category = check_url_category(url)
         color = color_map[idx - 1]
+        formatted_url = f"""
+            <p style="background-color: {color}; padding: 5px; font-size: 1.2em">[{idx}] <b>{url}</b></p><p><i>{url_category}</i></p>
+            <p> --- <b>Matching Score: </b>{score}%</p>
+            <p> --- <b>Original Source Content: </b>{sentence}</p>
+        """
+        # formatted_url = f"""
+        #     <div class="url-link">
+        #         <p style="background-color: {color}; padding: 5px; font-size: 1.2em">[{idx}] <b>{url}</b></p><p>{url_category}</p>
+        #         <a href="#" onclick="toggleDetails(event)" class="toggle-button">&gt;</a>
+        #     </div>
+        #     <div id="detailsContainer" class="details">
+        #         <p> --- <b>Matching Score: </b>{score}%</p>
+        #         <p> --- <b>Original Source Content: </b>{sentence}</p>
+        #     </div>
+        # """
         html_content += formatted_url
+    html_content += "</html>"
     print("PLAGIARISM PROCESSING TIME: ", time.perf_counter() - start_time)
+    return html_content