Spaces:

typesdigital
/

PlagiarismCheck

Runtime error

App Files Files Community

typesdigital committed on Sep 10, 2023

Commit

9838f6d

1 Parent(s): 61aa933

Create app.py

Browse files

Files changed (1) hide show

app.py +45 -0

app.py ADDED Viewed

	@@ -0,0 +1,45 @@

+import nltk
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+nltk.download('punkt')
+nltk.download('stopwords')
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+# Preprocess text
+def preprocess_text(text):
+    text = text.lower()  # Convert to lowercase
+    words = word_tokenize(text)  # Tokenize text
+    words = [word for word in words if word.isalnum()]  # Remove non-alphanumeric characters
+    words = [word for word in words if word not in stopwords.words('english')]  # Remove stopwords
+    return ' '.join(words)
+# Calculate text similarity using TF-IDF and cosine similarity
+def calculate_similarity(text1, text2):
+    preprocessed_text1 = preprocess_text(text1)
+    preprocessed_text2 = preprocess_text(text2)
+    tfidf_vectorizer = TfidfVectorizer()
+    tfidf_matrix = tfidf_vectorizer.fit_transform([preprocessed_text1, preprocessed_text2])
+    return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
+# Replace 'text1' and 'text2' with the text you want to compare
+text1 = "This is the original text."
+text2 = "📣 Exciting news! 🚀 The Falcon 180B has landed, revolutionizing the world of open LLMs. 🦅 Want to know how to deploy it on Amazon SageMaker? Check out this informative blog post by Philipp Schmid, Technical Lead at Hugging Face and AWS ML HERO. 🤗 Get insights on setting up your dev environment, hardware requirements, running inferences, and more. Don't miss out! Read the full article here 👉 Deploy Falcon 180B on Amazon SageMaker. Stay tuned for more Falcon 180B updates! 🌟 #AI #MachineLearning #AmazonSageMaker."
+# Calculate text similarity
+similarity = calculate_similarity(text1, text2)
+# Set a threshold for plagiarism detection (adjust as needed)
+threshold = 0.8
+# Check if the similarity exceeds the threshold
+if similarity >= threshold:
+    print("Plagiarism detected!")
+else:
+    print("No plagiarism detected.")