# PlagiarismCheck / app.py
# Uploaded by typesdigital
# Commit: "Create app.py" (9838f6d)
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Fetch the NLTK resources this script relies on: tokenizer models for
# word_tokenize and the stopword lists. Recent NLTK releases load the
# 'punkt_tab' resource in word_tokenize instead of the pickled 'punkt'
# models, so both are requested; a release that does not know 'punkt_tab'
# just reports the failed download and carries on.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
def preprocess_text(text):
    """Normalize text before TF-IDF vectorization.

    Lowercases ``text``, tokenizes it with NLTK's ``word_tokenize``, and
    keeps only tokens that are purely alphanumeric and are not English
    stopwords.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving tokens joined by single spaces (an empty string when
        no token survives).
    """
    # Build the stopword set once per call instead of calling
    # stopwords.words() for every token; set membership is also O(1).
    english_stopwords = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    return ' '.join(
        token for token in tokens
        if token.isalnum() and token not in english_stopwords
    )
def calculate_similarity(text1, text2):
    """Return the cosine similarity between the TF-IDF vectors of two texts.

    Both texts are cleaned with ``preprocess_text`` and vectorized together
    by a fresh ``TfidfVectorizer``; the score is the cosine similarity of
    the two resulting rows.

    Args:
        text1: First text to compare.
        text2: Second text to compare.

    Returns:
        The similarity score as a float. 0.0 is returned when neither text
        contributes any term to the vocabulary (for example both are empty
        or consist only of stopwords).
    """
    documents = [preprocess_text(text1), preprocess_text(text2)]
    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(documents)
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no
        # document yields a usable token. With nothing to compare, report
        # no similarity instead of crashing; this matches the 0.0 already
        # produced when only one of the two texts is empty.
        return 0.0
    return float(cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0])
# Sample inputs: swap in the two texts you want to compare.
text1 = "This is the original text."
text2 = "πŸ“£ Exciting news! πŸš€ The Falcon 180B has landed, revolutionizing the world of open LLMs. πŸ¦… Want to know how to deploy it on Amazon SageMaker? Check out this informative blog post by Philipp Schmid, Technical Lead at Hugging Face and AWS ML HERO. πŸ€— Get insights on setting up your dev environment, hardware requirements, running inferences, and more. Don't miss out! Read the full article here πŸ‘‰ Deploy Falcon 180B on Amazon SageMaker. Stay tuned for more Falcon 180B updates! 🌟 #AI #MachineLearning #AmazonSageMaker."

# Similarity score at or above which the pair is reported as plagiarism
# (adjust as needed).
threshold = 0.8

# Score the pair and print the verdict.
similarity = calculate_similarity(text1, text2)
print("Plagiarism detected!" if similarity >= threshold else "No plagiarism detected.")