Spaces:
Runtime error
Runtime error
import nltk | |
import numpy as np | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.metrics.pairwise import cosine_similarity | |
# Fetch the NLTK resources this script relies on: the Punkt tokenizer models
# (used by word_tokenize) and the stopword lists.
# Newer NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt',
# so both are requested to keep the script working across versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize | |
from nltk.corpus import stopwords | |
def preprocess_text(text):
    """Normalize *text* before it is vectorized.

    The text is lowercased, split with NLTK's ``word_tokenize``, and reduced
    to the tokens that are purely alphanumeric (``str.isalnum``) and are not
    in NLTK's English stopword list.

    Args:
        text: Raw input string.

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        no token survives.
    """
    # Build the stopword set once per call. The previous version called
    # stopwords.words('english') for every token and searched the resulting
    # list linearly, which is loop-invariant work.
    english_stopwords = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    kept = [
        token
        for token in tokens
        if token.isalnum() and token not in english_stopwords
    ]
    return ' '.join(kept)
def calculate_similarity(text1, text2):
    """Return the TF-IDF cosine similarity of two texts.

    Both inputs are normalized with ``preprocess_text``; a ``TfidfVectorizer``
    is fitted on the resulting pair and the cosine similarity between the two
    document vectors is returned.

    Args:
        text1: First raw text.
        text2: Second raw text.

    Returns:
        The similarity score as a float. ``0.0`` is returned when either text
        is empty after preprocessing, since there is nothing to compare.

    Raises:
        ValueError: May still propagate from scikit-learn if neither text
            contains a token accepted by the vectorizer's default token
            pattern (NOTE(review): e.g. only one-character tokens -- confirm
            against the scikit-learn version in use).
    """
    preprocessed_text1 = preprocess_text(text1)
    preprocessed_text2 = preprocess_text(text2)

    # Fitting the vectorizer on documents with no tokens raises
    # "empty vocabulary"; treat an empty side as sharing nothing instead.
    if not preprocessed_text1 or not preprocessed_text2:
        return 0.0

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(
        [preprocessed_text1, preprocessed_text2]
    )
    # cosine_similarity returns a 1x1 matrix for two single-row inputs.
    return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
# Sample inputs: substitute the two texts you want to compare.
text1 = "This is the original text."
text2 = "π£ Exciting news! π The Falcon 180B has landed, revolutionizing the world of open LLMs. π¦ Want to know how to deploy it on Amazon SageMaker? Check out this informative blog post by Philipp Schmid, Technical Lead at Hugging Face and AWS ML HERO. π€ Get insights on setting up your dev environment, hardware requirements, running inferences, and more. Don't miss out! Read the full article here π Deploy Falcon 180B on Amazon SageMaker. Stay tuned for more Falcon 180B updates! π #AI #MachineLearning #AmazonSageMaker."

# A score at or above this value is reported as plagiarism (adjust as needed).
threshold = 0.8

# Score the pair with TF-IDF cosine similarity.
similarity = calculate_similarity(text1, text2)

# Report the verdict.
print(
    "Plagiarism detected!"
    if similarity >= threshold
    else "No plagiarism detected."
)