Spaces:

typesdigital
/

PlagiarismCheck

Runtime error

App Files Files Community

PlagiarismCheck / app.py

typesdigital

Create app.py

9838f6d almost 2 years ago

raw

history blame contribute delete

1.99 kB

	import nltk
	import numpy as np
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity

	nltk.download('punkt')
	nltk.download('stopwords')

	from nltk.tokenize import word_tokenize
	from nltk.corpus import stopwords

	# Preprocess text
	def preprocess_text(text):
	    """Normalize *text* so it can be compared with TF-IDF.

	    The text is lowercased, split into tokens with NLTK's ``word_tokenize``,
	    and then filtered: tokens that are not purely alphanumeric are dropped,
	    as are English stopwords.

	    Args:
	        text: Raw input string.

	    Returns:
	        The remaining tokens joined by single spaces. The result is an empty
	        string when no token survives the filtering.
	    """
	    # Build the stopword lookup once per call. Evaluating
	    # stopwords.words('english') inside the comprehension would repeat the
	    # call for every token and test membership against a list each time.
	    stop_words = set(stopwords.words('english'))

	    tokens = word_tokenize(text.lower())
	    kept = [
	        token
	        for token in tokens
	        if token.isalnum() and token not in stop_words
	    ]
	    return ' '.join(kept)

	# Calculate text similarity using TF-IDF and cosine similarity
	def calculate_similarity(text1, text2):
	    """Return the cosine similarity between the TF-IDF vectors of two texts.

	    Both texts are first normalized with ``preprocess_text``. A fresh
	    ``TfidfVectorizer`` is then fitted on just these two documents, and the
	    cosine similarity of the two resulting rows is returned.

	    Args:
	        text1: First text to compare.
	        text2: Second text to compare.

	    Returns:
	        The similarity score taken from ``cosine_similarity``. Returns 0.0
	        when the vectorizer cannot build a vocabulary from the two
	        preprocessed texts, since there is then nothing to compare.
	    """
	    documents = [preprocess_text(text1), preprocess_text(text2)]

	    try:
	        tfidf_matrix = TfidfVectorizer().fit_transform(documents)
	    except ValueError:
	        # fit_transform raises ValueError when the vocabulary is empty, e.g.
	        # both inputs consisted only of stopwords or punctuation. Treat that
	        # as "no measurable similarity" instead of crashing the caller.
	        return 0.0

	    # cosine_similarity returns a 1x1 matrix for two single-row inputs.
	    return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]

	# Sample inputs: substitute the two texts you want to compare.
	text1 = "This is the original text."
	text2 = "📣 Exciting news! 🚀 The Falcon 180B has landed, revolutionizing the world of open LLMs. 🦅 Want to know how to deploy it on Amazon SageMaker? Check out this informative blog post by Philipp Schmid, Technical Lead at Hugging Face and AWS ML HERO. 🤗 Get insights on setting up your dev environment, hardware requirements, running inferences, and more. Don't miss out! Read the full article here 👉 Deploy Falcon 180B on Amazon SageMaker. Stay tuned for more Falcon 180B updates! 🌟 #AI #MachineLearning #AmazonSageMaker."

	# Score at or above this value is reported as plagiarism (tune as needed).
	threshold = 0.8

	# Compare the two texts.
	similarity = calculate_similarity(text1, text2)

	# Report the verdict.
	print("Plagiarism detected!" if similarity >= threshold else "No plagiarism detected.")