typesdigital committed on
Commit
9838f6d
·
1 Parent(s): 61aa933

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -0
app.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
+ import numpy as np
3
+ from sklearn.feature_extraction.text import TfidfVectorizer
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+
6
# Fetch the NLTK data this script relies on: the Punkt tokenizer models used
# by word_tokenize and the English stopword list. Recent NLTK releases load
# Punkt from the 'punkt_tab' package while older ones load 'punkt', so both
# are requested to keep tokenization working on either.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
8
+
9
+ from nltk.tokenize import word_tokenize
10
+ from nltk.corpus import stopwords
11
+
12
+ # Preprocess text
13
def preprocess_text(text):
    """Normalize *text* for TF-IDF comparison.

    The text is lowercased, tokenized with NLTK's ``word_tokenize``, stripped
    of tokens that are not purely alphanumeric, and filtered against NLTK's
    English stopword list.

    Args:
        text: The raw input string.

    Returns:
        The surviving tokens joined by single spaces (an empty string when
        no token survives).
    """
    # Build the stopword set once per call. Calling stopwords.words() inside
    # the filter would rebuild the list and scan it linearly for every token.
    english_stopwords = set(stopwords.words('english'))

    tokens = word_tokenize(text.lower())
    kept = [
        token
        for token in tokens
        if token.isalnum() and token not in english_stopwords
    ]
    return ' '.join(kept)
19
+
20
+ # Calculate text similarity using TF-IDF and cosine similarity
21
def calculate_similarity(text1, text2):
    """Return the cosine similarity of the TF-IDF vectors of two texts.

    Both inputs are passed through ``preprocess_text`` and vectorized
    together with a fresh ``TfidfVectorizer``, so the vocabulary and IDF
    weights come from these two documents only.

    Args:
        text1: First raw text.
        text2: Second raw text.

    Returns:
        The similarity score as a float; higher means more similar. Returns
        0.0 when the vectorizer finds no terms to compare.
    """
    documents = [preprocess_text(text1), preprocess_text(text2)]

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(documents)
    except ValueError:
        # scikit-learn raises ValueError (empty vocabulary) when neither
        # document contributes a term, e.g. both inputs are empty or consist
        # only of stopwords/punctuation. With nothing to compare, report no
        # similarity instead of crashing.
        return 0.0

    score = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
    return float(score)
29
+
30
# Sample inputs: replace 'text1' and 'text2' with the texts you want to compare.
text1 = "This is the original text."
# The emoji below were stored as mojibake (UTF-8 bytes decoded with a
# single-byte code page); they are restored to the intended characters.
text2 = "📣 Exciting news! 🚀 The Falcon 180B has landed, revolutionizing the world of open LLMs. 🦅 Want to know how to deploy it on Amazon SageMaker? Check out this informative blog post by Philipp Schmid, Technical Lead at Hugging Face and AWS ML HERO. 🤗 Get insights on setting up your dev environment, hardware requirements, running inferences, and more. Don't miss out! Read the full article here 👉 Deploy Falcon 180B on Amazon SageMaker. Stay tuned for more Falcon 180B updates! 🌟 #AI #MachineLearning #AmazonSageMaker."
33
+
34
# Score the two sample texts against each other.
similarity = calculate_similarity(text1, text2)

# Minimum score that counts as plagiarism (adjust as needed).
threshold = 0.8

# Report the verdict for this pair of texts.
print(
    "Plagiarism detected!"
    if similarity >= threshold
    else "No plagiarism detected."
)
45
+