Neha13 commited on
Commit
285d2df
·
verified ·
1 Parent(s): c0e781a

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -0
app.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import GPT2Tokenizer, GPT2LMHeadModel
3
+ import torch
4
+ import nltk
5
+ from nltk.util import ngrams
6
+ from nltk.probability import FreqDist
7
+ import plotly.express as px
8
+ import torch.nn.functional as F
9
+ from collections import Counter
10
+ from nltk.corpus import stopwords
11
+ import string
12
+
13
+ import nltk
14
# Fetch the NLTK data used below: tokenizer models for word_tokenize and the
# English stop-word list.
nltk.download('punkt')
# NOTE(review): recent NLTK releases load 'punkt_tab' instead of 'punkt' for
# word_tokenize; requesting both keeps old and new versions working.
nltk.download('punkt_tab')
nltk.download('stopwords')


@st.cache_resource(show_spinner=False)
def _load_gpt2():
    """Load the GPT-2 tokenizer and language model once per server process.

    Streamlit re-executes this script on every widget interaction; without
    caching, the pretrained weights would be reloaded on each rerun.
    The spinner is disabled so that no Streamlit element is emitted before
    ``st.set_page_config`` runs later in the script.

    Returns:
        A ``(tokenizer, model)`` tuple for the ``'gpt2'`` checkpoint.
    """
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')
    return gpt2_tokenizer, gpt2_model


# Initialize tokenizer and model
tokenizer, model = _load_gpt2()
19
+
20
def c_perplexity(text):
    """Calculate a GPT-2 based perplexity-style score for the given text.

    Args:
        text: The string to score.

    Returns:
        ``exp`` of the mean cross-entropy between the model's logits and the
        input token ids, as a Python float. ``float('inf')`` is returned when
        the text is blank or encodes to zero tokens.
    """
    if not text.strip():
        return float('inf')  # Return inf for empty input

    input_ids = tokenizer.encode(text, add_special_tokens=False, return_tensors='pt')
    if input_ids.size(1) == 0:  # Check for empty input after encoding
        return float('inf')

    # GPT-2 has a fixed-size position-embedding table; feeding more tokens
    # than that makes the forward pass fail, so only the leading window of a
    # long text is scored.
    max_positions = getattr(model.config, 'n_positions', None)
    if max_positions and input_ids.size(1) > max_positions:
        input_ids = input_ids[:, :max_positions]

    with torch.no_grad():
        logits = model(input_ids).logits

    # NOTE(review): logits at position i are compared with token i itself
    # (no one-step shift), so this is not the standard next-token perplexity.
    # The computation is kept as-is because callers compare the result against
    # thresholds chosen for this scale; switching to shifted labels would
    # require recalibrating those thresholds at the same time.
    loss = F.cross_entropy(logits.view(-1, logits.size(-1)), input_ids.view(-1))
    return torch.exp(loss).item()
36
+
37
def c_burstiness(text):
    """Return the share of distinct tokens that occur more than once.

    The text is lower-cased and split with ``nltk.word_tokenize``. The score
    is ``(number of distinct tokens seen at least twice) / (number of
    distinct tokens)``; text that yields no tokens scores ``0.0``.
    """
    words = nltk.word_tokenize(text.lower())
    if len(words) == 0:
        return 0.0

    frequencies = FreqDist(words)
    vocabulary_size = len(frequencies)
    if vocabulary_size <= 0:
        return 0.0

    # Count the distinct tokens whose frequency exceeds one.
    repeated = 0
    for occurrences in frequencies.values():
        if occurrences > 1:
            repeated += 1
    return repeated / vocabulary_size
47
+
48
def top_repword_count(text):
    """Render a bar chart of the 10 most frequent non-stop-words in the text.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``; English
    stop words and punctuation-only tokens are dropped before counting. When
    nothing remains, a short message is written to the page instead of a
    chart.

    Args:
        text: The string to analyse.

    Returns:
        None. Output is written to the current Streamlit container.
    """
    stop_words = set(stopwords.words('english'))
    # Keep only tokens containing at least one letter or digit. A plain
    # membership test against string.punctuation is a substring check and lets
    # multi-character punctuation tokens such as `` '' -- ... through.
    tokens = [
        token
        for token in nltk.word_tokenize(text.lower())
        if token not in stop_words and any(ch.isalnum() for ch in token)
    ]

    top_words = Counter(tokens).most_common(10)
    if not top_words:
        st.write("No significant words found.")
        return

    words, counts = zip(*top_words)
    fig = px.bar(x=words, y=counts, labels={'x': 'Words', 'y': 'Counts'}, title="Top 10 Most Repeated Words in the Text")
    # The keyword is `use_container_width`; the misspelt `user_container_width`
    # never enabled full-width rendering.
    st.plotly_chart(fig, use_container_width=True)
64
+
65
# Page setup: wide layout, title, and the free-text input box.
st.set_page_config(layout="wide")
st.title("AI Content Detector")
text_area = st.text_area("Enter your text here!")

# The button is only rendered once some text has been entered; the analysis
# runs on the rerun triggered by clicking it.
if text_area and st.button("Analyse the content"):
    input_col, score_col, review_col = st.columns([1, 2, 1])

    # Left column: echo the submitted text.
    with input_col:
        st.info("Your input text")
        st.success(text_area)

    # Middle column: the two scores and the verdict derived from them.
    with score_col:
        st.info("Your output score")
        perplexity = c_perplexity(text_area)
        burstiness = c_burstiness(text_area)

        st.success(f"Perplexity score: {perplexity}")
        st.success(f"Burstiness score: {burstiness}")

        # Either signal alone is enough to flag the text.
        flagged = perplexity > 40000 or burstiness < 0.24
        if not flagged:
            st.success("Result: The text is not AI-generated.")
        else:
            st.error("Result: The text is likely AI-generated.")

        st.warning("Disclaimer: AI plagiarism detector apps can assist in identifying potential instances of plagiarism.")

    # Right column: word-frequency chart.
    with review_col:
        st.info("Basic Review")
        top_repword_count(text_area)