yilunzhang committed on
Commit
f83b59c
·
unverified ·
1 Parent(s): b0f0a1a

initial commit

Browse files
Files changed (4) hide show
  1. .gitignore +4 -0
  2. app.py +29 -0
  3. requirements.txt +2 -0
  4. utils.py +42 -0
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ .DS_Store
2
+ .vscode
3
+
4
+ __pycache__/
app.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from transformers import pipeline
4
+
5
+ from utils import clean_text
6
+
7
+
8
# Build the classifier once at import time. It is bound to its own name so the
# imported ``pipeline`` factory is not shadowed by the object it creates.
classifier = pipeline(
    task="text-classification",
    model="fakespotailabs/roberta-base-ai-text-detection-v1",
    device="cuda" if torch.cuda.is_available() else "cpu",
)


def predict(text):
    """Classify ``text`` and return a ``{label: score}`` mapping.

    The input is normalised with ``clean_text`` before being scored. A missing
    value is treated as an empty string so the cleaning step never receives
    ``None``.
    """
    cleaned_text = clean_text(text or "")
    # ``top_k=None`` asks for the score of every label (the non-deprecated
    # spelling of ``return_all_scores=True``). Passing a one-element list keeps
    # the result shape "one list of label dicts per input", hence the ``[0]``.
    # ``truncation=True`` is forwarded to the tokenizer so that long inputs are
    # cut to the model's maximum length instead of failing.
    # NOTE(review): confirm against the pinned transformers version.
    predictions = classifier([cleaned_text], top_k=None, truncation=True)[0]
    return {p["label"]: p["score"] for p in predictions}


demo = gr.Interface(
    predict,
    inputs=gr.Textbox(),
    outputs=gr.Label(num_top_classes=2),
    title="AI Text Detector",
)

# Only start the web server when run as a script, so importing this module
# (e.g. to reuse ``predict``) has no launch side effect.
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ transformers
2
+ torch
utils.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from html import unescape
3
+
4
+
5
def clean_text(t):
    """Flatten ``t`` into a single line of plain text.

    Markdown/HTML markup is stripped via ``clean_markdown``; newlines, tabs,
    carriage returns and literal ``^M`` sequences are each replaced by a
    space; a space directly before a comma is dropped; finally runs of spaces
    are collapsed to one.
    """
    without_markup = clean_markdown(t)
    # One pass for every line-break-like token; none of the replacements can
    # create another token, so this matches replacing them one after another.
    single_line = re.sub(r"[\n\t\r]|\^M", " ", without_markup)
    tight_commas = single_line.replace(" ,", ",")
    return re.sub(" +", " ", tight_commas)
14
+
15
+
16
def clean_markdown(md_text):
    """Strip common Markdown and raw HTML markup from ``md_text``.

    Code (fenced and inline), images, blockquote lines and pipe-delimited
    spans are removed entirely. Links, emphasis, headings and list items keep
    their text and lose only the markup. HTML entities are decoded last.

    Line-level constructs (horizontal rules, blockquotes, list markers,
    headings) are handled before inline emphasis so that their ``*``/``_``/``-``
    characters are not mistaken for emphasis delimiters.

    Args:
        md_text: The text to clean.

    Returns:
        The text with the markup described above removed.
    """
    # Remove code blocks
    md_text = re.sub(r'```.*?```', '', md_text, flags=re.DOTALL)
    # Remove inline code
    md_text = re.sub(r'`[^`]*`', '', md_text)
    # Remove images
    md_text = re.sub(r'!\[.*?\]\(.*?\)', '', md_text)
    # Remove links but keep link text
    md_text = re.sub(r'\[([^\]]+)\]\(.*?\)', r'\1', md_text)

    # Remove horizontal rules (before emphasis, which would otherwise consume
    # part of a ``***`` or ``___`` rule and leave a stray character behind)
    md_text = re.sub(r'^\s*[-*_]{3,}\s*$', '', md_text, flags=re.MULTILINE)
    # Remove blockquotes
    md_text = re.sub(r'^>.*$', '', md_text, flags=re.MULTILINE)
    # Remove list markers; the optional indent applies to bullets and to
    # numbered items alike
    md_text = re.sub(r'^\s*(?:[-*+]|\d+\.)\s+', '', md_text, flags=re.MULTILINE)
    # Remove heading markers, only at the start of a line so that text such
    # as "C# is" is left alone
    md_text = re.sub(r'^[ \t]*#+[ \t]+', '', md_text, flags=re.MULTILINE)

    # Remove bold (paired ** or __)
    md_text = re.sub(r'(\*\*|__)(.*?)\1', r'\2', md_text)
    # Remove italic written with *
    md_text = re.sub(r'\*(.*?)\*', r'\1', md_text)
    # Remove italic written with _, but not underscores inside a word
    # (identifiers like snake_case are not emphasis)
    md_text = re.sub(r'(?<!\w)_(.*?)_(?!\w)', r'\1', md_text)

    # Remove tables
    md_text = re.sub(r'\|.*?\|', '', md_text)
    # Remove raw HTML tags
    md_text = re.sub(r'<.*?>', '', md_text)
    # Decode HTML entities
    return unescape(md_text)