# Captured from a Hugging Face Spaces file view.
# Space status at capture time: "Runtime error". File size: 3,430 bytes.
# Per-line commit hashes shown by the file view (not program content):
#   3ced2ed f7bc9f1 3ced2ed 3dfc25f 42d7fda 3ced2ed f7bc9f1 3ced2ed 29d8541 3ced2ed 29d8541 3ced2ed 4d9c03f 3ced2ed
#   29d8541 3ced2ed 4d9c03f 3ced2ed 4d9c03f 3ced2ed c6a55b0 66f5dd3 c43109e 5f78e9a c43109e 3ced2ed 94abe9f 3ced2ed
# The file view also carried a line-number gutter (1-100), which is not program content.
from flask import Flask, request
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import RobertaConfig
from torch import cuda
import torch
import gradio as gr
import os
import re
import pdfplumber
app = Flask(__name__)

# Hugging Face access token, read from the environment. ``.get`` is used so a
# missing variable yields ``None`` instead of a bare KeyError at import time;
# the value is passed straight to ``from_pretrained`` below.
ACCESS_TOKEN = os.environ.get("ACCESS_TOKEN")

# Single source of truth for the checkpoint used by both tokenizer and model.
# An earlier revision of this file loaded "PirateXX/ChatGPT-Text-Detector" via
# the Roberta* classes together with a "roberta-base" tokenizer.
MODEL_ID = "PirateXX/AI-Content-Detector"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_auth_token=ACCESS_TOKEN)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, use_auth_token=ACCESS_TOKEN)
# Sentence boundary: a run of spaces that follows '.', '?' or '!' and precedes
# an uppercase letter. The look-behind also requires that the character two
# positions before the terminator is not an uppercase letter, which keeps
# short capitalised abbreviations such as "Mr." or "Dr." inside the sentence.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[^A-Z].[.?!]) +(?=[A-Z])')


def text_to_sentences(text):
    """Split ``text`` into a list of sentences.

    A split happens at one or more spaces that come right after a sentence
    terminator (``.``, ``?`` or ``!``) and right before an uppercase letter.
    The separating spaces are dropped; terminators stay attached to their
    sentence.

    Args:
        text: The string to split.

    Returns:
        A list of sentence strings. Text without any boundary (including the
        empty string) comes back as a single-element list.
    """
    return _SENTENCE_BOUNDARY.split(text)
def chunks_of_900(text, chunk_size=900):
    """Group the sentences of ``text`` into chunks of at most ``chunk_size`` characters.

    Sentences come from ``text_to_sentences`` and are packed greedily, joined
    by a single space. A sentence is never cut: one that is longer than
    ``chunk_size`` on its own becomes a chunk by itself and may therefore
    exceed the limit.

    Args:
        text: The text to split.
        chunk_size: Maximum chunk length in characters (default 900).

    Returns:
        A list of chunk strings with at least one element; empty ``text``
        gives ``[""]``.
    """
    chunks = []
    current_chunk = ""
    for sentence in text_to_sentences(text):
        if not current_chunk:
            # Start a chunk with the sentence as-is. Doing this without a
            # size check avoids emitting an empty chunk when the very first
            # sentence is already longer than chunk_size.
            current_chunk = sentence
        elif len(current_chunk) + 1 + len(sentence) <= chunk_size:
            # The "+ 1" accounts for the joining space, so the finished
            # chunk really stays within chunk_size.
            current_chunk += " " + sentence
        else:
            chunks.append(current_chunk)
            current_chunk = sentence
    chunks.append(current_chunk)
    return chunks
def predict(query, device="cpu"):
    """Score ``query`` with the module-level classifier and return the "real" probability.

    The text is encoded with the module-level ``tokenizer``, truncated so that
    two extra tokens still fit within ``tokenizer.model_max_length``, wrapped
    in BOS/EOS ids and run through ``model`` as a batch of one. The logits are
    soft-maxed and unpacked as ``(fake, real)``; only ``real`` is returned.

    Args:
        query: The text to classify.
        device: Device the input tensors are moved to (default ``"cpu"``).
            The model itself is not moved here.

    Returns:
        The second soft-max probability as a Python float. The unpacking
        requires exactly two output values; that index 1 means "real" is this
        file's reading of the label order -- TODO confirm against the model
        config.
    """
    # NOTE(review): ``tokenizer.encode`` may already add special tokens by
    # default, in which case BOS/EOS end up doubled below -- confirm against
    # the installed transformers version before changing the model input.
    token_ids = tokenizer.encode(query)[:tokenizer.model_max_length - 2]
    input_ids = torch.tensor(
        [tokenizer.bos_token_id] + token_ids + [tokenizer.eos_token_id]
    ).unsqueeze(0)
    attention_mask = torch.ones_like(input_ids)
    with torch.no_grad():
        logits = model(input_ids.to(device), attention_mask=attention_mask.to(device))[0]
        probs = logits.softmax(dim=-1)
    _fake, real = probs.detach().cpu().flatten().numpy().tolist()
    return real
def findRealProb(text):
    """Estimate how likely ``text`` is "real" as a length-weighted average over chunks.

    The text is split with ``chunks_of_900``; every non-empty chunk is scored
    with ``predict`` and the scores are averaged, weighted by chunk length in
    characters.

    Args:
        text: The text to analyse.

    Returns:
        On success, a dict with ``"Real"`` (weighted average), ``"Fake"``
        (``1 - Real``), ``"results"`` (list of ``[probability, length]`` per
        chunk) and ``"text"`` (the input). When there is nothing to score
        (e.g. empty text), a dict with an ``"error"`` message plus
        ``"results"`` and ``"text"`` instead of dividing by zero.
    """
    # Zero-length chunks carry no weight, so skip the model call for them.
    results = [
        [predict(chunk), len(chunk)]
        for chunk in chunks_of_900(text)
        if chunk
    ]

    weighted_sum = 0
    total_length = 0
    for prob, length in results:
        weighted_sum += prob * length
        total_length += length

    if total_length == 0:
        return {"error": "No text found to analyze", "results": results, "text": text}

    realProb = weighted_sum / total_length
    return {"Real": realProb, "Fake": 1 - realProb, "results": results, "text": text}
def upload_file(file):
    """Extract text from the first pages of an uploaded PDF and score it.

    Args:
        file: An uploaded-file object whose ``name`` attribute is the path of
            the PDF on disk, or a falsy value when nothing was uploaded.

    Returns:
        The dict produced by ``findRealProb`` for the extracted text, or
        ``{"error": "No PDF file found in request"}`` when ``file`` is falsy.
    """
    if not file:
        return {"error":'No PDF file found in request'}

    max_pages = 6  # same cut-off as before: stop after the sixth page
    page_texts = []
    with pdfplumber.open(file.name) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            # Guard against a page yielding no text (None or ""), which
            # would otherwise break the concatenation.
            page_text = page.extract_text(x_tolerance=1)
            if page_text:
                page_texts.append(page_text)
            if page_number >= max_pages:
                break

    # Join pages with a space so the last word of one page does not fuse with
    # the first word of the next, then flatten line breaks.
    text = " ".join(page_texts).replace('\n', ' ')
    return findRealProb(text)
# Gradio UI: a single file-upload input handled by ``upload_file``; the
# returned dict is rendered as JSON. ``gr.JSON()`` replaces the legacy
# ``gr.outputs.JSON()`` and the ``interpretation`` argument is dropped;
# NOTE(review): both appear to have been removed in newer Gradio releases --
# confirm against the installed version.
demo = gr.Interface(
    fn=upload_file,
    inputs=gr.File(),
    outputs=gr.JSON(),
    article="Visit <a href = \"https://ai-content-detector.online/\">AI Content Detector</a> for better user experience!",
)

# Started at import time, as before; the API docs page is hidden.
demo.launch(show_api=False)