from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import gradio as gr
import os
import re
import pdfplumber

# Hugging Face access token for the model repo (set as an env var / Space secret)
ACCESS_TOKEN = os.environ["ACCESS_TOKEN"]

tokenizer = AutoTokenizer.from_pretrained("PirateXX/AI-Content-Detector", use_auth_token=ACCESS_TOKEN)
model = AutoModelForSequenceClassification.from_pretrained("PirateXX/AI-Content-Detector", use_auth_token=ACCESS_TOKEN)


# Split text into sentences on whitespace that follows a sentence-ending
# '.' or '?' and precedes a capital letter (a heuristic, not a full parser).
def text_to_sentences(text):
    return re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', text)
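
# Illustrative example (assumed input, not from the original source):
#   text_to_sentences("This works well. Does it split? Yes it does.")
#   -> ['This works well.', 'Does it split?', 'Yes it does.']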

# Concatenate sentences into chunks of at most `chunk_size` characters
# (a single sentence longer than `chunk_size` becomes its own chunk).
def chunks_of_900(text, chunk_size=900):
    sentences = text_to_sentences(text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= chunk_size:
            if current_chunk:
                current_chunk += " " + sentence
            else:
                current_chunk = sentence
        else:
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
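
# Illustrative example (assumed input; small chunk_size for demonstration):
#   chunks_of_900("One two. Three four. Five six.", chunk_size=20)
#   -> ['One two. Three four.', 'Five six.']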
    
def predict(query, device="cpu"):
    # Encode without special tokens; BOS/EOS are added manually below,
    # so encoding with them would otherwise duplicate the special tokens.
    tokens = tokenizer.encode(query, add_special_tokens=False)
    tokens = tokens[:tokenizer.model_max_length - 2]
    tokens = torch.tensor([tokenizer.bos_token_id] + tokens + [tokenizer.eos_token_id]).unsqueeze(0)
    mask = torch.ones_like(tokens)

    with torch.no_grad():
        logits = model(tokens.to(device), attention_mask=mask.to(device))[0]
        probs = logits.softmax(dim=-1)

    # Label order assumed by the original code: index 0 = fake, index 1 = real.
    fake, real = probs.detach().cpu().flatten().numpy().tolist()
    return real
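
# Illustrative call (the exact value depends on the model weights):
#   predict("The quick brown fox jumps over the lazy dog.")
#   -> a float in [0, 1]: the probability the text is human-written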

def findRealProb(text):
    chunksOfText = chunks_of_900(text)
    results = []
    for chunk in chunksOfText:
        output = predict(chunk)
        results.append([output, len(chunk)])

    # Length-weighted average of the per-chunk "real" probabilities.
    ans = 0
    cnt = 0
    for prob, length in results:
        ans += prob * length
        cnt += length
    realProb = ans / cnt
    return {"Real": realProb, "Fake": 1 - realProb, "results": results, "text": text}

def upload_file(file):
    if file:
        pdf_file = file.name
        text = ""
        with pdfplumber.open(pdf_file) as pdf:
            # Only the first few pages are scanned, to keep latency low.
            for cnt, page in enumerate(pdf.pages, start=1):
                # extract_text() can return None for image-only pages.
                text += page.extract_text(x_tolerance=1) or ""
                if cnt > 5:
                    break
        text = text.replace('\n', ' ')
        if not text.strip():
            return {"error": 'No extractable text found in PDF'}
        return findRealProb(text)
    else:
        return {"error": 'No PDF file found in request'}


demo = gr.Interface(
    fn=upload_file,
    inputs=gr.File(),
    outputs=gr.JSON(),
    article="Visit <a href=\"https://ai-content-detector.online/\">AI Content Detector</a> for a better user experience!",
)

demo.launch(show_api=False)