GautamGaur committed on
Commit
d49c2f7
·
verified ·
1 Parent(s): f0091b6

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +13 -0
  2. requirements.txt +28 -1
app.py CHANGED
@@ -3,7 +3,19 @@ from fastapi import FastAPI, HTTPException
3
  from pydantic import BaseModel
4
  import torch
5
  from transformers import AutoModelForSequenceClassification, AutoTokenizer
 
 
 
 
6
 
 
 
 
 
 
 
 
 
7
 
8
  roberta_model = AutoModelForSequenceClassification.from_pretrained("roberta-base")
9
  roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")
@@ -19,6 +31,7 @@ class TextData(BaseModel):
19
 
20
  # Helper function to make predictions and convert to 0 (human) or 100 (AI)
21
  def predict_text(model, tokenizer, text):
 
22
  # Preprocess the text
23
  inputs = tokenizer(text, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
24
 
 
3
  from pydantic import BaseModel
4
  import torch
5
  from transformers import AutoModelForSequenceClassification, AutoTokenizer
6
+ import nltk
7
+ from nltk.corpus import stopwords
8
+ import re
9
+ import spacy
10
 
11
+ nltk.download('stopwords')
12
+ stop_words = set(stopwords.words('english'))
13
+
14
def clean_text(text, stopword_set=None):
    """Normalize raw text before tokenization.

    Lowercases the input, strips punctuation (any character that is not a
    word character or whitespace), and removes stopwords.

    Args:
        text: The input string to clean.
        stopword_set: Optional set of stopwords to remove. Defaults to the
            module-level ``stop_words`` (NLTK English stopwords) so existing
            callers are unaffected; passing an explicit set makes the
            function testable and reusable without the global.

    Returns:
        The cleaned, single-space-joined string.
    """
    if stopword_set is None:
        # Fall back to the module-level NLTK English stopword set
        # (populated at import time) to preserve original behavior.
        stopword_set = stop_words
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    # Remove stopwords and collapse whitespace to single spaces.
    return ' '.join(word for word in text.split() if word not in stopword_set)
19
 
20
  roberta_model = AutoModelForSequenceClassification.from_pretrained("roberta-base")
21
  roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")
 
31
 
32
  # Helper function to make predictions and convert to 0 (human) or 100 (AI)
33
  def predict_text(model, tokenizer, text):
34
+ text=clean_text(text)
35
  # Preprocess the text
36
  inputs = tokenizer(text, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
37
 
requirements.txt CHANGED
@@ -1,9 +1,14 @@
1
  annotated-types==0.7.0
2
  anyio==4.6.0
 
 
3
  certifi==2024.8.30
4
  charset-normalizer==3.3.2
5
  click==8.1.7
 
6
  colorama==0.4.6
 
 
7
  fastapi==0.115.0
8
  filelock==3.16.1
9
  fsspec==2024.9.0
@@ -12,27 +17,49 @@ httptools==0.6.1
12
  huggingface-hub==0.25.1
13
  idna==3.10
14
  Jinja2==3.1.4
 
 
 
 
 
15
  MarkupSafe==2.1.5
 
16
  mpmath==1.3.0
 
17
  networkx==3.3
18
- numpy==2.1.1
 
19
  packaging==24.1
 
20
  pydantic==2.9.2
21
  pydantic_core==2.23.4
 
22
  python-dotenv==1.0.1
23
  PyYAML==6.0.2
24
  regex==2024.9.11
25
  requests==2.32.3
 
26
  safetensors==0.4.5
 
 
27
  sniffio==1.3.1
 
 
 
 
28
  starlette==0.38.6
29
  sympy==1.13.3
 
30
  tokenizers==0.20.0
31
  torch==2.4.1
32
  tqdm==4.66.5
33
  transformers==4.45.1
 
34
  typing_extensions==4.12.2
35
  urllib3==2.2.3
36
  uvicorn==0.31.0
 
37
  watchfiles==0.24.0
 
38
  websockets==13.1
 
 
1
  annotated-types==0.7.0
2
  anyio==4.6.0
3
+ blis==1.0.1
4
+ catalogue==2.0.10
5
  certifi==2024.8.30
6
  charset-normalizer==3.3.2
7
  click==8.1.7
8
+ cloudpathlib==0.19.0
9
  colorama==0.4.6
10
+ confection==0.1.5
11
+ cymem==2.0.8
12
  fastapi==0.115.0
13
  filelock==3.16.1
14
  fsspec==2024.9.0
 
17
  huggingface-hub==0.25.1
18
  idna==3.10
19
  Jinja2==3.1.4
20
+ joblib==1.4.2
21
+ langcodes==3.4.1
22
+ language_data==1.2.0
23
+ marisa-trie==1.2.0
24
+ markdown-it-py==3.0.0
25
  MarkupSafe==2.1.5
26
+ mdurl==0.1.2
27
  mpmath==1.3.0
28
+ murmurhash==1.0.10
29
  networkx==3.3
30
+ nltk==3.9.1
31
+ numpy==2.0.2
32
  packaging==24.1
33
+ preshed==3.0.9
34
  pydantic==2.9.2
35
  pydantic_core==2.23.4
36
+ Pygments==2.18.0
37
  python-dotenv==1.0.1
38
  PyYAML==6.0.2
39
  regex==2024.9.11
40
  requests==2.32.3
41
+ rich==13.9.1
42
  safetensors==0.4.5
43
+ shellingham==1.5.4
44
+ smart-open==7.0.4
45
  sniffio==1.3.1
46
+ spacy==3.8.2
47
+ spacy-legacy==3.0.12
48
+ spacy-loggers==1.0.5
49
+ srsly==2.4.8
50
  starlette==0.38.6
51
  sympy==1.13.3
52
+ thinc==8.3.2
53
  tokenizers==0.20.0
54
  torch==2.4.1
55
  tqdm==4.66.5
56
  transformers==4.45.1
57
+ typer==0.12.5
58
  typing_extensions==4.12.2
59
  urllib3==2.2.3
60
  uvicorn==0.31.0
61
+ wasabi==1.1.3
62
  watchfiles==0.24.0
63
+ weasel==0.4.1
64
  websockets==13.1
65
+ wrapt==1.16.0