Update human_text_detect.py: add Hugging Face cache-directory diagnostics
Browse files - human_text_detect.py +27 -0
human_text_detect.py
CHANGED
@@ -11,6 +11,8 @@ from src.fit_survival_function import fit_per_length_survival_function
|
|
11 |
from glob import glob
|
12 |
import spacy
|
13 |
import re
|
|
|
|
|
14 |
|
15 |
|
16 |
logging.basicConfig(level=logging.INFO)
|
@@ -96,12 +98,37 @@ def detect_human_text(model_name, topic, text):
|
|
96 |
min_tokens_per_sentence = 10
|
97 |
max_tokens_per_sentence = 100
|
98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
# Init model
|
100 |
print('Init model')
|
101 |
lm_name = 'gpt2-xl' if model_name == 'GPT2XL' else 'microsoft/phi-2'
|
102 |
cache_dir = "/cache/huggingface"
|
103 |
tokenizer = AutoTokenizer.from_pretrained(lm_name, cache_dir=cache_dir)
|
104 |
model = AutoModelForCausalLM.from_pretrained(lm_name, cache_dir=cache_dir)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
|
106 |
print('Init PerplexityEvaluator')
|
107 |
sentence_detector = PerplexityEvaluator(model, tokenizer)
|
|
|
11 |
from glob import glob
|
12 |
import spacy
|
13 |
import re
|
14 |
+
import os
|
15 |
+
from transformers.utils import logging
|
16 |
|
17 |
|
18 |
logging.basicConfig(level=logging.INFO)
|
|
|
98 |
min_tokens_per_sentence = 10
|
99 |
max_tokens_per_sentence = 100
|
100 |
|
101 |
+
####
|
102 |
+
cache_dir = "/cache/huggingface"
|
103 |
+
# Check if the directory exists and is writable
|
104 |
+
print(f"Cache directory exists: {os.path.exists(cache_dir)}")
|
105 |
+
print(f"Cache directory is writable: {os.access(cache_dir, os.W_OK)}")
|
106 |
+
|
107 |
+
# List contents of the directory
|
108 |
+
print("Contents of cache directory before loading model:")
|
109 |
+
os.system(f"ls -lah {cache_dir}")
|
110 |
+
###
|
111 |
+
|
112 |
# Init model
|
113 |
print('Init model')
|
114 |
lm_name = 'gpt2-xl' if model_name == 'GPT2XL' else 'microsoft/phi-2'
|
115 |
cache_dir = "/cache/huggingface"
|
116 |
tokenizer = AutoTokenizer.from_pretrained(lm_name, cache_dir=cache_dir)
|
117 |
model = AutoModelForCausalLM.from_pretrained(lm_name, cache_dir=cache_dir)
|
118 |
+
|
119 |
+
###
|
120 |
+
print("Contents of cache directory after loading model:")
|
121 |
+
os.system(f"ls -lah {cache_dir}")
|
122 |
+
|
123 |
+
logging.set_verbosity_info()
|
124 |
+
|
125 |
+
print(f"Current HF_HOME: {os.getenv('HF_HOME')}")
|
126 |
+
print(f"Current TRANSFORMERS_CACHE: {os.getenv('TRANSFORMERS_CACHE')}")
|
127 |
+
|
128 |
+
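# Write copies of the tokenizer and model into cache_dir and print what save_pretrained returns (this saves new files; it does not report where from_pretrained cached the download)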
|
129 |
+
print(f"Tokenizer saved at: {tokenizer.save_pretrained(cache_dir)}")
|
130 |
+
print(f"Model saved at: {model.save_pretrained(cache_dir)}")
|
131 |
+
###
|
132 |
|
133 |
print('Init PerplexityEvaluator')
|
134 |
sentence_detector = PerplexityEvaluator(model, tokenizer)
|