idkash1 committed
Commit 92eab78 (verified) · Parent(s): 80dcd43

Update human_text_detect.py

Files changed (1)
  1. human_text_detect.py +169 -169
human_text_detect.py CHANGED
@@ -1,169 +1,169 @@
- import torch
- import pandas as pd
- from transformers import AutoTokenizer, AutoModelForCausalLM
- import logging
- import numpy as np
- import pickle
- from src.DetectLM import DetectLM
- from src.PerplexityEvaluator import PerplexityEvaluator
- from src.PrepareArticles import PrepareArticles #Idan
- from src.fit_survival_function import fit_per_length_survival_function
- from glob import glob
- import spacy
- import re
-
-
- logging.basicConfig(level=logging.INFO)
-
-
- def read_all_csv_files(pattern):
-     df = pd.DataFrame()
-     print(pattern)
-     for f in glob(pattern):
-         df = pd.concat([df, pd.read_csv(f)])
-     return df
-
-
- def get_survival_function(df, G=101):
-     """
-     Returns a survival function for every sentence length in tokens.
-
-     Args:
-     :df: data frame with columns 'response' and 'length'
-     :G: number of interpolation points
-
-     Return:
-         bivariate function (length, responce) -> (0,1)
-
-     """
-     assert not df.empty
-     value_name = "response" if "response" in df.columns else "logloss"
-
-     df1 = df[~df[value_name].isna()]
-     ll = df1['length']
-     xx1 = df1[value_name]
-     return fit_per_length_survival_function(ll, xx1, log_space=True, G=G)
-
-
- def mark_edits_remove_tags(chunks, tag="edit"):
-     text_chunks = chunks['text']
-     edits = []
-     for i,text in enumerate(text_chunks):
-         chunk_text = re.findall(rf"<{tag}>(.+)</{tag}>", text)
-         if len(chunk_text) > 0:
-             import pdb; pdb.set_trace()
-             chunks['text'][i] = chunk_text[0]
-             chunks['length'][i] -= 2
-             edits.append(True)
-         else:
-             edits.append(False)
-
-     return chunks, edits
-
- def get_null_data(model_name, topic):
-     data = None
-     try:
-         file = open(f'nullData/{model_name}_{topic}.pkl', 'rb')
-         data = pickle.load(file)
-     except:
-         pass
-
-     return data
-
- def get_threshold_obj(model_name, topic):
-     threshold = None
-     try:
-         file = open('threshold_obj.pkl', 'rb')
-         threshold_obj = pickle.load(file)
-         threshold = threshold_obj[model_name][topic]
-     except:
-         pass
-
-     return threshold
-
- def detect_human_text(model_name, topic, text):
-
-     # Get null data
-     print('Get null data')
-     df_null = get_null_data(model_name, topic)
-     if 'num' in df_null.columns:
-         df_null = df_null[df_null.num > 1]
-
-     # Get survival function
-     print('Get survival function')
-     pval_functions = get_survival_function(df_null, G=43)
-
-     min_tokens_per_sentence = 10
-     max_tokens_per_sentence = 100
-
-     # Init model
-     print('Init model')
-     lm_name = 'gpt2-xl' if model_name == 'GPT2XL' else 'microsoft/phi-2'
-     tokenizer = AutoTokenizer.from_pretrained(lm_name)
-     model = AutoModelForCausalLM.from_pretrained(lm_name)
-
-     print('Init PerplexityEvaluator')
-     sentence_detector = PerplexityEvaluator(model, tokenizer)
-
-     if torch.backends.mps.is_available():
-         device = 'mps'
-     elif torch.cuda.is_available():
-         device = 'cuda'
-     else:
-         device = 'cpu'
-
-     print(f'device {device}')
-     model.to(device)
-
-     print('Init DetectLM')
-     detector = DetectLM(sentence_detector, pval_functions,
-                         min_len=min_tokens_per_sentence,
-                         max_len=max_tokens_per_sentence,
-                         length_limit_policy='truncate',
-                         HC_type='stbl',
-                         ignore_first_sentence= False
-                         )
-
-     # Convert text to object
-     print('Analyze text')
-     article_obj = get_article_obj(text)
-     parser = PrepareArticles(article_obj, min_tokens=min_tokens_per_sentence, max_tokens=max_tokens_per_sentence)
-     chunks = parser(combined=False)
-
-     # Go over all the document
-     for i in range(len(chunks['text'])):
-         print(chunks['text'][i])
-         # for p,v in enumerate(chunks['text'][i]):
-         #     print(f'{p}: {v}')
-         res = detector(chunks['text'][i], chunks['context'][i], dashboard=None)
-
-         # print(f"Num of Edits (rate) = {np.sum(df['tag'] == '<edit>')} ({edit_rate})")
-         # print(f"HC = {res['HC']}")
-         # print(f"Fisher = {res['fisher']}")
-         # print(f"Fisher (chisquared pvalue) = {res['fisher_pvalue']}")
-
-     results = res['HC']
-
-     threshold = get_threshold_obj(model_name, topic)
-     print(f"threshold: {threshold}, results: {results}")
-     return '1' if results >= threshold else '0'
-
- # Convert article text into object
- def get_article_obj(text):
-     # Init article object
-     article_obj = {
-         'sub_titles': [{
-             'sentences': []
-         }]
-     }
-
-     nlp = spacy.load("en_core_web_sm") # Load model
-
-     for line in text.split('\n'):
-         doc = nlp(line) # Analyze text
-         sentences = [sent.text for sent in doc.sents if len(sent) >= 10] # Split it by sentence
-         for sentence in sentences:
-             sentence = re.sub(r' +', ' ', sentence) # Remove duplicate spaces
-             article_obj['sub_titles'][0]['sentences'].append({'sentence': sentence})
-
-     return article_obj
 
+ import torch
+ import pandas as pd
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ import logging
+ import numpy as np
+ import pickle
+ from src.DetectLM import DetectLM
+ from src.PerplexityEvaluator import PerplexityEvaluator
+ from src.PrepareArticles import PrepareArticles #Idan
+ from src.fit_survival_function import fit_per_length_survival_function
+ from glob import glob
+ import spacy
+ import re
+
+
+ logging.basicConfig(level=logging.INFO)
+
+
+ def read_all_csv_files(pattern):
+     df = pd.DataFrame()
+     print(pattern)
+     for f in glob(pattern):
+         df = pd.concat([df, pd.read_csv(f)])
+     return df
+
+
+ def get_survival_function(df, G=101):
+     """
+     Returns a survival function for every sentence length in tokens.
+
+     Args:
+     :df: data frame with columns 'response' and 'length'
+     :G: number of interpolation points
+
+     Return:
+         bivariate function (length, responce) -> (0,1)
+
+     """
+     assert not df.empty
+     value_name = "response" if "response" in df.columns else "logloss"
+
+     df1 = df[~df[value_name].isna()]
+     ll = df1['length']
+     xx1 = df1[value_name]
+     return fit_per_length_survival_function(ll, xx1, log_space=True, G=G)
+
+
+ def mark_edits_remove_tags(chunks, tag="edit"):
+     text_chunks = chunks['text']
+     edits = []
+     for i,text in enumerate(text_chunks):
+         chunk_text = re.findall(rf"<{tag}>(.+)</{tag}>", text)
+         if len(chunk_text) > 0:
+             import pdb; pdb.set_trace()
+             chunks['text'][i] = chunk_text[0]
+             chunks['length'][i] -= 2
+             edits.append(True)
+         else:
+             edits.append(False)
+
+     return chunks, edits
+
+ def get_null_data(model_name, topic):
+     data = None
+     try:
+         file = open(f'nullData/{model_name}_{topic}.pkl', 'rb')
+         data = pickle.load(file)
+     except:
+         pass
+
+     return data
+
+ def get_threshold_obj(model_name, topic):
+     threshold = None
+     try:
+         file = open('threshold_obj.pkl', 'rb')
+         threshold_obj = pickle.load(file)
+         threshold = threshold_obj[model_name][topic]
+     except:
+         pass
+
+     return threshold
+
+ def detect_human_text(model_name, topic, text):
+
+     # Get null data
+     print('Get null data')
+     df_null = get_null_data(model_name, topic)
+     if 'num' in df_null.columns:
+         df_null = df_null[df_null.num > 1]
+
+     # Get survival function
+     print('Get survival function')
+     pval_functions = get_survival_function(df_null, G=43)
+
+     min_tokens_per_sentence = 10
+     max_tokens_per_sentence = 100
+
+     # Init model
+     print('Init model')
+     lm_name = 'gpt2-xl' if model_name == 'GPT2XL' else 'microsoft/phi-2'
+     tokenizer = AutoTokenizer.from_pretrained(lm_name)
+     model = AutoModelForCausalLM.from_pretrained(lm_name)
+
+     print('Init PerplexityEvaluator')
+     sentence_detector = PerplexityEvaluator(model, tokenizer)
+
+     if torch.backends.mps.is_available():
+         device = 'mps'
+     elif torch.cuda.is_available():
+         device = 'cuda'
+     else:
+         device = 'cpu'
+
+     print(f'device {device}')
+     model.to(device)
+
+     print('Init DetectLM')
+     detector = DetectLM(sentence_detector, pval_functions,
+                         min_len=min_tokens_per_sentence,
+                         max_len=max_tokens_per_sentence,
+                         length_limit_policy='truncate',
+                         HC_type='stbl',
+                         ignore_first_sentence= False
+                         )
+
+     # Convert text to object
+     print('Analyze text')
+     article_obj = get_article_obj(text)
+     parser = PrepareArticles(article_obj, min_tokens=min_tokens_per_sentence, max_tokens=max_tokens_per_sentence)
+     chunks = parser(combined=False)
+
+     # Go over all the document
+     for i in range(len(chunks['text'])):
+         print(chunks['text'][i])
+         # for p,v in enumerate(chunks['text'][i]):
+         #     print(f'{p}: {v}')
+         res = detector(chunks['text'][i], chunks['context'][i], dashboard=None)
+
+         # print(f"Num of Edits (rate) = {np.sum(df['tag'] == '<edit>')} ({edit_rate})")
+         # print(f"HC = {res['HC']}")
+         # print(f"Fisher = {res['fisher']}")
+         # print(f"Fisher (chisquared pvalue) = {res['fisher_pvalue']}")
+
+     results = res['HC']
+
+     threshold = get_threshold_obj(model_name, topic)
+     print(f"threshold: {threshold}, results: {results}")
+     return (results / threshold) - 1, res['sentences']
+
+ # Convert article text into object
+ def get_article_obj(text):
+     # Init article object
+     article_obj = {
+         'sub_titles': [{
+             'sentences': []
+         }]
+     }
+
+     nlp = spacy.load("en_core_web_sm") # Load model
+
+     for line in text.split('\n'):
+         doc = nlp(line) # Analyze text
+         sentences = [sent.text for sent in doc.sents if len(sent) >= 10] # Split it by sentence
+         for sentence in sentences:
+             sentence = re.sub(r' +', ' ', sentence) # Remove duplicate spaces
+             article_obj['sub_titles'][0]['sentences'].append({'sentence': sentence})
+
+     return article_obj
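
The only functional change in this commit is the last line of detect_human_text: instead of the hard '1'/'0' label, the function now returns the threshold-normalized score (HC / threshold) - 1 together with the per-sentence results from DetectLM. Below is a minimal caller sketch (not part of the commit), assuming the module is importable as human_text_detect, that matching nullData/<model>_<topic>.pkl and threshold_obj.pkl files exist, and that the threshold is positive; the topic name and input file are placeholders.

# Hedged usage sketch: adapting a caller to the new return value of detect_human_text.
# Assumptions (not from the commit): the module is importable as `human_text_detect`,
# 'my_topic' has entries in nullData/ and threshold_obj.pkl, and 'article.txt' exists.
from human_text_detect import detect_human_text

text = open('article.txt', encoding='utf-8').read()
score, sentences = detect_human_text('GPT2XL', 'my_topic', text)

# Old behaviour: return '1' if HC >= threshold else '0'.
# New behaviour: score = (HC / threshold) - 1, so for a positive threshold
# score >= 0 corresponds to the old '1' and score < 0 to the old '0'.
label = '1' if score >= 0 else '0'
print(f'normalized score: {score:.3f}, label: {label}')

# `sentences` is res['sentences'] from DetectLM, useful for inspecting
# which sentences drive the HC statistic.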