# PrepareSentenceContext: sentence parsing utilities for perplexity-based testing.
import logging | |
import spacy | |
import re | |
import numpy as np | |
from src.SentenceParser import SentenceParser | |
class PrepareSentenceContext(object):
    """
    Parse text and extract per-sentence length and context information.

    This information is needed for evaluating log-perplexity of the text with
    respect to a language model and later on to test the likelihood that the
    sentence was sampled from the model with the relevant context.

    Sentences may be wrapped in ``<edit>...</edit>`` tags; such sentences are
    reported with tag ``"<edit>"`` so downstream code can treat edited spans
    differently.

    Parameters
    ----------
    sentence_parser : str
        ``'spacy'`` (default) uses the spaCy ``en_core_web_sm`` pipeline;
        ``'regex'`` uses the project's lightweight ``SentenceParser``.
    context_policy : str or None
        ``'previous_sentence'`` attaches the preceding sentence as context;
        any other value yields ``None`` contexts (unless ``context`` is given).
    context : object or None
        A fixed context attached to every sentence; overrides ``context_policy``.
    """

    def __init__(self, sentence_parser='spacy', context_policy=None, context=None):
        if sentence_parser == 'spacy':
            # Pipes irrelevant to sentence segmentation are disabled for speed.
            self.nlp = spacy.load("en_core_web_sm",
                                  disable=["tagger", "attribute_ruler", "lemmatizer", "ner"])
        elif sentence_parser == 'regex':
            logging.warning("Regex-based parser is not good at breaking sentences like 'Dr. Stone', etc.")
            self.nlp = SentenceParser()
        else:
            # Previously an unknown parser name silently left self.nlp unset,
            # deferring the failure to the first call; fail fast instead.
            raise ValueError(f"Unknown sentence_parser {sentence_parser!r}; expected 'spacy' or 'regex'")
        self.sentence_parser_name = sentence_parser
        self.context_policy = context_policy
        self.context = context

    def __call__(self, text):
        return self.parse_sentences(text)

    def parse_sentences(self, text):
        """
        Split ``text`` into sentences and collect text/length/context/tag info.

        Returns
        -------
        dict
            Parallel sequences under keys ``'text'``, ``'length'``,
            ``'context'``, ``'tag'`` and ``'number_in_par'`` (1-based
            sentence index within the text).
        """
        pattern_close = r"(.*?)</edit>"
        # BUG FIX: the original pattern r"<edit>(.*?)" ended with a lazy group,
        # which always matched the empty string, so the text following an
        # opening tag was never captured and the "text and tag in the same
        # sentence" branches below could never fire.  Capture up to the next
        # tag (or to the end of the sentence) instead.
        pattern_open = r"<edit>([^<]*)"
        MIN_TOKEN_LEN = 3

        texts = []
        tags = []
        lengths = []
        contexts = []

        def update_sent(sent_text, tag, sent_length):
            # Determine the context BEFORE appending the current sentence:
            # the original appended first, so the 'previous_sentence' policy
            # returned the current sentence instead of the previous one.
            if self.context is not None:
                context = self.context
            elif self.context_policy == 'previous_sentence' and texts:
                context = texts[-1]
            else:
                context = None
            texts.append(sent_text)
            tags.append(tag)
            lengths.append(sent_length)
            contexts.append(context)

        curr_tag = None
        parsed = self.nlp(text)
        for s in parsed.sents:
            prev_tag = curr_tag
            matches_close = re.findall(pattern_close, s.text)
            matches_open = re.findall(pattern_open, s.text)
            matches_between = re.findall(r"<edit>(.*?)</edit>", s.text)
            logging.debug(f"Current sentence: {s.text}")
            logging.debug(f"Matches open: {matches_open}")
            logging.debug(f"Matches close: {matches_close}")
            logging.debug(f"Matches between: {matches_between}")
            if len(matches_close) > 0 and len(matches_open) > 0:
                logging.debug("Found an opening and a closing tag in the same sentence.")
                if prev_tag is None and len(matches_open[0]) >= MIN_TOKEN_LEN:
                    logging.debug("Opening followed by closing with some text in between.")
                    # len(s) - 2 discounts the two tag tokens.
                    update_sent(matches_open[0], "<edit>", len(s) - 2)
                    curr_tag = None
                if prev_tag == "<edit>" and len(matches_close[0]) >= MIN_TOKEN_LEN:
                    logging.warning(f"Weird case: closing/opening followed by opening in sentence {len(texts)}")
                    update_sent(matches_close[0], prev_tag, len(s) - 1)
                    curr_tag = None
                if prev_tag == "</edit>":
                    logging.debug("Closing followed by opening.")
                    curr_tag = "<edit>"
                    # Guard against an empty match list (close-before-open
                    # order in the sentence) that previously raised IndexError.
                    if matches_between and len(matches_between[0]) > MIN_TOKEN_LEN:
                        update_sent(matches_between[0], None, len(s) - 2)
            elif len(matches_open) > 0:
                curr_tag = "<edit>"
                assert prev_tag is None, f"Found an opening tag without a closing tag in sentence num. {len(texts)}"
                if len(matches_open[0]) >= MIN_TOKEN_LEN:
                    # Text and tag are in the same sentence.
                    update_sent(matches_open[0], curr_tag, len(s) - 1)
            elif len(matches_close) > 0:
                curr_tag = "</edit>"
                assert prev_tag == "<edit>", f"Found a closing tag without an opening tag in sentence num. {len(texts)}"
                if len(matches_close[0]) >= MIN_TOKEN_LEN:
                    # Text and tag are in the same sentence.
                    update_sent(matches_close[0], prev_tag, len(s) - 1)
                curr_tag = None
            else:
                # No tag anywhere in this sentence; inherit the current tag.
                update_sent(s.text, curr_tag, len(s))
        return {'text': texts, 'length': lengths, 'context': contexts, 'tag': tags,
                'number_in_par': np.arange(1, 1 + len(texts))}

    def REMOVE_parse_sentences(self, text):
        """
        Deprecated variant of :meth:`parse_sentences` kept for reference.

        Rewrites the input so tags land on their own sentences, then tracks
        open/close tags sentence by sentence.
        """
        texts = []
        contexts = []
        lengths = []
        tags = []
        num_in_par = []
        previous = None
        # Raw string: the original used "\." and "\s" inside a plain string,
        # which relies on unknown-escape passthrough (DeprecationWarning on
        # modern interpreters).  Make sure tags are in separate sentences.
        text = re.sub(r"(</?[a-zA-Z0-9 ]+>\.?)\s+", r"\1.\n", text)
        parsed = self.nlp(text)
        running_sent_num = 0
        curr_tag = None
        for i, sent in enumerate(parsed.sents):
            # Here we try to track HTML-like tags.  There might be some issues
            # because the spacy sentence parser has unexpected behavior when it
            # comes to newlines.
            all_tags = re.findall(r"(</?[a-zA-Z0-9 ]+>)", str(sent))
            if len(all_tags) > 1:
                # BUG FIX: exit(1) killed the whole interpreter from library
                # code; raise instead so callers can handle the error.
                raise ValueError(f"More than one tag in sentence {i}: {all_tags}")
            if len(all_tags) == 1:
                tag = all_tags[0]
                if tag[:2] == '</':  # a closing tag
                    if curr_tag is None:
                        logging.warning(f"Closing tag without an opening tag in sentence {i}: {sent}")
                    else:
                        curr_tag = None
                else:
                    if curr_tag is not None:
                        logging.warning(f"Opening tag without a closing tag in sentence {i}: {sent}")
                    else:
                        curr_tag = tag
            else:  # if text is not a tag
                sent_text = str(sent)
                sent_length = len(sent)
                texts.append(sent_text)
                running_sent_num += 1
                num_in_par.append(running_sent_num)
                tags.append(curr_tag)
                lengths.append(sent_length)
                if self.context is not None:
                    context = self.context
                elif self.context_policy is None:
                    context = None
                elif self.context_policy == 'previous_sentence':
                    context = previous
                    previous = sent_text
                else:
                    context = None
                contexts.append(context)
        return {'text': texts, 'length': lengths, 'context': contexts, 'tag': tags,
                'number_in_par': num_in_par}