minko186 committed on
Commit 24a982b
2 Parent(s): 4b92a71 ca46543

Merge branch 'main' into staging + fixed url_content error

Files changed (3):
  1. ai_generate.py +22 -18
  2. app.py +10 -10
  3. humanize.py +859 -103
ai_generate.py CHANGED
@@ -42,27 +42,35 @@ vertexai.init(project="proprietary-info-detection", location="us-central1")
 gemini_client = GenerativeModel("gemini-1.5-pro-001")
 claude_client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
 
+# For GPT-4 1 word is about 1.3 tokens.
+temperature = 1.0
+max_tokens = 2048
 
 rag_llms = {
     "LLaMA 3": ChatGroq(
-        temperature=0,
+        temperature=temperature,
+        max_tokens=max_tokens,
         model_name="llama3-70b-8192",
     ),
     "OpenAI GPT 4o Mini": ChatOpenAI(
-        temperature=0,
+        temperature=temperature,
+        max_tokens=max_tokens,
         model_name="gpt-4o-mini",
     ),
     "OpenAI GPT 4o": ChatOpenAI(
-        temperature=0,
+        temperature=temperature,
+        max_tokens=max_tokens,
         model_name="gpt-4o",
     ),
     "OpenAI GPT 4": ChatOpenAI(
-        temperature=0,
+        temperature=temperature,
+        max_tokens=max_tokens,
        model_name="gpt-4-turbo",
     ),
-    "Gemini 1.5 Pro": ChatGoogleGenerativeAI(temperature=0, model="gemini-1.5-pro"),
+    "Gemini 1.5 Pro": ChatGoogleGenerativeAI(temperature=temperature, max_tokens=max_tokens, model="gemini-1.5-pro"),
     "Claude Sonnet 3.5": ChatAnthropic(
-        temperature=0,
+        temperature=temperature,
+        max_tokens=max_tokens,
         model_name="claude-3-5-sonnet-20240620",
     ),
 }
@@ -107,8 +115,8 @@ def generate_groq(text, model):
                 "content": "Please follow the instruction and write about the given topic in approximately the given number of words",
             },
         ],
-        temperature=1,
-        max_tokens=1024,
+        temperature=temperature,
+        max_tokens=max_tokens,
         stream=True,
         stop=None,
     )
@@ -124,8 +132,8 @@ def generate_openai(text, model, openai_client):
     response = openai_client.chat.completions.create(
         model=model,
         messages=message,
-        temperature=1,
-        max_tokens=1024,
+        temperature=temperature,
+        max_tokens=max_tokens,
     )
     return response.choices[0].message.content
 
@@ -138,9 +146,8 @@ def generate_gemini(text, model, gemini_client):
         generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
     }
     generation_config = {
-        "max_output_tokens": 1024,
-        "temperature": 1.0,
-        "top_p": 1.0,
+        "max_output_tokens": max_tokens,
+        "temperature": temperature,
     }
     response = gemini_client.generate_content(
         [text],
@@ -154,8 +161,8 @@ def generate_gemini(text, model, gemini_client):
 def generate_claude(text, model, claude_client):
     response = claude_client.messages.create(
         model=model,
-        max_tokens=1024,
-        temperature=1.0,
+        max_tokens=max_tokens,
+        temperature=temperature,
         system="You are helpful assistant.",
         messages=[{"role": "user", "content": [{"type": "text", "text": text}]}],
     )
@@ -165,9 +172,6 @@ def generate_claude(text, model, claude_client):
 def generate(text, model, path, api=None):
     if path:
         result = generate_rag(text, model, path)
-        if "references" not in result.lower():
-            result += "\n\n" + "References:"
-            result += "\n\n" + f"{path}"
         return result
     else:
         print(f"Generating text for {model}...")
app.py CHANGED
@@ -2,20 +2,18 @@
 nohup python3 app.py &
 """
 
-import openai
-import gradio as gr
-from typing import Dict, List
 import re
 import requests
+from typing import Dict
+from collections import defaultdict
+from datetime import date
+import gradio as gr
+from scipy.special import softmax
 import language_tool_python
+import nltk
 import torch
-from gradio_client import Client
 from transformers import GPT2LMHeadModel, GPT2TokenizerFast
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
-from scipy.special import softmax
-from collections import defaultdict
-import nltk
-from datetime import date
 
 from utils import remove_special_characters
 from plagiarism import google_search, months, domain_list, build_date
@@ -37,6 +35,9 @@ tokenizers = {
     "Polygraf AI (Advanced Model)": AutoTokenizer.from_pretrained("polygraf-ai/bc_combined_3sent"),
 }
 
+# grammar correction tool
+tool = language_tool_python.LanguageTool("en-US")
+
 
 # Function to move model to the appropriate device
 def to_device(model):
@@ -101,7 +102,6 @@ def ends_with_references(text):
 
 
 def format_and_correct_language_check(text: str) -> str:
-    tool = language_tool_python.LanguageTool("en-US")
     return tool.correct(text)
 
 
@@ -627,7 +627,7 @@ def create_interface():
                     )
                 gr.Markdown("# Search Options", elem_classes="text-center text-3xl mb-6")
                 with gr.Row():
-                    google_search_check = gr.Checkbox(label="Enable Google Search For Recent Sources", value=True)
+                    google_search_check = gr.Checkbox(label="Enable Google Search For Recent Sources", value=False)
                 with gr.Group(visible=True) as search_options:
                     with gr.Row():
                         include_sites = gr.Textbox(
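
Besides the import reordering and the Google-search checkbox now defaulting to off, the key app.py change hoists the language_tool_python.LanguageTool("en-US") instance out of format_and_correct_language_check to module scope. Constructing a LanguageTool object typically starts a local LanguageTool server, so building it once at import time and reusing it avoids paying that startup cost on every correction call. A minimal sketch of the before/after pattern:

import language_tool_python

# Before: a fresh LanguageTool instance (and server startup) on every call.
def correct_per_call(text: str) -> str:
    tool = language_tool_python.LanguageTool("en-US")
    return tool.correct(text)

# After (as in this commit): construct once at module scope, reuse everywhere.
tool = language_tool_python.LanguageTool("en-US")

def correct_shared(text: str) -> str:
    return tool.correct(text)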
humanize.py CHANGED
@@ -1,108 +1,864 @@
-import torch
-from nltk import sent_tokenize
-import nltk
-from tqdm import tqdm
+"""
+nohup python3 app.py &
+"""
+
+import re
+import requests
+from typing import Dict
+from collections import defaultdict
+from datetime import date
 import gradio as gr
-from transformers import T5ForConditionalGeneration, T5Tokenizer
-
-nltk.download("punkt")
-# autodetect the available device
-GPU_IDX = 1  # which GPU to use
-if torch.cuda.is_available():
-    num_gpus = torch.cuda.device_count()
-    print(f"Number of available GPUs: {num_gpus}")
-    assert GPU_IDX < num_gpus, f"GPU index {GPU_IDX} not available."
-    device = torch.device(f"cuda:{GPU_IDX}")
-    print(f"Using GPU: {GPU_IDX}")
-else:
-    print("CUDA is not available. Using CPU instead.")
-    device = torch.device("cpu")
-
-
-# Configuration for models and their adapters
-model_config = {
-    "Base Model": "polygraf-ai/poly-humanizer-base",
-    "Large Model": "polygraf-ai/poly-humanizer-large",
-    "XL Model": {
-        "path": "google/flan-t5-xl",
-        "adapters": {
-            "XL Model Adapter": "polygraf-ai/poly-humanizer-XL-adapter",
-            # "XL Law Model Adapter": "polygraf-ai/poly-humanizer-XL-law-adapter",
-            # "XL Marketing Model Adapter": "polygraf-ai/marketing-cleaned-13K-grad-acum-4-full",
-            # "XL Child Style Model Adapter": "polygraf-ai/poly-humanizer-XL-children-adapter-checkpoint-4000",
-        },
-    },
+from scipy.special import softmax
+import language_tool_python
+import nltk
+import torch
+from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
+
+from utils import remove_special_characters
+from plagiarism import google_search, months, domain_list, build_date
+from humanize import paraphrase_text, device
+from ai_generate import generate
+
+print(f"Using device: {device}")
+
+models = {
+    "Polygraf AI (Base Model)": AutoModelForSequenceClassification.from_pretrained(
+        "polygraf-ai/bc-roberta-openai-2sent"
+    ).to(device),
+    "Polygraf AI (Advanced Model)": AutoModelForSequenceClassification.from_pretrained(
+        "polygraf-ai/bc_combined_3sent"
+    ).to(device),
+}
+tokenizers = {
+    "Polygraf AI (Base Model)": AutoTokenizer.from_pretrained("polygraf-ai/bc-roberta-openai-2sent"),
+    "Polygraf AI (Advanced Model)": AutoTokenizer.from_pretrained("polygraf-ai/bc_combined_3sent"),
 }
 
-# cache the base models, tokenizers, and adapters
-models, tokenizers = {}, {}
-for name, config in model_config.items():
-    path = config if isinstance(config, str) else config["path"]
-    # initialize model and tokenizer
-    model = T5ForConditionalGeneration.from_pretrained(path, torch_dtype=torch.bfloat16).to(device)
-    models[name] = model
-    tokenizers[name] = T5Tokenizer.from_pretrained(path)
-    # load all avalable adapters, each being additional roughly 150M parameters
-    if isinstance(config, dict) and "adapters" in config:
-        for adapter_name, adapter_path in config["adapters"].items():
-            model.load_adapter(adapter_path, adapter_name=adapter_name)
-            print(f"Loaded adapter: {adapter_name}, Num. params: {model.num_parameters()}")
-
-
-def paraphrase_text(
-    text,
-    progress=gr.Progress(),
-    model_name="Base Model",
-    temperature=1.2,
-    repetition_penalty=1.0,
-    top_k=50,
-    length_penalty=1.0,
-):
-    progress(0, desc="Starting to Humanize")
-    progress(0.05)
-    # select the model, tokenizer and adapter
-    if "XL" in model_name:  # dynamic adapter load/unload for XL models
-        # all adapter models use the XL model as the base
-        tokenizer, model = tokenizers["XL Model"], models["XL Model"]
-        # set the adapter if it's not already set
-        if model.active_adapters() != [f"{model_name} Adapter"]:
-            model.set_adapter(f"{model_name} Adapter")
-            print(f"Using adapter: {model_name} Adapter")
-    else:
-        tokenizer = tokenizers[model_name]
-        model = models[model_name]
+# grammar correction tool
+tool = language_tool_python.LanguageTool("en-US")
+
+
+# Function to move model to the appropriate device
+def to_device(model):
+    return model.to(device)
+
+
+def copy_to_input(text):
+    return text
+
+
+def remove_bracketed_numbers(text):
+    pattern = r"^\[\d+\]"
+    cleaned_text = re.sub(pattern, "", text)
+    return cleaned_text
+
+
+def clean_text(text: str) -> str:
+    paragraphs = text.split("\n\n")
+    cleaned_paragraphs = []
+    for paragraph in paragraphs:
+        cleaned = re.sub(r"\s+", " ", paragraph).strip()
+        cleaned = re.sub(r"(?<=\.) ([a-z])", lambda x: x.group(1).upper(), cleaned)
+        cleaned_paragraphs.append(cleaned)
+    return "\n".join(cleaned_paragraphs)
+
+
+def split_text_from_refs(text: str, sep="\n"):
+    lines = text.split("\n")
+    references = []
+    article_text = []
+    index_pattern = re.compile(r"\[(\d+)\]")
+    in_references = False
+
+    for line in lines:
+        if line.strip().lower() == "references" or line.strip().lower() == "references:":
+            in_references = True
+            continue
+        if line.strip().lower().startswith("references:"):
+            in_references = True
+        if in_references:
+            matches = index_pattern.split(line)
+            for match in matches:
+                if match.strip() and not match.isdigit() and not match.strip().lower().startswith("references:"):
+                    references.append(match.strip())
+        else:
+            article_text.append(line)
+
+    formatted_refs = []
+    for i, ref in enumerate(references, 1):
+        ref = remove_bracketed_numbers(ref)
+        formatted_refs.append(f"[{i}] {ref}{sep}")
+
+    return "\n\n".join(article_text), f"{sep}{sep}References:{sep}" + f"{sep}".join(formatted_refs)
+
+
+def ends_with_references(text):
+    # Define a regular expression pattern for variations of "References:"
+    pattern = re.compile(r"\b[Rr]eferences:\s*$", re.IGNORECASE | re.MULTILINE)
+
+    # Check if the text ends with any form of "References:"
+    return bool(pattern.search(text.strip()))
 
-    # Split the text into paragraphs
+
+def format_and_correct_language_check(text: str) -> str:
+    return tool.correct(text)
+
+
+def predict(model, tokenizer, text):
+    text = remove_special_characters(text)
+    bc_token_size = 256
+    with torch.no_grad():
+        model.eval()
+        tokens = tokenizer(
+            text,
+            padding="max_length",
+            truncation=True,
+            max_length=bc_token_size,
+            return_tensors="pt",
+        ).to(device)
+        output = model(**tokens)
+        output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
+        output_norm = {"HUMAN": output_norm[0], "AI": output_norm[1]}
+        return output_norm
+
+
+def ai_generated_test(text, model="BC Original"):
+    return predict(models[model], tokenizers[model], text)
+
+
+def detection_polygraf(text, model="BC Original"):
+    # sentences = split_into_sentences(text)
+    sentences = nltk.sent_tokenize(text)
+    num_sentences = len(sentences)
+    scores = defaultdict(list)
+
+    overall_scores = []
+
+    # Process each chunk of 3 sentences and store the score for each sentence in the chunk
+    for i in range(num_sentences):
+        chunk = " ".join(sentences[i : i + 3])
+        if chunk:
+            # result = classifier(chunk)
+            result = ai_generated_test(chunk, model)
+            score = result["AI"]
+            for j in range(i, min(i + 3, num_sentences)):
+                scores[j].append(score)
+
+    # Calculate the average score for each sentence and apply color coding
     paragraphs = text.split("\n")
-    humanized_paragraphs = []
-
-    for paragraph in progress.tqdm(paragraphs, desc="Humanizing"):
-        # paraphrase each chunk of text
-        sentences = sent_tokenize(paragraph)
-        paraphrases = []
-        for sentence in sentences:
-            sentence = sentence.strip()
-            if len(sentence) == 0:
-                continue
-            inputs = tokenizer(
-                "Please paraphrase this sentence: " + sentence,
-                return_tensors="pt",
-            ).to(device)
-            outputs = model.generate(
-                **inputs,
-                do_sample=True,
-                temperature=temperature,
-                repetition_penalty=repetition_penalty,
-                max_length=128,
-                top_k=top_k,
-                length_penalty=length_penalty,
-            )
-            paraphrased_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
-            paraphrases.append(paraphrased_sentence)
-            print(f"\nOriginal: {sentence}")
-            print(f"Paraphrased: {paraphrased_sentence}")
-        combined_paraphrase = " ".join(paraphrases)
-        humanized_paragraphs.append(combined_paraphrase)
-
-    humanized_text = "\n".join(humanized_paragraphs)
-    return humanized_text
+    paragraphs = [s for s in paragraphs if s.strip()]
+    colored_paragraphs = []
+    i = 0
+    for paragraph in paragraphs:
+        temp_sentences = nltk.sent_tokenize(paragraph)
+        colored_sentences = []
+        for sentence in temp_sentences:
+            if scores[i]:
+                avg_score = sum(scores[i]) / len(scores[i])
+                if avg_score >= 0.65:
+                    colored_sentence = f"<span style='background-color:red;'>{sentence}</span>"
+                else:
+                    colored_sentence = sentence
+                colored_sentences.append(colored_sentence)
+                overall_scores.append(avg_score)
+            i = i + 1
+        combined_sentences = " ".join(colored_sentences)
+        colored_paragraphs.append(combined_sentences)
+
+    overall_score = sum(overall_scores) / len(overall_scores)
+    overall_score = {"HUMAN": 1 - overall_score, "AI": overall_score}
+    return overall_score, "<br><br>".join(colored_paragraphs)
+
+
+ai_check_options = [
+    "Polygraf AI (Base Model)",
+    "Polygraf AI (Advanced Model)",
+]
+
+
+def ai_generated_test_sapling(text: str) -> Dict:
+    response = requests.post(
+        "https://api.sapling.ai/api/v1/aidetect", json={"key": "60L9BPSVPIIOEZM0CD1DQWRBPJIUR7SB", "text": f"{text}"}
+    )
+    return {"AI": response.json()["score"], "HUMAN": 1 - response.json()["score"]}
+
+
+class GPT2PPL:
+    def __init__(self):
+        self.device = device
+        self.model = to_device(GPT2LMHeadModel.from_pretrained("gpt2"))
+        self.tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
+
+    def __call__(self, text):
+        encodings = self.tokenizer(text, return_tensors="pt")
+        encodings = {k: v.to(self.device) for k, v in encodings.items()}
+        max_length = self.model.config.n_positions
+        stride = 512
+        seq_len = encodings.input_ids.size(1)
+
+        nlls = []
+        for i in range(0, seq_len, stride):
+            begin_loc = max(i + stride - max_length, 0)
+            end_loc = min(i + stride, seq_len)
+            trg_len = end_loc - i
+            input_ids = encodings.input_ids[:, begin_loc:end_loc].to(self.device)
+            target_ids = input_ids.clone()
+            target_ids[:, :-trg_len] = -100
+
+            with torch.no_grad():
+                outputs = self.model(input_ids, labels=target_ids)
+                neg_log_likelihood = outputs.loss * trg_len
+
+            nlls.append(neg_log_likelihood)
+
+        ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
+        return {"AI": float(ppl), "HUMAN": 1 - float(ppl)}
+
+
+def ai_generated_test_gptzero(text):
+    gptzero_model = GPT2PPL()
+    result = gptzero_model(text)
+    return result, None
+
+
+def highlighter_polygraf(text, model="Polygraf AI (Base Model)"):
+    body, references = split_text_from_refs(text, "<br>")
+    score, text = detection_polygraf(text=body, model=model)
+    text = text + "<br>" + references
+    return score, text
+
+
+def ai_check(text: str, option: str):
+    if option.startswith("Polygraf AI"):
+        return highlighter_polygraf(text, option)
+    elif option == "Sapling AI":
+        return ai_generated_test_sapling(text)
+    elif option == "GPTZero":
+        return ai_generated_test_gptzero(text)
+    else:
+        return highlighter_polygraf(text, option)
+
+
+def generate_prompt(settings: Dict[str, str]) -> str:
+    prompt = f"""
+    I am a {settings['role']}
+    Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.
+
+    Style and Tone:
+    - Writing style: {settings['writing_style']}
+    - Tone: {settings['tone']}
+    - Target audience: {settings['user_category']}
+
+    Content:
+    - Depth: {settings['depth_of_content']}
+    - Structure: {', '.join(settings['structure'])}
+
+    Keywords to incorporate:
+    {', '.join(settings['keywords'])}
+
+    Additional requirements:
+    - Don't start with "Here is a...", start with the requested text directly
+    - Include {settings['num_examples']} relevant examples or case studies
+    - Incorporate data or statistics from {', '.join(settings['references'])}
+    - End with a {settings['conclusion_type']} conclusion
+    - Add a "References" section in the format "References:\n" at the end with at least 3 credible sources, formatted as [1], [2], etc. with each source on their own line
+    - Do not make any headline, title bold.
+    {settings['sources']}
+
+    Ensure proper paragraph breaks for better readability.
+    Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
+    """
+    return prompt
+
+
+def regenerate_prompt(settings: Dict[str, str]) -> str:
+    prompt = f"""
+    I am a {settings['role']}
+    "{settings['generated_article']}"
+    Edit the given text based on user comments.
+
+    Comments:
+    - Don't start with "Here is a...", start with the requested text directly
+    - {settings['user_comments']}
+    - The original content should not be changed. Make minor modifications based on user comments above.
+    - Keep the references the same as the given text in the same format.
+    - Do not make any headline, title bold.
+    {settings['sources']}
+
+    Ensure proper paragraph breaks for better readability.
+    Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
+    """
+    return prompt
+
+
+def generate_article(
+    input_role: str,
+    topic: str,
+    keywords: str,
+    article_length: str,
+    format: str,
+    writing_style: str,
+    tone: str,
+    user_category: str,
+    depth_of_content: str,
+    structure: str,
+    references: str,
+    num_examples: str,
+    conclusion_type: str,
+    ai_model: str,
+    content_string: str,
+    # api_key: str = None,
+    pdf_file_input=None,
+    generated_article: str = None,
+    user_comments: str = None,
+) -> str:
+    settings = {
+        "role": input_role,
+        "topic": topic,
+        "keywords": [k.strip() for k in keywords.split(",")],
+        "article_length": article_length,
+        "format": format,
+        "writing_style": writing_style,
+        "tone": tone,
+        "user_category": user_category,
+        "depth_of_content": depth_of_content,
+        "structure": [s.strip() for s in structure.split(",")],
+        "references": [r.strip() for r in references.split(",")],
+        "num_examples": num_examples,
+        "conclusion_type": conclusion_type,
+        "sources": content_string,
+        "generated_article": generated_article,
+        "user_comments": user_comments,
+    }
+
+    if generated_article:
+        prompt = regenerate_prompt(settings)
+    else:
+        prompt = generate_prompt(settings)
+
+    print("Generated Prompt...\n", prompt)
+    article = generate(
+        prompt,
+        ai_model,
+        pdf_file_input,  # api_key
+    )
+
+    return clean_text(article)
+
+
+def humanize(
+    text: str,
+    model: str,
+    temperature: float = 1.2,
+    repetition_penalty: float = 1,
+    top_k: int = 50,
+    length_penalty: float = 1,
+) -> str:
+    body, references = split_text_from_refs(text)
+    result = paraphrase_text(
+        text=body,
+        model_name=model,
+        temperature=temperature,
+        repetition_penalty=repetition_penalty,
+        top_k=top_k,
+        length_penalty=length_penalty,
+    )
+    result = result + "\n\n" + references
+    return format_and_correct_language_check(result)
+
+
+def update_visibility_api(model: str):
+    if model in ["OpenAI GPT 3.5", "OpenAI GPT 4"]:
+        return gr.update(visible=True)
+    else:
+        return gr.update(visible=False)
+
+
+def format_references(text: str) -> str:
+    lines = text.split("\n")
+    references = []
+    article_text = []
+    index_pattern = re.compile(r"\[(\d+)\]")
+    in_references = False
+
+    for line in lines:
+        if line.strip().lower() == "references" or line.strip().lower() == "references:":
+            in_references = True
+            continue
+        if line.strip().lower().startswith("references:"):
+            in_references = True
+        if in_references:
+            matches = index_pattern.split(line)
+            for match in matches:
+                if match.strip() and not match.isdigit() and not match.strip().lower().startswith("references:"):
+                    references.append(match.strip())
+        else:
+            article_text.append(line)
+
+    formatted_refs = []
+    for i, ref in enumerate(references, 1):
+        ref = remove_bracketed_numbers(ref)
+        formatted_refs.append(f"[{i}] {ref}\n")
+
+    return "\n\n".join(article_text) + "\n\nReferences:\n" + "\n".join(formatted_refs)
+
+
+def generate_and_format(
+    input_role,
+    topic,
+    keywords,
+    article_length,
+    format,
+    writing_style,
+    tone,
+    user_category,
+    depth_of_content,
+    structure,
+    references,
+    num_examples,
+    conclusion_type,
+    ai_model,
+    # api_key,
+    google_search_check,
+    year_from,
+    month_from,
+    day_from,
+    year_to,
+    month_to,
+    day_to,
+    domains_to_include,
+    include_sites,
+    exclude_sites,
+    pdf_file_input,
+    generated_article: str = None,
+    user_comments: str = None,
+):
+    content_string = ""
+    url_content = None
+    if google_search_check:
+        date_from = build_date(year_from, month_from, day_from)
+        date_to = build_date(year_to, month_to, day_to)
+        sorted_date = f"date:r:{date_from}:{date_to}"
+        final_query = topic
+        if include_sites:
+            site_queries = [f"site:{site.strip()}" for site in include_sites.split(",")]
+            final_query += " " + " OR ".join(site_queries)
+        if exclude_sites:
+            exclude_queries = [f"-site:{site.strip()}" for site in exclude_sites.split(",")]
+            final_query += " " + " ".join(exclude_queries)
+        print(f"Google Search Query: {final_query}")
+        url_content = google_search(final_query, sorted_date, domains_to_include)
+        content_string = "\n".join(
+            f"{url.strip()}: \n{content.strip()[:2000]}" for url, content in url_content.items()
+        )
+        content_string = (
+            "Use the trusted information here from the URLs and add them as References:\n" + content_string
+        )
+    article = generate_article(
+        input_role,
+        topic,
+        keywords,
+        article_length,
+        format,
+        writing_style,
+        tone,
+        user_category,
+        depth_of_content,
+        structure,
+        references,
+        num_examples,
+        conclusion_type,
+        ai_model,
+        content_string,
+        # api_key,
+        pdf_file_input,
+        generated_article,
+        user_comments,
+    )
+    if ends_with_references(article) and url_content is not None:
+        for url in url_content.keys():
+            article += f"\n{url}"
+
+    return format_references(article)
+
+
+def create_interface():
+    with gr.Blocks(
+        theme=gr.themes.Default(
+            primary_hue=gr.themes.colors.pink, secondary_hue=gr.themes.colors.yellow, neutral_hue=gr.themes.colors.gray
+        ),
+        css="""
+        .input-highlight-pink block_label {background-color: #008080}
+        """,
+    ) as demo:
+        today = date.today()
+        # dd/mm/YY
+        d1 = today.strftime("%d/%B/%Y")
+        d1 = d1.split("/")
+        gr.Markdown("# Polygraf AI Content Writer", elem_classes="text-center text-3xl mb-6")
+
+        with gr.Row():
+            with gr.Column(scale=2):
+                with gr.Group():
+                    gr.Markdown("## Article Configuration", elem_classes="text-xl mb-4")
+                    input_role = gr.Textbox(label="I am a", placeholder="Enter your role", value="Student")
+                    input_topic = gr.Textbox(
+                        label="Topic",
+                        placeholder="Enter the main topic of your article",
+                        elem_classes="input-highlight-pink",
+                    )
+                    input_keywords = gr.Textbox(
+                        label="Keywords",
+                        placeholder="Enter comma-separated keywords",
+                        elem_classes="input-highlight-yellow",
+                    )
+
+                    with gr.Row():
+                        input_format = gr.Dropdown(
+                            choices=[
+                                "Article",
+                                "Essay",
+                                "Blog post",
+                                "Report",
+                                "Research paper",
+                                "News article",
+                                "White paper",
+                                "LinkedIn post",
+                                "X (Twitter) post",
+                                "Instagram Video Content",
+                                "TikTok Video Content",
+                                "Facebook post",
+                            ],
+                            value="Article",
+                            label="Format",
+                            elem_classes="input-highlight-turquoise",
+                        )
+
+                        input_length = gr.Slider(
+                            minimum=50,
+                            maximum=5000,
+                            step=50,
+                            value=300,
+                            label="Article Length",
+                            elem_classes="input-highlight-pink",
+                        )
+
+                    with gr.Row():
+                        input_writing_style = gr.Dropdown(
+                            choices=[
+                                "Formal",
+                                "Informal",
+                                "Technical",
+                                "Conversational",
+                                "Journalistic",
+                                "Academic",
+                                "Creative",
+                            ],
+                            value="Formal",
+                            label="Writing Style",
+                            elem_classes="input-highlight-yellow",
+                        )
+                        input_tone = gr.Dropdown(
+                            choices=["Friendly", "Professional", "Neutral", "Enthusiastic", "Skeptical", "Humorous"],
+                            value="Professional",
+                            label="Tone",
+                            elem_classes="input-highlight-turquoise",
+                        )
+
+                    input_user_category = gr.Dropdown(
+                        choices=[
+                            "Students",
+                            "Professionals",
+                            "Researchers",
+                            "General Public",
+                            "Policymakers",
+                            "Entrepreneurs",
+                        ],
+                        value="General Public",
+                        label="Target Audience",
+                        elem_classes="input-highlight-pink",
+                    )
+                    input_depth = gr.Dropdown(
+                        choices=[
+                            "Surface-level overview",
+                            "Moderate analysis",
+                            "In-depth research",
+                            "Comprehensive study",
+                        ],
+                        value="Moderate analysis",
+                        label="Depth of Content",
+                        elem_classes="input-highlight-yellow",
+                    )
+                    input_structure = gr.Dropdown(
+                        choices=[
+                            "Introduction, Body, Conclusion",
+                            "Abstract, Introduction, Methods, Results, Discussion, Conclusion",
+                            "Executive Summary, Problem Statement, Analysis, Recommendations, Conclusion",
+                            "Introduction, Literature Review, Methodology, Findings, Analysis, Conclusion",
+                        ],
+                        value="Introduction, Body, Conclusion",
+                        label="Structure",
+                        elem_classes="input-highlight-turquoise",
+                    )
+                    input_references = gr.Dropdown(
+                        choices=[
+                            "Academic journals",
+                            "Industry reports",
+                            "Government publications",
+                            "News outlets",
+                            "Expert interviews",
+                            "Case studies",
+                        ],
+                        value="News outlets",
+                        label="References",
+                        elem_classes="input-highlight-pink",
+                    )
+                    input_num_examples = gr.Dropdown(
+                        choices=["1-2", "3-4", "5+"],
+                        value="1-2",
+                        label="Number of Examples/Case Studies",
+                        elem_classes="input-highlight-yellow",
+                    )
+                    input_conclusion = gr.Dropdown(
+                        choices=["Summary", "Call to Action", "Future Outlook", "Thought-provoking Question"],
+                        value="Call to Action",
+                        label="Conclusion Type",
+                        elem_classes="input-highlight-turquoise",
+                    )
+                gr.Markdown("# Search Options", elem_classes="text-center text-3xl mb-6")
+                with gr.Row():
+                    google_search_check = gr.Checkbox(label="Enable Google Search For Recent Sources", value=False)
+                with gr.Group(visible=True) as search_options:
+                    with gr.Row():
+                        include_sites = gr.Textbox(
+                            label="Include Specific Websites",
+                            placeholder="Enter comma-separated keywords",
+                            elem_classes="input-highlight-yellow",
+                        )
+                    with gr.Row():
+                        exclude_sites = gr.Textbox(
+                            label="Exclude Specific Websites",
+                            placeholder="Enter comma-separated keywords",
+                            elem_classes="input-highlight-yellow",
+                        )
+                    with gr.Row():
+                        domains_to_include = gr.Dropdown(
+                            domain_list,
+                            value=domain_list,
+                            multiselect=True,
+                            label="Domains To Include",
+                        )
+                    with gr.Row():
+                        month_from = gr.Dropdown(
+                            choices=months,
+                            label="From Month",
+                            value="January",
+                            interactive=True,
+                        )
+                        day_from = gr.Textbox(label="From Day", value="01")
+                        year_from = gr.Textbox(label="From Year", value="2000")
+
+                    with gr.Row():
+                        month_to = gr.Dropdown(
+                            choices=months,
+                            label="To Month",
+                            value=d1[1],
+                            interactive=True,
+                        )
+                        day_to = gr.Textbox(label="To Day", value=d1[0])
+                        year_to = gr.Textbox(label="To Year", value=d1[2])
+
+                gr.Markdown("# Add Optional PDF File with Information", elem_classes="text-center text-3xl mb-6")
+                pdf_file_input = gr.File(label="Upload PDF")
+
+                with gr.Group():
+                    gr.Markdown("## AI Model Configuration", elem_classes="text-xl mb-4")
+                    ai_generator = gr.Dropdown(
+                        choices=[
+                            "OpenAI GPT 4",
+                            "OpenAI GPT 4o",
+                            "OpenAI GPT 4o Mini",
+                            "Claude Sonnet 3.5",
+                            "Gemini 1.5 Pro",
+                            "LLaMA 3",
+                        ],
+                        value="OpenAI GPT 4o Mini",
+                        label="AI Model",
+                        elem_classes="input-highlight-pink",
+                    )
+                    # input_api = gr.Textbox(label="API Key", visible=False)
+                    # ai_generator.change(update_visibility_api, ai_generator, input_api)
+
+                generate_btn = gr.Button("Generate Article", variant="primary")
+
+                with gr.Accordion("Advanced Humanizer Settings", open=False):
+                    with gr.Row():
+                        model_dropdown = gr.Radio(
+                            choices=[
+                                "Base Model",
+                                "Large Model",
+                                "XL Model",
+                                # "XL Law Model",
+                                # "XL Marketing Model",
+                                # "XL Child Style Model",
+                            ],
+                            value="Large Model",
+                            label="Humanizer Model Version",
+                        )
+                    with gr.Row():
+                        temperature_slider = gr.Slider(
+                            minimum=0.5, maximum=2.0, step=0.1, value=1.3, label="Temperature"
+                        )
+                        top_k_slider = gr.Slider(minimum=0, maximum=300, step=25, value=50, label="Top k")
+                    with gr.Row():
+                        repetition_penalty_slider = gr.Slider(
+                            minimum=1.0, maximum=2.0, step=0.1, value=1, label="Repetition Penalty"
+                        )
+                        length_penalty_slider = gr.Slider(
+                            minimum=0.0, maximum=2.0, step=0.1, value=1.0, label="Length Penalty"
+                        )
+
+            with gr.Column(scale=3):
+                output_article = gr.Textbox(label="Generated Article", lines=20)
+                ai_comments = gr.Textbox(
+                    label="Add comments to help edit generated text", interactive=True, visible=False
+                )
+                regenerate_btn = gr.Button("Regenerate Article", variant="primary", visible=False)
+                ai_detector_dropdown = gr.Radio(
+                    choices=ai_check_options, label="Select AI Detector", value="Polygraf AI"
+                )
+                ai_check_btn = gr.Button("AI Check")
+
+                with gr.Accordion("AI Detection Results", open=True):
+                    ai_check_result = gr.Label(label="AI Check Result")
+                    highlighted_text = gr.HTML(label="Sentence Breakdown", visible=False)
+                humanize_btn = gr.Button("Humanize")
+                # humanized_output = gr.Textbox(label="Humanized Article", lines=20, elem_classes=["custom-textbox"])
+                humanized_output = gr.Markdown(label="Humanized Article", value="\n\n\n\n", render=True)
+                copy_to_input_btn = gr.Button("Copy to Input for AI Check")
+
+        def regenerate_visible(text):
+            if text:
+                return gr.update(visible=True)
+            else:
+                return gr.update(visible=False)
+
+        def highlight_visible(text):
+            if text.startswith("Polygraf"):
+                return gr.update(visible=True)
+            else:
+                return gr.update(visible=False)
+
+        def search_visible(toggle):
+            if toggle:
+                return gr.update(visible=True)
+            else:
+                return gr.update(visible=False)
+
+        google_search_check.change(search_visible, inputs=google_search_check, outputs=search_options)
+        ai_detector_dropdown.change(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)
+        output_article.change(regenerate_visible, inputs=output_article, outputs=ai_comments)
+        ai_comments.change(regenerate_visible, inputs=output_article, outputs=regenerate_btn)
+        ai_check_btn.click(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)
+
+        generate_btn.click(
+            fn=generate_and_format,
+            inputs=[
+                input_role,
+                input_topic,
+                input_keywords,
+                input_length,
+                input_format,
+                input_writing_style,
+                input_tone,
+                input_user_category,
+                input_depth,
+                input_structure,
+                input_references,
+                input_num_examples,
+                input_conclusion,
+                ai_generator,
+                # input_api,
+                google_search_check,
+                year_from,
+                month_from,
+                day_from,
+                year_to,
+                month_to,
+                day_to,
+                domains_to_include,
+                include_sites,
+                exclude_sites,
+                pdf_file_input,
+            ],
+            outputs=[output_article],
+        )
+
+        regenerate_btn.click(
+            fn=generate_and_format,
+            inputs=[
+                input_role,
+                input_topic,
+                input_keywords,
+                input_length,
+                input_format,
+                input_writing_style,
+                input_tone,
+                input_user_category,
+                input_depth,
+                input_structure,
+                input_references,
+                input_num_examples,
+                input_conclusion,
+                ai_generator,
+                # input_api,
+                google_search_check,
+                year_from,
+                month_from,
+                day_from,
+                year_to,
+                month_to,
+                day_to,
+                domains_to_include,
+                pdf_file_input,
+                output_article,
+                include_sites,
+                exclude_sites,
+                ai_comments,
+            ],
+            outputs=[output_article],
+        )
+
+        ai_check_btn.click(
+            fn=ai_check,
+            inputs=[output_article, ai_detector_dropdown],
+            outputs=[ai_check_result, highlighted_text],
+        )
+
+        humanize_btn.click(
+            fn=humanize,
+            inputs=[
+                output_article,
+                model_dropdown,
+                temperature_slider,
+                repetition_penalty_slider,
+                top_k_slider,
+                length_penalty_slider,
+            ],
+            outputs=[humanized_output],
+        )
+
+        copy_to_input_btn.click(
+            fn=copy_to_input,
+            inputs=[humanized_output],
+            outputs=[output_article],
+        )
+
+    return demo
+
+
+if __name__ == "__main__":
+    demo = create_interface()
+    # demo.launch(server_name="0.0.0.0", share=True, server_port=7890)
+    demo.launch(server_name="0.0.0.0")
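
The new humanize.py shown on the added side carries the detector and writer code; its detection_polygraf scores every overlapping 3-sentence chunk with the classifier, then averages, for each sentence, the scores of all chunks containing it, and highlights sentences whose average is at or above 0.65. A self-contained sketch of just that windowing-and-averaging step, with the classifier stubbed out (window_scores and the stub scores are illustrative, not part of the commit):

from collections import defaultdict

def window_scores(sentences, score_chunk, window=3):
    # Mirror detection_polygraf: each chunk's score is credited to every
    # sentence the chunk covers, then per-sentence scores are averaged.
    scores = defaultdict(list)
    for i in range(len(sentences)):
        chunk = " ".join(sentences[i : i + window])
        if chunk:
            s = score_chunk(chunk)
            for j in range(i, min(i + window, len(sentences))):
                scores[j].append(s)
    return [sum(scores[k]) / len(scores[k]) for k in range(len(sentences))]

# Stub standing in for ai_generated_test(chunk, model)["AI"]:
chunk_scores = iter([0.9, 0.2, 0.7, 0.1])
print(window_scores(["s1.", "s2.", "s3.", "s4."], lambda c: next(chunk_scores)))
# ~ [0.9, 0.55, 0.6, 0.33]; sentences averaging >= 0.65 get the red span.

Note that trailing chunks contain fewer than three sentences, so the last sentences are scored by fewer, shorter windows, exactly as in the committed loop.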