minko186 commited on
Commit
4b92a71
·
1 Parent(s): 38c42ed

add RAG for all models, optimize references-related functions

Browse files
Files changed (4) hide show
  1. ai_generate.py +51 -26
  2. app.py +77 -68
  3. plagiarism.py +5 -3
  4. requirements.txt +5 -1
ai_generate.py CHANGED
@@ -21,6 +21,9 @@ from langchain_core.output_parsers import StrOutputParser
21
  from langchain_core.runnables import RunnablePassthrough
22
  from langchain.chains import RetrievalQA
23
  from langchain_groq import ChatGroq
 
 
 
24
  from dotenv import load_dotenv
25
 
26
  load_dotenv()
@@ -40,6 +43,30 @@ gemini_client = GenerativeModel("gemini-1.5-pro-001")
40
  claude_client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
41
 
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
  def create_db_with_langchain(path):
45
  loader = PyMuPDFLoader(path)
@@ -56,11 +83,9 @@ def create_db_with_langchain(path):
56
  return db
57
 
58
 
59
- def generate_groq_rag(text, model, path):
60
- llm = ChatGroq(
61
- temperature=0,
62
- model_name=model,
63
- )
64
  db = create_db_with_langchain(path)
65
  retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": 4, "fetch_k": 20})
66
  prompt = hub.pull("rlm/rag-prompt")
@@ -72,7 +97,7 @@ def generate_groq_rag(text, model, path):
72
  return rag_chain.invoke(text).content
73
 
74
 
75
- def generate_groq_base(text, model):
76
  completion = groq_client.chat.completions.create(
77
  model=model,
78
  messages=[
@@ -94,13 +119,6 @@ def generate_groq_base(text, model):
94
  return response
95
 
96
 
97
- def generate_groq(text, model, path):
98
- if path:
99
- return generate_groq_rag(text, model, path)
100
- else:
101
- return generate_groq_base(text, model)
102
-
103
-
104
  def generate_openai(text, model, openai_client):
105
  message = [{"role": "user", "content": text}]
106
  response = openai_client.chat.completions.create(
@@ -145,16 +163,23 @@ def generate_claude(text, model, claude_client):
145
 
146
 
147
  def generate(text, model, path, api=None):
148
-
149
- if model == "LLaMA 3":
150
- return generate_groq(text, "llama3-70b-8192", path)
151
- elif model == "OpenAI GPT 4o Mini":
152
- return generate_openai(text, "gpt-4o-mini", openai_client)
153
- elif model == "OpenAI GPT 4o":
154
- return generate_openai(text, "gpt-4o", openai_client)
155
- elif model == "OpenAI GPT 4":
156
- return generate_openai(text, "gpt-4-turbo", openai_client)
157
- elif model == "Gemini 1.5 Pro":
158
- return generate_gemini(text, "", gemini_client)
159
- elif model == "Claude Sonnet 3.5":
160
- return generate_claude(text, "claude-3-5-sonnet-20240620", claude_client)
 
 
 
 
 
 
 
 
21
  from langchain_core.runnables import RunnablePassthrough
22
  from langchain.chains import RetrievalQA
23
  from langchain_groq import ChatGroq
24
+ from langchain_openai import ChatOpenAI
25
+ from langchain_google_genai import ChatGoogleGenerativeAI
26
+ from langchain_anthropic import ChatAnthropic
27
  from dotenv import load_dotenv
28
 
29
  load_dotenv()
 
43
  claude_client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
44
 
45
 
46
+ rag_llms = {
47
+ "LLaMA 3": ChatGroq(
48
+ temperature=0,
49
+ model_name="llama3-70b-8192",
50
+ ),
51
+ "OpenAI GPT 4o Mini": ChatOpenAI(
52
+ temperature=0,
53
+ model_name="gpt-4o-mini",
54
+ ),
55
+ "OpenAI GPT 4o": ChatOpenAI(
56
+ temperature=0,
57
+ model_name="gpt-4o",
58
+ ),
59
+ "OpenAI GPT 4": ChatOpenAI(
60
+ temperature=0,
61
+ model_name="gpt-4-turbo",
62
+ ),
63
+ "Gemini 1.5 Pro": ChatGoogleGenerativeAI(temperature=0, model="gemini-1.5-pro"),
64
+ "Claude Sonnet 3.5": ChatAnthropic(
65
+ temperature=0,
66
+ model_name="claude-3-5-sonnet-20240620",
67
+ ),
68
+ }
69
+
70
 
71
  def create_db_with_langchain(path):
72
  loader = PyMuPDFLoader(path)
 
83
  return db
84
 
85
 
86
+ def generate_rag(text, model, path):
87
+ print(f"Generating text using RAG for {model}...")
88
+ llm = rag_llms[model]
 
 
89
  db = create_db_with_langchain(path)
90
  retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": 4, "fetch_k": 20})
91
  prompt = hub.pull("rlm/rag-prompt")
 
97
  return rag_chain.invoke(text).content
98
 
99
 
100
+ def generate_groq(text, model):
101
  completion = groq_client.chat.completions.create(
102
  model=model,
103
  messages=[
 
119
  return response
120
 
121
 
 
 
 
 
 
 
 
122
  def generate_openai(text, model, openai_client):
123
  message = [{"role": "user", "content": text}]
124
  response = openai_client.chat.completions.create(
 
163
 
164
 
165
  def generate(text, model, path, api=None):
166
+ if path:
167
+ result = generate_rag(text, model, path)
168
+ if "references" not in result.lower():
169
+ result += "\n\n" + "References:"
170
+ result += "\n\n" + f"{path}"
171
+ return result
172
+ else:
173
+ print(f"Generating text for {model}...")
174
+ if model == "LLaMA 3":
175
+ return generate_groq(text, "llama3-70b-8192")
176
+ elif model == "OpenAI GPT 4o Mini":
177
+ return generate_openai(text, "gpt-4o-mini", openai_client)
178
+ elif model == "OpenAI GPT 4o":
179
+ return generate_openai(text, "gpt-4o", openai_client)
180
+ elif model == "OpenAI GPT 4":
181
+ return generate_openai(text, "gpt-4-turbo", openai_client)
182
+ elif model == "Gemini 1.5 Pro":
183
+ return generate_gemini(text, "", gemini_client)
184
+ elif model == "Claude Sonnet 3.5":
185
+ return generate_claude(text, "claude-3-5-sonnet-20240620", claude_client)
app.py CHANGED
@@ -63,23 +63,41 @@ def clean_text(text: str) -> str:
63
  return "\n".join(cleaned_paragraphs)
64
 
65
 
66
- def format_and_correct(text: str) -> str:
67
- prompt = f"""
68
- Please correct the formatting, grammar, and spelling errors in the following text without changing its content significantly. Ensure proper paragraph breaks and maintain the original content:
69
- {text}
70
- """
71
- corrected_text = generate(prompt, "Llama 3", None)
72
- return clean_text(corrected_text)
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
- def format_and_correct_para(text: str) -> str:
76
- paragraphs = text.split("\n")
77
- corrected_paragraphs = []
78
- for paragraph in paragraphs:
79
- corrected = format_and_correct(paragraph)
80
- corrected_paragraphs.append(corrected)
81
- corrected_text = "\n\n".join(corrected_paragraphs)
82
- return corrected_text
 
 
 
 
 
 
83
 
84
 
85
  def format_and_correct_language_check(text: str) -> str:
@@ -109,7 +127,7 @@ def ai_generated_test(text, model="BC Original"):
109
  return predict(models[model], tokenizers[model], text)
110
 
111
 
112
- def process_text(text, model="BC Original"):
113
  # sentences = split_into_sentences(text)
114
  sentences = nltk.sent_tokenize(text)
115
  num_sentences = len(sentences)
@@ -146,12 +164,11 @@ def process_text(text, model="BC Original"):
146
  overall_scores.append(avg_score)
147
  i = i + 1
148
  combined_sentences = " ".join(colored_sentences)
149
- print(combined_sentences)
150
  colored_paragraphs.append(combined_sentences)
151
 
152
  overall_score = sum(overall_scores) / len(overall_scores)
153
  overall_score = {"HUMAN": 1 - overall_score, "AI": overall_score}
154
- return overall_score, format_references("<br><br>".join(colored_paragraphs))
155
 
156
 
157
  ai_check_options = [
@@ -202,12 +219,14 @@ class GPT2PPL:
202
  def ai_generated_test_gptzero(text):
203
  gptzero_model = GPT2PPL()
204
  result = gptzero_model(text)
205
- print(result)
206
  return result, None
207
 
208
 
209
  def highlighter_polygraf(text, model="Polygraf AI (Base Model)"):
210
- return process_text(text=text, model=model)
 
 
 
211
 
212
 
213
  def ai_check(text: str, option: str):
@@ -224,7 +243,6 @@ def ai_check(text: str, option: str):
224
  def generate_prompt(settings: Dict[str, str]) -> str:
225
  prompt = f"""
226
  I am a {settings['role']}
227
-
228
  Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.
229
 
230
  Style and Tone:
@@ -240,10 +258,11 @@ def generate_prompt(settings: Dict[str, str]) -> str:
240
  {', '.join(settings['keywords'])}
241
 
242
  Additional requirements:
 
243
  - Include {settings['num_examples']} relevant examples or case studies
244
  - Incorporate data or statistics from {', '.join(settings['references'])}
245
  - End with a {settings['conclusion_type']} conclusion
246
- - Add a "References" section at the end with at least 3 credible sources, formatted as [1], [2], etc.
247
  - Do not make any headline, title bold.
248
  {settings['sources']}
249
 
@@ -256,12 +275,11 @@ def generate_prompt(settings: Dict[str, str]) -> str:
256
  def regenerate_prompt(settings: Dict[str, str]) -> str:
257
  prompt = f"""
258
  I am a {settings['role']}
259
-
260
  "{settings['generated_article']}"
261
-
262
  Edit the given text based on user comments.
263
 
264
  Comments:
 
265
  - {settings['user_comments']}
266
  - The original content should not be changed. Make minor modifications based on user comments above.
267
  - Keep the references the same as the given text in the same format.
@@ -319,30 +337,12 @@ def generate_article(
319
  else:
320
  prompt = generate_prompt(settings)
321
 
322
- print(prompt)
323
- # TODO: Why do we need this ??
324
- if ai_model in ["OpenAI GPT 3.5"]:
325
- response = openai.ChatCompletion.create(
326
- model="gpt-4" if ai_model == "OpenAI GPT 4" else "gpt-3.5-turbo",
327
- messages=[
328
- {
329
- "role": "system",
330
- "content": "You are a professional content writer with expertise in various fields.",
331
- },
332
- {"role": "user", "content": prompt},
333
- ],
334
- max_tokens=3000,
335
- n=1,
336
- stop=None,
337
- temperature=0.7,
338
- )
339
- article = response.choices[0].message.content.strip()
340
- else:
341
- article = generate(
342
- prompt,
343
- ai_model,
344
- pdf_file_input, # api_key
345
- )
346
 
347
  return clean_text(article)
348
 
@@ -355,14 +355,16 @@ def humanize(
355
  top_k: int = 50,
356
  length_penalty: float = 1,
357
  ) -> str:
 
358
  result = paraphrase_text(
359
- text=text,
360
  model_name=model,
361
  temperature=temperature,
362
  repetition_penalty=repetition_penalty,
363
  top_k=top_k,
364
  length_penalty=length_penalty,
365
  )
 
366
  return format_and_correct_language_check(result)
367
 
368
 
@@ -377,18 +379,20 @@ def format_references(text: str) -> str:
377
  lines = text.split("\n")
378
  references = []
379
  article_text = []
 
380
  in_references = False
381
 
382
  for line in lines:
383
- if (
384
- line.strip().lower() == "references"
385
- or line.strip().lower() == "references:"
386
- or line.strip().lower().startswith("references:")
387
- ):
388
  in_references = True
389
  continue
 
 
390
  if in_references:
391
- references.append(line.strip())
 
 
 
392
  else:
393
  article_text.append(line)
394
 
@@ -430,25 +434,26 @@ def generate_and_format(
430
  generated_article: str = None,
431
  user_comments: str = None,
432
  ):
433
- date_from = build_date(year_from, month_from, day_from)
434
- date_to = build_date(year_to, month_to, day_to)
435
- sorted_date = f"date:r:{date_from}:{date_to}"
436
  content_string = ""
437
- final_query = topic
438
- if include_sites:
439
- site_queries = [f"site:{site.strip()}" for site in include_sites.split(",")]
440
- final_query += " " + " OR ".join(site_queries)
441
- if exclude_sites:
442
- exclude_queries = [f"-site:{site.strip()}" for site in exclude_sites.split(",")]
443
- final_query += " " + " ".join(exclude_queries)
444
- print(f"Final Query: {final_query}")
445
-
446
  if google_search_check:
 
 
 
 
 
 
 
 
 
 
 
447
  url_content = google_search(final_query, sorted_date, domains_to_include)
448
  content_string = "\n".join(
449
  f"{url.strip()}: \n{content.strip()[:2000]}" for url, content in url_content.items()
450
  )
451
- content_string = "Use the trusted information here from the URLs I've found for you:\n" + content_string
 
 
452
  article = generate_article(
453
  input_role,
454
  topic,
@@ -470,6 +475,10 @@ def generate_and_format(
470
  generated_article,
471
  user_comments,
472
  )
 
 
 
 
473
  return format_references(article)
474
 
475
 
 
63
  return "\n".join(cleaned_paragraphs)
64
 
65
 
66
+ def split_text_from_refs(text: str, sep="\n"):
67
+ lines = text.split("\n")
68
+ references = []
69
+ article_text = []
70
+ index_pattern = re.compile(r"\[(\d+)\]")
71
+ in_references = False
 
72
 
73
+ for line in lines:
74
+ if line.strip().lower() == "references" or line.strip().lower() == "references:":
75
+ in_references = True
76
+ continue
77
+ if line.strip().lower().startswith("references:"):
78
+ in_references = True
79
+ if in_references:
80
+ matches = index_pattern.split(line)
81
+ for match in matches:
82
+ if match.strip() and not match.isdigit() and not match.strip().lower().startswith("references:"):
83
+ references.append(match.strip())
84
+ else:
85
+ article_text.append(line)
86
 
87
+ formatted_refs = []
88
+ for i, ref in enumerate(references, 1):
89
+ ref = remove_bracketed_numbers(ref)
90
+ formatted_refs.append(f"[{i}] {ref}{sep}")
91
+
92
+ return "\n\n".join(article_text), f"{sep}{sep}References:{sep}" + f"{sep}".join(formatted_refs)
93
+
94
+
95
+ def ends_with_references(text):
96
+ # Define a regular expression pattern for variations of "References:"
97
+ pattern = re.compile(r"\b[Rr]eferences:\s*$", re.IGNORECASE | re.MULTILINE)
98
+
99
+ # Check if the text ends with any form of "References:"
100
+ return bool(pattern.search(text.strip()))
101
 
102
 
103
  def format_and_correct_language_check(text: str) -> str:
 
127
  return predict(models[model], tokenizers[model], text)
128
 
129
 
130
+ def detection_polygraf(text, model="BC Original"):
131
  # sentences = split_into_sentences(text)
132
  sentences = nltk.sent_tokenize(text)
133
  num_sentences = len(sentences)
 
164
  overall_scores.append(avg_score)
165
  i = i + 1
166
  combined_sentences = " ".join(colored_sentences)
 
167
  colored_paragraphs.append(combined_sentences)
168
 
169
  overall_score = sum(overall_scores) / len(overall_scores)
170
  overall_score = {"HUMAN": 1 - overall_score, "AI": overall_score}
171
+ return overall_score, "<br><br>".join(colored_paragraphs)
172
 
173
 
174
  ai_check_options = [
 
219
  def ai_generated_test_gptzero(text):
220
  gptzero_model = GPT2PPL()
221
  result = gptzero_model(text)
 
222
  return result, None
223
 
224
 
225
  def highlighter_polygraf(text, model="Polygraf AI (Base Model)"):
226
+ body, references = split_text_from_refs(text, "<br>")
227
+ score, text = detection_polygraf(text=body, model=model)
228
+ text = text + "<br>" + references
229
+ return score, text
230
 
231
 
232
  def ai_check(text: str, option: str):
 
243
  def generate_prompt(settings: Dict[str, str]) -> str:
244
  prompt = f"""
245
  I am a {settings['role']}
 
246
  Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.
247
 
248
  Style and Tone:
 
258
  {', '.join(settings['keywords'])}
259
 
260
  Additional requirements:
261
+ - Don't start with "Here is a...", start with the requested text directly
262
  - Include {settings['num_examples']} relevant examples or case studies
263
  - Incorporate data or statistics from {', '.join(settings['references'])}
264
  - End with a {settings['conclusion_type']} conclusion
265
+ - Add a "References" section in the format "References:\n" at the end with at least 3 credible sources, formatted as [1], [2], etc. with each source on their own line
266
  - Do not make any headline, title bold.
267
  {settings['sources']}
268
 
 
275
  def regenerate_prompt(settings: Dict[str, str]) -> str:
276
  prompt = f"""
277
  I am a {settings['role']}
 
278
  "{settings['generated_article']}"
 
279
  Edit the given text based on user comments.
280
 
281
  Comments:
282
+ - Don't start with "Here is a...", start with the requested text directly
283
  - {settings['user_comments']}
284
  - The original content should not be changed. Make minor modifications based on user comments above.
285
  - Keep the references the same as the given text in the same format.
 
337
  else:
338
  prompt = generate_prompt(settings)
339
 
340
+ print("Generated Prompt...\n", prompt)
341
+ article = generate(
342
+ prompt,
343
+ ai_model,
344
+ pdf_file_input, # api_key
345
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346
 
347
  return clean_text(article)
348
 
 
355
  top_k: int = 50,
356
  length_penalty: float = 1,
357
  ) -> str:
358
+ body, references = split_text_from_refs(text)
359
  result = paraphrase_text(
360
+ text=body,
361
  model_name=model,
362
  temperature=temperature,
363
  repetition_penalty=repetition_penalty,
364
  top_k=top_k,
365
  length_penalty=length_penalty,
366
  )
367
+ result = result + "\n\n" + references
368
  return format_and_correct_language_check(result)
369
 
370
 
 
379
  lines = text.split("\n")
380
  references = []
381
  article_text = []
382
+ index_pattern = re.compile(r"\[(\d+)\]")
383
  in_references = False
384
 
385
  for line in lines:
386
+ if line.strip().lower() == "references" or line.strip().lower() == "references:":
 
 
 
 
387
  in_references = True
388
  continue
389
+ if line.strip().lower().startswith("references:"):
390
+ in_references = True
391
  if in_references:
392
+ matches = index_pattern.split(line)
393
+ for match in matches:
394
+ if match.strip() and not match.isdigit() and not match.strip().lower().startswith("references:"):
395
+ references.append(match.strip())
396
  else:
397
  article_text.append(line)
398
 
 
434
  generated_article: str = None,
435
  user_comments: str = None,
436
  ):
 
 
 
437
  content_string = ""
 
 
 
 
 
 
 
 
 
438
  if google_search_check:
439
+ date_from = build_date(year_from, month_from, day_from)
440
+ date_to = build_date(year_to, month_to, day_to)
441
+ sorted_date = f"date:r:{date_from}:{date_to}"
442
+ final_query = topic
443
+ if include_sites:
444
+ site_queries = [f"site:{site.strip()}" for site in include_sites.split(",")]
445
+ final_query += " " + " OR ".join(site_queries)
446
+ if exclude_sites:
447
+ exclude_queries = [f"-site:{site.strip()}" for site in exclude_sites.split(",")]
448
+ final_query += " " + " ".join(exclude_queries)
449
+ print(f"Google Search Query: {final_query}")
450
  url_content = google_search(final_query, sorted_date, domains_to_include)
451
  content_string = "\n".join(
452
  f"{url.strip()}: \n{content.strip()[:2000]}" for url, content in url_content.items()
453
  )
454
+ content_string = (
455
+ "Use the trusted information here from the URLs and add them as References:\n" + content_string
456
+ )
457
  article = generate_article(
458
  input_role,
459
  topic,
 
475
  generated_article,
476
  user_comments,
477
  )
478
+ if ends_with_references(article) and url_content is not None:
479
+ for url in url_content.keys():
480
+ article += f"\n{url}"
481
+
482
  return format_references(article)
483
 
484
 
plagiarism.py CHANGED
@@ -15,7 +15,8 @@ def clean_html(text):
15
  result += article.title + "\n"
16
  paragraphs = justext.justext(text, justext.get_stoplist("English"))
17
  for paragraph in paragraphs:
18
- result += paragraph.text
 
19
  return result
20
 
21
 
@@ -128,6 +129,7 @@ def google_search(
128
  break
129
  if soup:
130
  text = clean_html(soup.text)
131
- result_content[url] = text
132
- count += 1
 
133
  return result_content
 
15
  result += article.title + "\n"
16
  paragraphs = justext.justext(text, justext.get_stoplist("English"))
17
  for paragraph in paragraphs:
18
+ if not paragraph.is_boilerplate:
19
+ result += paragraph.text
20
  return result
21
 
22
 
 
129
  break
130
  if soup:
131
  text = clean_html(soup.text)
132
+ if len(text) > 500:
133
+ result_content[url] = text
134
+ count += 1
135
  return result_content
requirements.txt CHANGED
@@ -21,4 +21,8 @@ pymupdf
21
  chromadb
22
  language-tool-python
23
  anthropic
24
- google-generativeai
 
 
 
 
 
21
  chromadb
22
  language-tool-python
23
  anthropic
24
+ google-generativeai
25
+ langchain-google-genai
26
+ langchain-anthropic
27
+ langchain-openai
28
+ vertexai