eljanmahammadli committed on
Commit
34b1950
·
1 Parent(s): a97d561

enabled RAG for all LLMs + prompt improvements

Browse files
Files changed (4) hide show
  1. ai_generate.py +52 -26
  2. app.py +78 -68
  3. plagiarism.py +5 -3
  4. requirements.txt +3 -0
ai_generate.py CHANGED
@@ -21,6 +21,9 @@ from langchain_core.output_parsers import StrOutputParser
21
  from langchain_core.runnables import RunnablePassthrough
22
  from langchain.chains import RetrievalQA
23
  from langchain_groq import ChatGroq
 
 
 
24
  from dotenv import load_dotenv
25
 
26
  load_dotenv()
@@ -40,6 +43,31 @@ gemini_client = GenerativeModel("gemini-1.5-pro-001")
40
  claude_client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
41
 
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  def create_db_with_langchain(path):
44
  loader = PyMuPDFLoader(path)
45
  data = loader.load()
@@ -55,11 +83,9 @@ def create_db_with_langchain(path):
55
  return db
56
 
57
 
58
- def generate_groq_rag(text, model, path):
59
- llm = ChatGroq(
60
- temperature=0,
61
- model_name=model,
62
- )
63
  db = create_db_with_langchain(path)
64
  retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": 4, "fetch_k": 20})
65
  prompt = hub.pull("rlm/rag-prompt")
@@ -71,7 +97,7 @@ def generate_groq_rag(text, model, path):
71
  return rag_chain.invoke(text).content
72
 
73
 
74
- def generate_groq_base(text, model):
75
  completion = groq_client.chat.completions.create(
76
  model=model,
77
  messages=[
@@ -93,13 +119,6 @@ def generate_groq_base(text, model):
93
  return response
94
 
95
 
96
- def generate_groq(text, model, path):
97
- if path:
98
- return generate_groq_rag(text, model, path)
99
- else:
100
- return generate_groq_base(text, model)
101
-
102
-
103
  def generate_openai(text, model, openai_client):
104
  message = [{"role": "user", "content": text}]
105
  response = openai_client.chat.completions.create(
@@ -144,16 +163,23 @@ def generate_claude(text, model, claude_client):
144
 
145
 
146
  def generate(text, model, path, api=None):
147
-
148
- if model == "LLaMA 3":
149
- return generate_groq(text, "llama3-70b-8192", path)
150
- elif model == "OpenAI GPT 4o Mini":
151
- return generate_openai(text, "gpt-4o-mini", openai_client)
152
- elif model == "OpenAI GPT 4o":
153
- return generate_openai(text, "gpt-4o", openai_client)
154
- elif model == "OpenAI GPT 4":
155
- return generate_openai(text, "gpt-4-turbo", openai_client)
156
- elif model == "Gemini 1.5 Pro":
157
- return generate_gemini(text, "", gemini_client)
158
- elif model == "Claude Sonnet 3.5":
159
- return generate_claude(text, "claude-3-5-sonnet-20240620", claude_client)
 
 
 
 
 
 
 
 
21
  from langchain_core.runnables import RunnablePassthrough
22
  from langchain.chains import RetrievalQA
23
  from langchain_groq import ChatGroq
24
+ from langchain_openai import ChatOpenAI
25
+ from langchain_google_genai import ChatGoogleGenerativeAI
26
+ from langchain_anthropic import ChatAnthropic
27
  from dotenv import load_dotenv
28
 
29
  load_dotenv()
 
43
  claude_client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
44
 
45
 
46
# Deterministic (temperature=0) chat models used for RAG, keyed by the
# model name shown in the UI. All clients are instantiated eagerly at
# import time — NOTE(review): this presumably requires every provider's
# API key to be present in the environment; confirm before deploying.
# Gemini's constructor takes `model=` while the others take `model_name=`.
_RAG_MODEL_SPECS = (
    ("LLaMA 3", ChatGroq, {"model_name": "llama3-70b-8192"}),
    ("OpenAI GPT 4o Mini", ChatOpenAI, {"model_name": "gpt-4o-mini"}),
    ("OpenAI GPT 4o", ChatOpenAI, {"model_name": "gpt-4o"}),
    ("OpenAI GPT 4", ChatOpenAI, {"model_name": "gpt-4-turbo"}),
    ("Gemini 1.5 Pro", ChatGoogleGenerativeAI, {"model": "gemini-1.5-pro"}),
    ("Claude Sonnet 3.5", ChatAnthropic, {"model_name": "claude-3-5-sonnet-20240620"}),
)

rag_llms = {
    ui_name: factory(temperature=0, **kwargs)
    for ui_name, factory, kwargs in _RAG_MODEL_SPECS
}
69
+
70
+
71
  def create_db_with_langchain(path):
72
  loader = PyMuPDFLoader(path)
73
  data = loader.load()
 
83
  return db
84
 
85
 
86
+ def generate_rag(text, model, path):
87
+ print(f"Generating text using RAG for {model}...")
88
+ llm = rag_llms[model]
 
 
89
  db = create_db_with_langchain(path)
90
  retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": 4, "fetch_k": 20})
91
  prompt = hub.pull("rlm/rag-prompt")
 
97
  return rag_chain.invoke(text).content
98
 
99
 
100
+ def generate_groq(text, model):
101
  completion = groq_client.chat.completions.create(
102
  model=model,
103
  messages=[
 
119
  return response
120
 
121
 
 
 
 
 
 
 
 
122
  def generate_openai(text, model, openai_client):
123
  message = [{"role": "user", "content": text}]
124
  response = openai_client.chat.completions.create(
 
163
 
164
 
165
def generate(text, model, path, api=None):
    """Generate text with the selected LLM, optionally grounded on a PDF via RAG.

    Args:
        text: The prompt to send to the model.
        model: UI-facing model name (e.g. "LLaMA 3", "OpenAI GPT 4o").
        path: Optional path to a PDF; when given, the answer is produced with
            RAG over that document instead of a plain completion.
        api: Unused; kept for backward compatibility with existing callers.

    Returns:
        The generated text. For RAG answers that lack a "References" section,
        one is appended containing the grounding document's path.

    Raises:
        ValueError: If ``model`` is not a recognized model name (previously an
            unknown model silently returned ``None``).
    """
    if path:
        result = generate_rag(text, model, path)
        # Ensure the RAG answer always carries a References section pointing
        # at the source document.
        if "references" not in result.lower():
            result += "\n\nReferences:"
        result += f"\n\n{path}"
        return result

    print(f"Generating text for {model}...")
    # Map the UI-facing model name to the concrete provider call.
    if model == "LLaMA 3":
        return generate_groq(text, "llama3-70b-8192")
    elif model == "OpenAI GPT 4o Mini":
        return generate_openai(text, "gpt-4o-mini", openai_client)
    elif model == "OpenAI GPT 4o":
        return generate_openai(text, "gpt-4o", openai_client)
    elif model == "OpenAI GPT 4":
        return generate_openai(text, "gpt-4-turbo", openai_client)
    elif model == "Gemini 1.5 Pro":
        return generate_gemini(text, "", gemini_client)
    elif model == "Claude Sonnet 3.5":
        return generate_claude(text, "claude-3-5-sonnet-20240620", claude_client)
    raise ValueError(f"Unknown model: {model!r}")
app.py CHANGED
@@ -1,6 +1,7 @@
1
  """
2
  nohup python3 app.py &
3
  """
 
4
  import openai
5
  import gradio as gr
6
  from typing import Dict, List
@@ -62,23 +63,41 @@ def clean_text(text: str) -> str:
62
  return "\n".join(cleaned_paragraphs)
63
 
64
 
65
- def format_and_correct(text: str) -> str:
66
- prompt = f"""
67
- Please correct the formatting, grammar, and spelling errors in the following text without changing its content significantly. Ensure proper paragraph breaks and maintain the original content:
68
- {text}
69
- """
70
- corrected_text = generate(prompt, "Llama 3", None)
71
- return clean_text(corrected_text)
72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
- def format_and_correct_para(text: str) -> str:
75
- paragraphs = text.split("\n")
76
- corrected_paragraphs = []
77
- for paragraph in paragraphs:
78
- corrected = format_and_correct(paragraph)
79
- corrected_paragraphs.append(corrected)
80
- corrected_text = "\n\n".join(corrected_paragraphs)
81
- return corrected_text
 
 
 
 
 
 
82
 
83
 
84
  def format_and_correct_language_check(text: str) -> str:
@@ -108,7 +127,7 @@ def ai_generated_test(text, model="BC Original"):
108
  return predict(models[model], tokenizers[model], text)
109
 
110
 
111
- def process_text(text, model="BC Original"):
112
  # sentences = split_into_sentences(text)
113
  sentences = nltk.sent_tokenize(text)
114
  num_sentences = len(sentences)
@@ -145,12 +164,11 @@ def process_text(text, model="BC Original"):
145
  overall_scores.append(avg_score)
146
  i = i + 1
147
  combined_sentences = " ".join(colored_sentences)
148
- print(combined_sentences)
149
  colored_paragraphs.append(combined_sentences)
150
 
151
  overall_score = sum(overall_scores) / len(overall_scores)
152
  overall_score = {"HUMAN": 1 - overall_score, "AI": overall_score}
153
- return overall_score, format_references("<br><br>".join(colored_paragraphs))
154
 
155
 
156
  ai_check_options = [
@@ -201,12 +219,14 @@ class GPT2PPL:
201
  def ai_generated_test_gptzero(text):
202
  gptzero_model = GPT2PPL()
203
  result = gptzero_model(text)
204
- print(result)
205
  return result, None
206
 
207
 
208
  def highlighter_polygraf(text, model="Polygraf AI (Base Model)"):
209
- return process_text(text=text, model=model)
 
 
 
210
 
211
 
212
  def ai_check(text: str, option: str):
@@ -223,7 +243,6 @@ def ai_check(text: str, option: str):
223
  def generate_prompt(settings: Dict[str, str]) -> str:
224
  prompt = f"""
225
  I am a {settings['role']}
226
-
227
  Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.
228
 
229
  Style and Tone:
@@ -239,10 +258,11 @@ def generate_prompt(settings: Dict[str, str]) -> str:
239
  {', '.join(settings['keywords'])}
240
 
241
  Additional requirements:
 
242
  - Include {settings['num_examples']} relevant examples or case studies
243
  - Incorporate data or statistics from {', '.join(settings['references'])}
244
  - End with a {settings['conclusion_type']} conclusion
245
- - Add a "References" section at the end with at least 3 credible sources, formatted as [1], [2], etc.
246
  - Do not make any headline, title bold.
247
  {settings['sources']}
248
 
@@ -255,12 +275,11 @@ def generate_prompt(settings: Dict[str, str]) -> str:
255
  def regenerate_prompt(settings: Dict[str, str]) -> str:
256
  prompt = f"""
257
  I am a {settings['role']}
258
-
259
  "{settings['generated_article']}"
260
-
261
  Edit the given text based on user comments.
262
 
263
  Comments:
 
264
  - {settings['user_comments']}
265
  - The original content should not be changed. Make minor modifications based on user comments above.
266
  - Keep the references the same as the given text in the same format.
@@ -318,30 +337,12 @@ def generate_article(
318
  else:
319
  prompt = generate_prompt(settings)
320
 
321
- print(prompt)
322
- # TODO: Why do we need this ??
323
- if ai_model in ["OpenAI GPT 3.5"]:
324
- response = openai.ChatCompletion.create(
325
- model="gpt-4" if ai_model == "OpenAI GPT 4" else "gpt-3.5-turbo",
326
- messages=[
327
- {
328
- "role": "system",
329
- "content": "You are a professional content writer with expertise in various fields.",
330
- },
331
- {"role": "user", "content": prompt},
332
- ],
333
- max_tokens=3000,
334
- n=1,
335
- stop=None,
336
- temperature=0.7,
337
- )
338
- article = response.choices[0].message.content.strip()
339
- else:
340
- article = generate(
341
- prompt,
342
- ai_model,
343
- pdf_file_input, # api_key
344
- )
345
 
346
  return clean_text(article)
347
 
@@ -354,14 +355,16 @@ def humanize(
354
  top_k: int = 50,
355
  length_penalty: float = 1,
356
  ) -> str:
 
357
  result = paraphrase_text(
358
- text=text,
359
  model_name=model,
360
  temperature=temperature,
361
  repetition_penalty=repetition_penalty,
362
  top_k=top_k,
363
  length_penalty=length_penalty,
364
  )
 
365
  return format_and_correct_language_check(result)
366
 
367
 
@@ -376,18 +379,20 @@ def format_references(text: str) -> str:
376
  lines = text.split("\n")
377
  references = []
378
  article_text = []
 
379
  in_references = False
380
 
381
  for line in lines:
382
- if (
383
- line.strip().lower() == "references"
384
- or line.strip().lower() == "references:"
385
- or line.strip().lower().startswith("references:")
386
- ):
387
  in_references = True
388
  continue
 
 
389
  if in_references:
390
- references.append(line.strip())
 
 
 
391
  else:
392
  article_text.append(line)
393
 
@@ -429,25 +434,26 @@ def generate_and_format(
429
  generated_article: str = None,
430
  user_comments: str = None,
431
  ):
432
- date_from = build_date(year_from, month_from, day_from)
433
- date_to = build_date(year_to, month_to, day_to)
434
- sorted_date = f"date:r:{date_from}:{date_to}"
435
  content_string = ""
436
- final_query = topic
437
- if include_sites:
438
- site_queries = [f"site:{site.strip()}" for site in include_sites.split(",")]
439
- final_query += " " + " OR ".join(site_queries)
440
- if exclude_sites:
441
- exclude_queries = [f"-site:{site.strip()}" for site in exclude_sites.split(",")]
442
- final_query += " " + " ".join(exclude_queries)
443
- print(f"Final Query: {final_query}")
444
-
445
  if google_search_check:
 
 
 
 
 
 
 
 
 
 
 
446
  url_content = google_search(final_query, sorted_date, domains_to_include)
447
  content_string = "\n".join(
448
  f"{url.strip()}: \n{content.strip()[:2000]}" for url, content in url_content.items()
449
  )
450
- content_string = "Use the trusted information here from the URLs I've found for you:\n" + content_string
 
 
451
  article = generate_article(
452
  input_role,
453
  topic,
@@ -469,6 +475,10 @@ def generate_and_format(
469
  generated_article,
470
  user_comments,
471
  )
 
 
 
 
472
  return format_references(article)
473
 
474
 
 
1
  """
2
  nohup python3 app.py &
3
  """
4
+
5
  import openai
6
  import gradio as gr
7
  from typing import Dict, List
 
63
  return "\n".join(cleaned_paragraphs)
64
 
65
 
66
def split_text_from_refs(text: str, sep="\n"):
    """Split *text* at its "References" heading.

    Returns a tuple ``(article_body, references_block)`` where the body is the
    text above the heading joined with blank lines, and the references block
    is a renumbered "[1] ...", "[2] ..." list prefixed with a fresh
    "References:" heading, using *sep* as the line separator.
    """
    bracket_index = re.compile(r"\[(\d+)\]")
    body_lines = []
    raw_refs = []
    past_heading = False

    for raw_line in text.split("\n"):
        normalized = raw_line.strip().lower()
        # A bare "References"/"References:" line marks the section and is dropped;
        # a line merely *starting* with "References:" is still scanned for entries.
        if normalized in ("references", "references:"):
            past_heading = True
            continue
        if normalized.startswith("references:"):
            past_heading = True
        if not past_heading:
            body_lines.append(raw_line)
            continue
        # Split on "[n]" markers and keep only the textual fragments.
        for piece in bracket_index.split(raw_line):
            cleaned = piece.strip()
            if cleaned and not piece.isdigit() and not cleaned.lower().startswith("references:"):
                raw_refs.append(cleaned)

    numbered = [
        f"[{position}] {remove_bracketed_numbers(entry)}{sep}"
        for position, entry in enumerate(raw_refs, 1)
    ]
    return "\n\n".join(body_lines), f"{sep}{sep}References:{sep}" + f"{sep}".join(numbered)
93
+
94
+
95
def ends_with_references(text):
    """Return True if *text* (ignoring trailing whitespace) ends with a
    "References:" heading.

    The previous pattern used ``re.MULTILINE``, which makes ``$`` match at
    every line break, so a "References:" heading anywhere in the middle of
    the text was wrongly reported as terminal. Searching the stripped text
    without MULTILINE anchors the match to the true end. ``[Rr]`` was also
    redundant under IGNORECASE.
    """
    pattern = re.compile(r"\breferences:\s*$", re.IGNORECASE)
    return bool(pattern.search(text.strip()))
101
 
102
 
103
  def format_and_correct_language_check(text: str) -> str:
 
127
  return predict(models[model], tokenizers[model], text)
128
 
129
 
130
+ def detection_polygraf(text, model="BC Original"):
131
  # sentences = split_into_sentences(text)
132
  sentences = nltk.sent_tokenize(text)
133
  num_sentences = len(sentences)
 
164
  overall_scores.append(avg_score)
165
  i = i + 1
166
  combined_sentences = " ".join(colored_sentences)
 
167
  colored_paragraphs.append(combined_sentences)
168
 
169
  overall_score = sum(overall_scores) / len(overall_scores)
170
  overall_score = {"HUMAN": 1 - overall_score, "AI": overall_score}
171
+ return overall_score, "<br><br>".join(colored_paragraphs)
172
 
173
 
174
  ai_check_options = [
 
219
  def ai_generated_test_gptzero(text):
220
  gptzero_model = GPT2PPL()
221
  result = gptzero_model(text)
 
222
  return result, None
223
 
224
 
225
def highlighter_polygraf(text, model="Polygraf AI (Base Model)"):
    """Run AI-text detection on the article body and re-attach its references.

    The references section is split off first (HTML "<br>" separators) so the
    detector only scores the article itself, then appended back onto the
    highlighted output.
    """
    article_body, refs_block = split_text_from_refs(text, "<br>")
    ai_score, highlighted = detection_polygraf(text=article_body, model=model)
    return ai_score, highlighted + "<br>" + refs_block
230
 
231
 
232
  def ai_check(text: str, option: str):
 
243
  def generate_prompt(settings: Dict[str, str]) -> str:
244
  prompt = f"""
245
  I am a {settings['role']}
 
246
  Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.
247
 
248
  Style and Tone:
 
258
  {', '.join(settings['keywords'])}
259
 
260
  Additional requirements:
261
+ - Don't start with "Here is a...", start with the requested text directly
262
  - Include {settings['num_examples']} relevant examples or case studies
263
  - Incorporate data or statistics from {', '.join(settings['references'])}
264
  - End with a {settings['conclusion_type']} conclusion
265
+ - Add a "References" section in the format "References:\n" at the end with at least 3 credible sources, formatted as [1], [2], etc. with each source on their own line
266
  - Do not make any headline, title bold.
267
  {settings['sources']}
268
 
 
275
  def regenerate_prompt(settings: Dict[str, str]) -> str:
276
  prompt = f"""
277
  I am a {settings['role']}
 
278
  "{settings['generated_article']}"
 
279
  Edit the given text based on user comments.
280
 
281
  Comments:
282
+ - Don't start with "Here is a...", start with the requested text directly
283
  - {settings['user_comments']}
284
  - The original content should not be changed. Make minor modifications based on user comments above.
285
  - Keep the references the same as the given text in the same format.
 
337
  else:
338
  prompt = generate_prompt(settings)
339
 
340
+ print("Generated Prompt...\n", prompt)
341
+ article = generate(
342
+ prompt,
343
+ ai_model,
344
+ pdf_file_input, # api_key
345
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346
 
347
  return clean_text(article)
348
 
 
355
  top_k: int = 50,
356
  length_penalty: float = 1,
357
  ) -> str:
358
+ body, references = split_text_from_refs(text)
359
  result = paraphrase_text(
360
+ text=body,
361
  model_name=model,
362
  temperature=temperature,
363
  repetition_penalty=repetition_penalty,
364
  top_k=top_k,
365
  length_penalty=length_penalty,
366
  )
367
+ result = result + "\n\n" + references
368
  return format_and_correct_language_check(result)
369
 
370
 
 
379
  lines = text.split("\n")
380
  references = []
381
  article_text = []
382
+ index_pattern = re.compile(r"\[(\d+)\]")
383
  in_references = False
384
 
385
  for line in lines:
386
+ if line.strip().lower() == "references" or line.strip().lower() == "references:":
 
 
 
 
387
  in_references = True
388
  continue
389
+ if line.strip().lower().startswith("references:"):
390
+ in_references = True
391
  if in_references:
392
+ matches = index_pattern.split(line)
393
+ for match in matches:
394
+ if match.strip() and not match.isdigit() and not match.strip().lower().startswith("references:"):
395
+ references.append(match.strip())
396
  else:
397
  article_text.append(line)
398
 
 
434
  generated_article: str = None,
435
  user_comments: str = None,
436
  ):
 
 
 
437
  content_string = ""
 
 
 
 
 
 
 
 
 
438
  if google_search_check:
439
+ date_from = build_date(year_from, month_from, day_from)
440
+ date_to = build_date(year_to, month_to, day_to)
441
+ sorted_date = f"date:r:{date_from}:{date_to}"
442
+ final_query = topic
443
+ if include_sites:
444
+ site_queries = [f"site:{site.strip()}" for site in include_sites.split(",")]
445
+ final_query += " " + " OR ".join(site_queries)
446
+ if exclude_sites:
447
+ exclude_queries = [f"-site:{site.strip()}" for site in exclude_sites.split(",")]
448
+ final_query += " " + " ".join(exclude_queries)
449
+ print(f"Google Search Query: {final_query}")
450
  url_content = google_search(final_query, sorted_date, domains_to_include)
451
  content_string = "\n".join(
452
  f"{url.strip()}: \n{content.strip()[:2000]}" for url, content in url_content.items()
453
  )
454
+ content_string = (
455
+ "Use the trusted information here from the URLs and add them as References:\n" + content_string
456
+ )
457
  article = generate_article(
458
  input_role,
459
  topic,
 
475
  generated_article,
476
  user_comments,
477
  )
478
+ if ends_with_references(article) and url_content is not None:
479
+ for url in url_content.keys():
480
+ article += f"\n{url}"
481
+
482
  return format_references(article)
483
 
484
 
plagiarism.py CHANGED
@@ -15,7 +15,8 @@ def clean_html(text):
15
  result += article.title + "\n"
16
  paragraphs = justext.justext(text, justext.get_stoplist("English"))
17
  for paragraph in paragraphs:
18
- result += paragraph.text
 
19
  return result
20
 
21
 
@@ -128,6 +129,7 @@ def google_search(
128
  break
129
  if soup:
130
  text = clean_html(soup.text)
131
- result_content[url] = text
132
- count += 1
 
133
  return result_content
 
15
  result += article.title + "\n"
16
  paragraphs = justext.justext(text, justext.get_stoplist("English"))
17
  for paragraph in paragraphs:
18
+ if not paragraph.is_boilerplate:
19
+ result += paragraph.text
20
  return result
21
 
22
 
 
129
  break
130
  if soup:
131
  text = clean_html(soup.text)
132
+ if len(text) > 500:
133
+ result_content[url] = text
134
+ count += 1
135
  return result_content
requirements.txt CHANGED
@@ -22,4 +22,7 @@ chromadb
22
  language-tool-python
23
  anthropic
24
  google-generativeai
 
 
 
25
  vertexai
 
22
  language-tool-python
23
  anthropic
24
  google-generativeai
25
+ langchain-google-genai
26
+ langchain-anthropic
27
+ langchain-openai
28
  vertexai