minko186 committed on
Commit
38c42ed
·
2 Parent(s): 134b51f 2d6909b

Merge branch 'main' into staging

Browse files
Files changed (6) hide show
  1. .gitignore +3 -1
  2. ai_generate.py +65 -27
  3. app.py +70 -37
  4. nohup.out +0 -0
  5. plagiarism.py +1 -0
  6. requirements.txt +3 -1
.gitignore CHANGED
@@ -1,3 +1,5 @@
1
  __pycache__/
2
  .env
3
- nohup.out
 
 
 
1
  __pycache__/
2
  .env
3
+ nohup.out
4
+ *.out
5
+ *.log
ai_generate.py CHANGED
@@ -3,6 +3,12 @@ from openai import OpenAI
3
  import os
4
  from transformers import pipeline
5
  from groq import Groq
 
 
 
 
 
 
6
  from langchain_community.document_loaders import PyMuPDFLoader
7
  from langchain_community.document_loaders import TextLoader
8
  from langchain_community.embeddings.sentence_transformer import (
@@ -19,9 +25,20 @@ from dotenv import load_dotenv
19
 
20
  load_dotenv()
21
 
 
 
 
22
  groq_client = Groq(
23
  api_key=os.environ.get("GROQ_API_KEY"),
24
  )
 
 
 
 
 
 
 
 
25
 
26
 
27
  def create_db_with_langchain(path):
@@ -67,7 +84,6 @@ def generate_groq_base(text, model):
67
  ],
68
  temperature=1,
69
  max_tokens=1024,
70
- top_p=1,
71
  stream=True,
72
  stop=None,
73
  )
@@ -88,35 +104,57 @@ def generate_groq(text, model, path):
88
  def generate_openai(text, model, openai_client):
89
  message = [{"role": "user", "content": text}]
90
  response = openai_client.chat.completions.create(
91
- model=model, messages=message, temperature=0.2, max_tokens=800, frequency_penalty=0.0
 
 
 
92
  )
93
  return response.choices[0].message.content
94
 
95
 
96
- def generate(text, model, path, api):
97
- if model == "Llama 3":
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  return generate_groq(text, "llama3-70b-8192", path)
99
- elif model == "Groq":
100
- return generate_groq(text, "llama3-groq-70b-8192-tool-use-preview", path)
101
- elif model == "Mistral":
102
- return generate_groq(text, "mixtral-8x7b-32768", path)
103
- elif model == "Gemma":
104
- return generate_groq(text, "gemma2-9b-it", path)
105
- elif model == "OpenAI GPT 3.5":
106
- try:
107
- openai_client = OpenAI(api_key=api)
108
- return generate_openai(text, "gpt-3.5-turbo", openai_client)
109
- except:
110
- return "Please add a valid API key"
111
- elif model == "OpenAI GPT 4":
112
- try:
113
- openai_client = OpenAI(api_key=api)
114
- return generate_openai(text, "gpt-4-turbo", openai_client)
115
- except:
116
- return "Please add a valid API key"
117
  elif model == "OpenAI GPT 4o":
118
- try:
119
- openai_client = OpenAI(api_key=api)
120
- return generate_openai(text, "gpt-4o", openai_client)
121
- except:
122
- return "Please add a valid API key"
 
 
 
3
  import os
4
  from transformers import pipeline
5
  from groq import Groq
6
+ import base64
7
+ import vertexai
8
+ from vertexai.generative_models import GenerativeModel, Part, FinishReason
9
+ import vertexai.preview.generative_models as generative_models
10
+ import google.generativeai as genai
11
+ import anthropic
12
  from langchain_community.document_loaders import PyMuPDFLoader
13
  from langchain_community.document_loaders import TextLoader
14
  from langchain_community.embeddings.sentence_transformer import (
 
25
 
26
  load_dotenv()
27
 
28
+ os.environ["GRPC_VERBOSITY"] = "ERROR"
29
+ os.environ["GLOG_minloglevel"] = "2"
30
+
31
  groq_client = Groq(
32
  api_key=os.environ.get("GROQ_API_KEY"),
33
  )
34
+ openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
35
+ # give access to all APIs for GCP instance
36
+ # gcloud auth application-default login
37
+ genai.configure(api_key=os.environ.get("GENAI_API_KEY"))
38
+ vertexai.init(project="proprietary-info-detection", location="us-central1")
39
+ gemini_client = GenerativeModel("gemini-1.5-pro-001")
40
+ claude_client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
41
+
42
 
43
 
44
  def create_db_with_langchain(path):
 
84
  ],
85
  temperature=1,
86
  max_tokens=1024,
 
87
  stream=True,
88
  stop=None,
89
  )
 
104
  def generate_openai(text, model, openai_client):
105
  message = [{"role": "user", "content": text}]
106
  response = openai_client.chat.completions.create(
107
+ model=model,
108
+ messages=message,
109
+ temperature=1,
110
+ max_tokens=1024,
111
  )
112
  return response.choices[0].message.content
113
 
114
 
115
+ def generate_gemini(text, model, gemini_client):
116
+ safety_settings = {
117
+ generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
118
+ generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
119
+ generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
120
+ generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
121
+ }
122
+ generation_config = {
123
+ "max_output_tokens": 1024,
124
+ "temperature": 1.0,
125
+ "top_p": 1.0,
126
+ }
127
+ response = gemini_client.generate_content(
128
+ [text],
129
+ generation_config=generation_config,
130
+ safety_settings=safety_settings,
131
+ stream=False,
132
+ )
133
+ return response.text
134
+
135
+
136
+ def generate_claude(text, model, claude_client):
137
+ response = claude_client.messages.create(
138
+ model=model,
139
+ max_tokens=1024,
140
+ temperature=1.0,
141
+ system="You are helpful assistant.",
142
+ messages=[{"role": "user", "content": [{"type": "text", "text": text}]}],
143
+ )
144
+ return response.content[0].text.strip()
145
+
146
+
147
+ def generate(text, model, path, api=None):
148
+
149
+ if model == "LLaMA 3":
150
  return generate_groq(text, "llama3-70b-8192", path)
151
+ elif model == "OpenAI GPT 4o Mini":
152
+ return generate_openai(text, "gpt-4o-mini", openai_client)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  elif model == "OpenAI GPT 4o":
154
+ return generate_openai(text, "gpt-4o", openai_client)
155
+ elif model == "OpenAI GPT 4":
156
+ return generate_openai(text, "gpt-4-turbo", openai_client)
157
+ elif model == "Gemini 1.5 Pro":
158
+ return generate_gemini(text, "", gemini_client)
159
+ elif model == "Claude Sonnet 3.5":
160
+ return generate_claude(text, "claude-3-5-sonnet-20240620", claude_client)
app.py CHANGED
@@ -1,9 +1,11 @@
 
 
 
 
1
  import openai
2
  import gradio as gr
3
  from typing import Dict, List
4
  import re
5
- from humanize import paraphrase_text
6
- from ai_generate import generate
7
  import requests
8
  import language_tool_python
9
  import torch
@@ -13,12 +15,13 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipe
13
  from scipy.special import softmax
14
  from collections import defaultdict
15
  import nltk
 
 
16
  from utils import remove_special_characters
17
  from plagiarism import google_search, months, domain_list, build_date
18
- from datetime import date
 
19
 
20
- # Check if CUDA is available
21
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
22
  print(f"Using device: {device}")
23
 
24
  models = {
@@ -65,7 +68,7 @@ def format_and_correct(text: str) -> str:
65
  Please correct the formatting, grammar, and spelling errors in the following text without changing its content significantly. Ensure proper paragraph breaks and maintain the original content:
66
  {text}
67
  """
68
- corrected_text = generate(prompt, "Groq", None)
69
  return clean_text(corrected_text)
70
 
71
 
@@ -287,7 +290,7 @@ def generate_article(
287
  conclusion_type: str,
288
  ai_model: str,
289
  content_string: str,
290
- api_key: str = None,
291
  pdf_file_input=None,
292
  generated_article: str = None,
293
  user_comments: str = None,
@@ -317,7 +320,8 @@ def generate_article(
317
  prompt = generate_prompt(settings)
318
 
319
  print(prompt)
320
- if ai_model in ["OpenAI GPT 3.5", "OpenAI GPT 4"]:
 
321
  response = openai.ChatCompletion.create(
322
  model="gpt-4" if ai_model == "OpenAI GPT 4" else "gpt-3.5-turbo",
323
  messages=[
@@ -334,7 +338,11 @@ def generate_article(
334
  )
335
  article = response.choices[0].message.content.strip()
336
  else:
337
- article = generate(prompt, ai_model, pdf_file_input, api_key)
 
 
 
 
338
 
339
  return clean_text(article)
340
 
@@ -407,7 +415,7 @@ def generate_and_format(
407
  num_examples,
408
  conclusion_type,
409
  ai_model,
410
- api_key,
411
  google_search_check,
412
  year_from,
413
  month_from,
@@ -416,7 +424,8 @@ def generate_and_format(
416
  month_to,
417
  day_to,
418
  domains_to_include,
419
- search_keywords,
 
420
  pdf_file_input,
421
  generated_article: str = None,
422
  user_comments: str = None,
@@ -426,10 +435,14 @@ def generate_and_format(
426
  sorted_date = f"date:r:{date_from}:{date_to}"
427
  content_string = ""
428
  final_query = topic
429
- if search_keywords != "":
430
- quoted_keywords = [f'"{keyword.strip()}"' for keyword in search_keywords.split(",")]
431
- final_query = final_query + " " + " ".join(quoted_keywords)
432
- print(final_query)
 
 
 
 
433
  if google_search_check:
434
  url_content = google_search(final_query, sorted_date, domains_to_include)
435
  content_string = "\n".join(
@@ -452,7 +465,7 @@ def generate_and_format(
452
  conclusion_type,
453
  ai_model,
454
  content_string,
455
- api_key,
456
  pdf_file_input,
457
  generated_article,
458
  user_comments,
@@ -501,6 +514,11 @@ def create_interface():
501
  "Research paper",
502
  "News article",
503
  "White paper",
 
 
 
 
 
504
  ],
505
  value="Article",
506
  label="Format",
@@ -602,6 +620,25 @@ def create_interface():
602
  with gr.Row():
603
  google_search_check = gr.Checkbox(label="Enable Google Search For Recent Sources", value=True)
604
  with gr.Group(visible=True) as search_options:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
605
  with gr.Row():
606
  month_from = gr.Dropdown(
607
  choices=months,
@@ -622,32 +659,26 @@ def create_interface():
622
  day_to = gr.Textbox(label="To Day", value=d1[0])
623
  year_to = gr.Textbox(label="To Year", value=d1[2])
624
 
625
- with gr.Row():
626
- domains_to_include = gr.Dropdown(
627
- domain_list,
628
- value=domain_list,
629
- multiselect=True,
630
- label="Domains To Include",
631
- )
632
- with gr.Row():
633
- search_keywords = gr.Textbox(
634
- label="Keywords",
635
- placeholder="Enter comma-separated keywords",
636
- elem_classes="input-highlight-yellow",
637
- )
638
  gr.Markdown("# Add Optional PDF File with Information", elem_classes="text-center text-3xl mb-6")
639
  pdf_file_input = gr.File(label="Upload PDF")
640
 
641
  with gr.Group():
642
  gr.Markdown("## AI Model Configuration", elem_classes="text-xl mb-4")
643
  ai_generator = gr.Dropdown(
644
- choices=["Llama 3", "Groq", "Mistral", "Gemma", "OpenAI GPT 3.5", "OpenAI GPT 4"],
645
- value="Llama 3",
 
 
 
 
 
 
 
646
  label="AI Model",
647
  elem_classes="input-highlight-pink",
648
  )
649
- input_api = gr.Textbox(label="API Key", visible=False)
650
- ai_generator.change(update_visibility_api, ai_generator, input_api)
651
 
652
  generate_btn = gr.Button("Generate Article", variant="primary")
653
 
@@ -738,7 +769,7 @@ def create_interface():
738
  input_num_examples,
739
  input_conclusion,
740
  ai_generator,
741
- input_api,
742
  google_search_check,
743
  year_from,
744
  month_from,
@@ -747,7 +778,8 @@ def create_interface():
747
  month_to,
748
  day_to,
749
  domains_to_include,
750
- search_keywords,
 
751
  pdf_file_input,
752
  ],
753
  outputs=[output_article],
@@ -770,7 +802,7 @@ def create_interface():
770
  input_num_examples,
771
  input_conclusion,
772
  ai_generator,
773
- input_api,
774
  google_search_check,
775
  year_from,
776
  month_from,
@@ -781,7 +813,8 @@ def create_interface():
781
  domains_to_include,
782
  pdf_file_input,
783
  output_article,
784
- search_keywords,
 
785
  ai_comments,
786
  ],
787
  outputs=[output_article],
 
1
+ """
2
+ nohup python3 app.py &
3
+ """
4
+
5
  import openai
6
  import gradio as gr
7
  from typing import Dict, List
8
  import re
 
 
9
  import requests
10
  import language_tool_python
11
  import torch
 
15
  from scipy.special import softmax
16
  from collections import defaultdict
17
  import nltk
18
+ from datetime import date
19
+
20
  from utils import remove_special_characters
21
  from plagiarism import google_search, months, domain_list, build_date
22
+ from humanize import paraphrase_text, device
23
+ from ai_generate import generate
24
 
 
 
25
  print(f"Using device: {device}")
26
 
27
  models = {
 
68
  Please correct the formatting, grammar, and spelling errors in the following text without changing its content significantly. Ensure proper paragraph breaks and maintain the original content:
69
  {text}
70
  """
71
+ corrected_text = generate(prompt, "Llama 3", None)
72
  return clean_text(corrected_text)
73
 
74
 
 
290
  conclusion_type: str,
291
  ai_model: str,
292
  content_string: str,
293
+ # api_key: str = None,
294
  pdf_file_input=None,
295
  generated_article: str = None,
296
  user_comments: str = None,
 
320
  prompt = generate_prompt(settings)
321
 
322
  print(prompt)
323
+ # TODO: Why do we need this ??
324
+ if ai_model in ["OpenAI GPT 3.5"]:
325
  response = openai.ChatCompletion.create(
326
  model="gpt-4" if ai_model == "OpenAI GPT 4" else "gpt-3.5-turbo",
327
  messages=[
 
338
  )
339
  article = response.choices[0].message.content.strip()
340
  else:
341
+ article = generate(
342
+ prompt,
343
+ ai_model,
344
+ pdf_file_input, # api_key
345
+ )
346
 
347
  return clean_text(article)
348
 
 
415
  num_examples,
416
  conclusion_type,
417
  ai_model,
418
+ # api_key,
419
  google_search_check,
420
  year_from,
421
  month_from,
 
424
  month_to,
425
  day_to,
426
  domains_to_include,
427
+ include_sites,
428
+ exclude_sites,
429
  pdf_file_input,
430
  generated_article: str = None,
431
  user_comments: str = None,
 
435
  sorted_date = f"date:r:{date_from}:{date_to}"
436
  content_string = ""
437
  final_query = topic
438
+ if include_sites:
439
+ site_queries = [f"site:{site.strip()}" for site in include_sites.split(",")]
440
+ final_query += " " + " OR ".join(site_queries)
441
+ if exclude_sites:
442
+ exclude_queries = [f"-site:{site.strip()}" for site in exclude_sites.split(",")]
443
+ final_query += " " + " ".join(exclude_queries)
444
+ print(f"Final Query: {final_query}")
445
+
446
  if google_search_check:
447
  url_content = google_search(final_query, sorted_date, domains_to_include)
448
  content_string = "\n".join(
 
465
  conclusion_type,
466
  ai_model,
467
  content_string,
468
+ # api_key,
469
  pdf_file_input,
470
  generated_article,
471
  user_comments,
 
514
  "Research paper",
515
  "News article",
516
  "White paper",
517
+ "LinkedIn post",
518
+ "X (Twitter) post",
519
+ "Instagram Video Content",
520
+ "TikTok Video Content",
521
+ "Facebook post",
522
  ],
523
  value="Article",
524
  label="Format",
 
620
  with gr.Row():
621
  google_search_check = gr.Checkbox(label="Enable Google Search For Recent Sources", value=True)
622
  with gr.Group(visible=True) as search_options:
623
+ with gr.Row():
624
+ include_sites = gr.Textbox(
625
+ label="Include Specific Websites",
626
+ placeholder="Enter comma-separated keywords",
627
+ elem_classes="input-highlight-yellow",
628
+ )
629
+ with gr.Row():
630
+ exclude_sites = gr.Textbox(
631
+ label="Exclude Specific Websites",
632
+ placeholder="Enter comma-separated keywords",
633
+ elem_classes="input-highlight-yellow",
634
+ )
635
+ with gr.Row():
636
+ domains_to_include = gr.Dropdown(
637
+ domain_list,
638
+ value=domain_list,
639
+ multiselect=True,
640
+ label="Domains To Include",
641
+ )
642
  with gr.Row():
643
  month_from = gr.Dropdown(
644
  choices=months,
 
659
  day_to = gr.Textbox(label="To Day", value=d1[0])
660
  year_to = gr.Textbox(label="To Year", value=d1[2])
661
 
 
 
 
 
 
 
 
 
 
 
 
 
 
662
  gr.Markdown("# Add Optional PDF File with Information", elem_classes="text-center text-3xl mb-6")
663
  pdf_file_input = gr.File(label="Upload PDF")
664
 
665
  with gr.Group():
666
  gr.Markdown("## AI Model Configuration", elem_classes="text-xl mb-4")
667
  ai_generator = gr.Dropdown(
668
+ choices=[
669
+ "OpenAI GPT 4",
670
+ "OpenAI GPT 4o",
671
+ "OpenAI GPT 4o Mini",
672
+ "Claude Sonnet 3.5",
673
+ "Gemini 1.5 Pro",
674
+ "LLaMA 3",
675
+ ],
676
+ value="OpenAI GPT 4o Mini",
677
  label="AI Model",
678
  elem_classes="input-highlight-pink",
679
  )
680
+ # input_api = gr.Textbox(label="API Key", visible=False)
681
+ # ai_generator.change(update_visibility_api, ai_generator, input_api)
682
 
683
  generate_btn = gr.Button("Generate Article", variant="primary")
684
 
 
769
  input_num_examples,
770
  input_conclusion,
771
  ai_generator,
772
+ # input_api,
773
  google_search_check,
774
  year_from,
775
  month_from,
 
778
  month_to,
779
  day_to,
780
  domains_to_include,
781
+ include_sites,
782
+ exclude_sites,
783
  pdf_file_input,
784
  ],
785
  outputs=[output_article],
 
802
  input_num_examples,
803
  input_conclusion,
804
  ai_generator,
805
+ # input_api,
806
  google_search_check,
807
  year_from,
808
  month_from,
 
813
  domains_to_include,
814
  pdf_file_input,
815
  output_article,
816
+ include_sites,
817
+ exclude_sites,
818
  ai_comments,
819
  ],
820
  outputs=[output_article],
nohup.out DELETED
The diff for this file is too large to render. See raw diff
 
plagiarism.py CHANGED
@@ -114,6 +114,7 @@ def google_search(
114
  api_key,
115
  cse_id,
116
  )
 
117
  print("GOOGLE SEARCH PROCESSING TIME: ", time.perf_counter() - start_time)
118
  # Scrape URLs in list
119
  start_time = time.perf_counter()
 
114
  api_key,
115
  cse_id,
116
  )
117
+ print("URLS: ", url_list)
118
  print("GOOGLE SEARCH PROCESSING TIME: ", time.perf_counter() - start_time)
119
  # Scrape URLs in list
120
  start_time = time.perf_counter()
requirements.txt CHANGED
@@ -19,4 +19,6 @@ sentence-transformers
19
  langchain-community
20
  pymupdf
21
  chromadb
22
- language-tool-python
 
 
 
19
  langchain-community
20
  pymupdf
21
  chromadb
22
+ language-tool-python
23
+ anthropic
24
+ google-generativeai