minko186 committed on
Commit 5c0617b · 2 Parent(s): 3d3f6ce c85110b

Merge branch 'minko'

Files changed (3)
  1. app.py +80 -2
  2. plagiarism.py +127 -0
  3. requirements.txt +5 -1
app.py CHANGED
@@ -14,6 +14,8 @@ from scipy.special import softmax
 from collections import defaultdict
 import nltk
 from utils import remove_special_characters
+from plagiarism import google_search, months, domain_list, build_date
+from datetime import date
 
 # Check if CUDA is available
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -216,6 +218,10 @@ def ai_check(text: str, option: str):
 
 
 def generate_prompt(settings: Dict[str, str]) -> str:
+    content_string = "\n".join(
+        f"{url.strip()}: \n{content.strip()[:2000]}" for url, content in settings["sources"].items()
+    )
+
     prompt = f"""
 Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.
 
@@ -237,6 +243,9 @@ def generate_prompt(settings: Dict[str, str]) -> str:
 - End with a {settings['conclusion_type']} conclusion
 - Add a "References" section at the end with at least 3 credible sources, formatted as [1], [2], etc.
 - Do not make any headline, title bold.
+
+Use the content here from the URLs I've found for you:
+{content_string}
 
 Ensure proper paragraph breaks for better readability.
 Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
@@ -245,6 +254,10 @@ def generate_prompt(settings: Dict[str, str]) -> str:
 
 
 def regenerate_prompt(settings: Dict[str, str]) -> str:
+    content_string = "\n".join(
+        f"{url.strip()}: \n{content.strip()[:2000]}" for url, content in settings["sources"].items()
+    )
+
     prompt = f"""
 "{settings['generated_article']}"
 
@@ -255,6 +268,8 @@ def regenerate_prompt(settings: Dict[str, str]) -> str:
 - The original content should not be changed. Make minor modifications based on user comments above.
 - Keep the references the same as the given text in the same format.
 - Do not make any headline, title bold.
+Use the content here from the URLs I've found for you:
+{content_string}
 
 Ensure proper paragraph breaks for better readability.
 Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
@@ -276,10 +291,14 @@ def generate_article(
     num_examples: str,
     conclusion_type: str,
     ai_model: str,
+    sorted_date,
+    domains_to_skip,
     api_key: str = None,
     generated_article: str = None,
     user_comments: str = None,
 ) -> str:
+
+    url_content = google_search(topic, sorted_date, domains_to_skip)
     settings = {
         "topic": topic,
         "keywords": [k.strip() for k in keywords.split(",")],
@@ -293,6 +312,7 @@ def generate_article(
         "references": [r.strip() for r in references.split(",")],
         "num_examples": num_examples,
         "conclusion_type": conclusion_type,
+        "sources": url_content,
         "generated_article": generated_article,
         "user_comments": user_comments,
     }
@@ -389,9 +409,19 @@ def generate_and_format(
     conclusion_type,
     ai_model,
     api_key,
+    year_from,
+    month_from,
+    day_from,
+    year_to,
+    month_to,
+    day_to,
+    domains_to_skip,
     generated_article: str = None,
     user_comments: str = None,
 ):
+    date_from = build_date(year_from, month_from, day_from)
+    date_to = build_date(year_to, month_to, day_to)
+    sorted_date = f"date:r:{date_from}:{date_to}"
     article = generate_article(
         topic,
         keywords,
@@ -407,6 +437,8 @@ def generate_and_format(
         conclusion_type,
         ai_model,
         api_key,
+        sorted_date,
+        domains_to_skip,
         generated_article,
         user_comments,
     )
@@ -422,6 +454,10 @@ def create_interface():
     .input-highlight-pink block_label {background-color: #008080}
     """,
     ) as demo:
+        today = date.today()
+        # dd/mm/YY
+        d1 = today.strftime("%d/%B/%Y")
+        d1 = d1.split("/")
         gr.Markdown("# Polygraf AI Content Writer", elem_classes="text-center text-3xl mb-6")
 
         with gr.Row():
@@ -459,7 +495,7 @@ def create_interface():
                 minimum=50,
                 maximum=5000,
                 step=50,
-                value=1000,
+                value=300,
                 label="Article Length",
                 elem_classes="input-highlight-pink",
             )
@@ -542,10 +578,38 @@ def create_interface():
             )
             input_conclusion = gr.Dropdown(
                 choices=["Summary", "Call to Action", "Future Outlook", "Thought-provoking Question"],
-                value="Summary",
+                value="Call to Action",
                 label="Conclusion Type",
                 elem_classes="input-highlight-turquoise",
             )
+        gr.Markdown("# Search Options", elem_classes="text-center text-3xl mb-6")
+        with gr.Group():
+            with gr.Row():
+                month_from = gr.Dropdown(
+                    choices=months,
+                    label="From Month",
+                    value="January",
+                    interactive=True,
+                )
+                day_from = gr.Textbox(label="From Day", value="01")
+                year_from = gr.Textbox(label="From Year", value="2000")
+
+            with gr.Row():
+                month_to = gr.Dropdown(
+                    choices=months,
+                    label="To Month",
+                    value=d1[1],
+                    interactive=True,
+                )
+                day_to = gr.Textbox(label="To Day", value=d1[0])
+                year_to = gr.Textbox(label="To Year", value=d1[2])
+
+            with gr.Row():
+                domains_to_skip = gr.Dropdown(
+                    domain_list,
+                    multiselect=True,
+                    label="Domain To Skip",
+                )
 
         with gr.Group():
             gr.Markdown("## AI Model Configuration", elem_classes="text-xl mb-4")
@@ -640,6 +704,13 @@ def create_interface():
                 input_conclusion,
                 ai_generator,
                 input_api,
+                year_from,
+                month_from,
+                day_from,
+                year_to,
+                month_to,
+                day_to,
+                domains_to_skip,
             ],
             outputs=[output_article],
         )
@@ -661,6 +732,13 @@ def create_interface():
                 input_conclusion,
                 ai_generator,
                 input_api,
+                year_from,
+                month_from,
+                day_from,
+                year_to,
+                month_to,
+                day_to,
+                domains_to_skip,
                 output_article,
                 ai_comments,
             ],
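
Note on the new date filter: generate_and_format() now composes a Google Custom Search date-restrict expression of the form date:r:YYYYMMDD:YYYYMMDD from the new From/To inputs via build_date(). A minimal sketch of how the pieces fit together (months mapping abridged; the dates are illustrative, not from the commit):

    # Sketch only, not part of the commit.
    months = {"January": "01", "December": "12"}  # abridged from plagiarism.months

    def build_date(year=2024, month="March", day=1):
        # As committed, day is interpolated as-is: an integer day of 1 would yield
        # a 7-digit stamp. The UI sidesteps this by passing zero-padded strings.
        return f"{year}{months[month]}{day}"

    date_from = build_date("2000", "January", "01")
    date_to = build_date("2024", "December", "25")
    sorted_date = f"date:r:{date_from}:{date_to}"  # "date:r:20000101:20241225"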
plagiarism.py ADDED
@@ -0,0 +1,127 @@
+import time
+from googleapiclient.discovery import build
+import asyncio
+import httpx
+from bs4 import BeautifulSoup
+import justext
+import newspaper
+
+
+def clean_html(text):
+    result = ""
+    article = newspaper.Article(url=" ")
+    article.set_html(text)
+    article.parse()
+    result += article.title + "\n"
+    paragraphs = justext.justext(text, justext.get_stoplist("English"))
+    for paragraph in paragraphs:
+        if not paragraph.is_boilerplate:
+            result += paragraph.text
+    return result
+
+
+months = {
+    "January": "01",
+    "February": "02",
+    "March": "03",
+    "April": "04",
+    "May": "05",
+    "June": "06",
+    "July": "07",
+    "August": "08",
+    "September": "09",
+    "October": "10",
+    "November": "11",
+    "December": "12",
+}
+
+domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]
+
+
+def build_date(year=2024, month="March", day=1):
+    return f"{year}{months[month]}{day}"
+
+
+async def get_url_data(url, client):
+    try:
+        r = await client.get(url)
+        if r.status_code == 200:
+            soup = BeautifulSoup(r.content, "html.parser")
+            return soup
+    except Exception:
+        return None
+
+
+async def parallel_scrap(urls):
+    async with httpx.AsyncClient(timeout=30) as client:
+        tasks = []
+        for url in urls:
+            tasks.append(get_url_data(url=url, client=client))
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+        return results
+
+
+def google_search_urls(
+    text,
+    sorted_date,
+    domains_to_skip,
+    api_key,
+    cse_id,
+    **kwargs,
+):
+    service = build("customsearch", "v1", developerKey=api_key)
+    results = service.cse().list(q=text, cx=cse_id, sort=sorted_date, **kwargs).execute()
+    url_list = []
+    if "items" in results and len(results["items"]) > 0:
+        for count, link in enumerate(results["items"]):
+            # skip user selected domains
+            if (domains_to_skip is not None) and any(("." + domain) in link["link"] for domain in domains_to_skip):
+                continue
+            url = link["link"]
+            if url not in url_list:
+                url_list.append(url)
+    return url_list
+
+
+def google_search(
+    input,
+    sorted_date,
+    domains_to_skip,
+):
+    # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
+    api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
+    # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
+    # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
+    # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
+    # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
+    # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
+    cse_id = "851813e81162b4ed4"
+
+    # get list of URLS to check
+    start_time = time.perf_counter()
+    url_list = google_search_urls(
+        input,
+        sorted_date,
+        domains_to_skip,
+        api_key,
+        cse_id,
+    )
+    print("GOOGLE SEARCH PROCESSING TIME: ", time.perf_counter() - start_time)
+    # Scrape URLs in list
+    start_time = time.perf_counter()
+    soups = asyncio.run(parallel_scrap(url_list))
+    print("SCRAPING PROCESSING TIME: ", time.perf_counter() - start_time)
+    result_content = {}
+    num_pages = 3
+    count = 0
+    for url, soup in zip(url_list, soups):
+        if count >= num_pages:
+            break
+        if soup:
+            text = clean_html(soup.text)
+            result_content[url] = text
+            count += 1
+    # for key, value in result_content.items():
+    #     print("-------------------URL: ", key)
+    #     print(value[:30])
+    return result_content
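
Note on usage: google_search() is the only entry point app.py consumes, and the API key and cse_id are hard-coded inside it, so callers pass only the query, the date window, and the skip-list. A hedged driver sketch (query, dates, and skip-list are made-up values; network access and the newly added dependencies are assumed):

    # Sketch only, not part of the commit.
    from plagiarism import google_search

    sources = google_search(
        "renewable energy storage",   # query text (app.py passes the article topic)
        "date:r:20000101:20241225",   # CSE date-restrict sort expression
        ["gov", "mil"],               # TLD fragments to skip, cf. domain_list
    )
    for url, text in sources.items():  # at most num_pages (3) scraped pages
        print(url, "->", text[:80])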
requirements.txt CHANGED
@@ -8,4 +8,8 @@ openai
 groq
 language_tool_python
 scipy
-Unidecode
+Unidecode
+BeautifulSoup4
+google-api-python-client
+newspaper3k
+jusText