Spaces:
Runtime error
Runtime error
Commit
·
34b1950
1
Parent(s):
a97d561
enabled RAG for all LLMs + prompt improvements
Browse files- ai_generate.py +52 -26
- app.py +78 -68
- plagiarism.py +5 -3
- requirements.txt +3 -0
ai_generate.py
CHANGED
@@ -21,6 +21,9 @@ from langchain_core.output_parsers import StrOutputParser
|
|
21 |
from langchain_core.runnables import RunnablePassthrough
|
22 |
from langchain.chains import RetrievalQA
|
23 |
from langchain_groq import ChatGroq
|
|
|
|
|
|
|
24 |
from dotenv import load_dotenv
|
25 |
|
26 |
load_dotenv()
|
@@ -40,6 +43,31 @@ gemini_client = GenerativeModel("gemini-1.5-pro-001")
|
|
40 |
claude_client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
|
41 |
|
42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
def create_db_with_langchain(path):
|
44 |
loader = PyMuPDFLoader(path)
|
45 |
data = loader.load()
|
@@ -55,11 +83,9 @@ def create_db_with_langchain(path):
|
|
55 |
return db
|
56 |
|
57 |
|
58 |
-
def
|
59 |
-
|
60 |
-
|
61 |
-
model_name=model,
|
62 |
-
)
|
63 |
db = create_db_with_langchain(path)
|
64 |
retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": 4, "fetch_k": 20})
|
65 |
prompt = hub.pull("rlm/rag-prompt")
|
@@ -71,7 +97,7 @@ def generate_groq_rag(text, model, path):
|
|
71 |
return rag_chain.invoke(text).content
|
72 |
|
73 |
|
74 |
-
def
|
75 |
completion = groq_client.chat.completions.create(
|
76 |
model=model,
|
77 |
messages=[
|
@@ -93,13 +119,6 @@ def generate_groq_base(text, model):
|
|
93 |
return response
|
94 |
|
95 |
|
96 |
-
def generate_groq(text, model, path):
|
97 |
-
if path:
|
98 |
-
return generate_groq_rag(text, model, path)
|
99 |
-
else:
|
100 |
-
return generate_groq_base(text, model)
|
101 |
-
|
102 |
-
|
103 |
def generate_openai(text, model, openai_client):
|
104 |
message = [{"role": "user", "content": text}]
|
105 |
response = openai_client.chat.completions.create(
|
@@ -144,16 +163,23 @@ def generate_claude(text, model, claude_client):
|
|
144 |
|
145 |
|
146 |
def generate(text, model, path, api=None):
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
from langchain_core.runnables import RunnablePassthrough
|
22 |
from langchain.chains import RetrievalQA
|
23 |
from langchain_groq import ChatGroq
|
24 |
+
from langchain_openai import ChatOpenAI
|
25 |
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
26 |
+
from langchain_anthropic import ChatAnthropic
|
27 |
from dotenv import load_dotenv
|
28 |
|
29 |
load_dotenv()
|
|
|
43 |
claude_client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
|
44 |
|
45 |
|
46 |
+
# LangChain chat models used for retrieval-augmented generation, keyed by the
# model's display name. Every model is created with temperature=0.
# Each row is: (display name, chat model class, model-id keyword, model id).
rag_llms = {
    label: chat_model_cls(temperature=0, **{model_keyword: model_id})
    for label, chat_model_cls, model_keyword, model_id in (
        ("LLaMA 3", ChatGroq, "model_name", "llama3-70b-8192"),
        ("OpenAI GPT 4o Mini", ChatOpenAI, "model_name", "gpt-4o-mini"),
        ("OpenAI GPT 4o", ChatOpenAI, "model_name", "gpt-4o"),
        ("OpenAI GPT 4", ChatOpenAI, "model_name", "gpt-4-turbo"),
        ("Gemini 1.5 Pro", ChatGoogleGenerativeAI, "model", "gemini-1.5-pro"),
        ("Claude Sonnet 3.5", ChatAnthropic, "model_name", "claude-3-5-sonnet-20240620"),
    )
}
|
69 |
+
|
70 |
+
|
71 |
def create_db_with_langchain(path):
|
72 |
loader = PyMuPDFLoader(path)
|
73 |
data = loader.load()
|
|
|
83 |
return db
|
84 |
|
85 |
|
86 |
+
def generate_rag(text, model, path):
|
87 |
+
print(f"Generating text using RAG for {model}...")
|
88 |
+
llm = rag_llms[model]
|
|
|
|
|
89 |
db = create_db_with_langchain(path)
|
90 |
retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": 4, "fetch_k": 20})
|
91 |
prompt = hub.pull("rlm/rag-prompt")
|
|
|
97 |
return rag_chain.invoke(text).content
|
98 |
|
99 |
|
100 |
+
def generate_groq(text, model):
|
101 |
completion = groq_client.chat.completions.create(
|
102 |
model=model,
|
103 |
messages=[
|
|
|
119 |
return response
|
120 |
|
121 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
122 |
def generate_openai(text, model, openai_client):
|
123 |
message = [{"role": "user", "content": text}]
|
124 |
response = openai_client.chat.completions.create(
|
|
|
163 |
|
164 |
|
165 |
def generate(text, model, path, api=None):
    """Generate text with the selected model, using RAG when a path is given.

    Args:
        text: Prompt to send to the model.
        model: Display name of the model, e.g. "LLaMA 3" or "OpenAI GPT 4o".
        path: Source document path. When truthy, generation goes through
            ``generate_rag`` and the path is appended to the result.
        api: Not used by this function.

    Returns:
        The generated text.

    Raises:
        ValueError: If ``path`` is falsy and ``model`` is not a known name.
    """
    if path:
        result = generate_rag(text, model, path)
        if "references" not in result.lower():
            result += "\n\n" + "References:"
        # NOTE(review): the scraped diff lost its indentation, so it is unclear
        # whether the original appended the path only when the heading was
        # missing. Here it is always appended - confirm against the repository.
        result += "\n\n" + f"{path}"
        return result

    print(f"Generating text for {model}...")
    # Lambdas keep the backend calls lazy: only the selected one runs.
    backends = {
        "LLaMA 3": lambda: generate_groq(text, "llama3-70b-8192"),
        "OpenAI GPT 4o Mini": lambda: generate_openai(text, "gpt-4o-mini", openai_client),
        "OpenAI GPT 4o": lambda: generate_openai(text, "gpt-4o", openai_client),
        "OpenAI GPT 4": lambda: generate_openai(text, "gpt-4-turbo", openai_client),
        "Gemini 1.5 Pro": lambda: generate_gemini(text, "", gemini_client),
        "Claude Sonnet 3.5": lambda: generate_claude(
            text, "claude-3-5-sonnet-20240620", claude_client
        ),
    }
    try:
        backend = backends[model]
    except KeyError:
        # Previously an unknown name fell through and returned None, which
        # only surfaced later as an unrelated error in the caller.
        raise ValueError(f"Unsupported model: {model!r}") from None
    return backend()
|
app.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
"""
|
2 |
nohup python3 app.py &
|
3 |
"""
|
|
|
4 |
import openai
|
5 |
import gradio as gr
|
6 |
from typing import Dict, List
|
@@ -62,23 +63,41 @@ def clean_text(text: str) -> str:
|
|
62 |
return "\n".join(cleaned_paragraphs)
|
63 |
|
64 |
|
65 |
-
def
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
""
|
70 |
-
|
71 |
-
return clean_text(corrected_text)
|
72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
|
83 |
|
84 |
def format_and_correct_language_check(text: str) -> str:
|
@@ -108,7 +127,7 @@ def ai_generated_test(text, model="BC Original"):
|
|
108 |
return predict(models[model], tokenizers[model], text)
|
109 |
|
110 |
|
111 |
-
def
|
112 |
# sentences = split_into_sentences(text)
|
113 |
sentences = nltk.sent_tokenize(text)
|
114 |
num_sentences = len(sentences)
|
@@ -145,12 +164,11 @@ def process_text(text, model="BC Original"):
|
|
145 |
overall_scores.append(avg_score)
|
146 |
i = i + 1
|
147 |
combined_sentences = " ".join(colored_sentences)
|
148 |
-
print(combined_sentences)
|
149 |
colored_paragraphs.append(combined_sentences)
|
150 |
|
151 |
overall_score = sum(overall_scores) / len(overall_scores)
|
152 |
overall_score = {"HUMAN": 1 - overall_score, "AI": overall_score}
|
153 |
-
return overall_score,
|
154 |
|
155 |
|
156 |
ai_check_options = [
|
@@ -201,12 +219,14 @@ class GPT2PPL:
|
|
201 |
def ai_generated_test_gptzero(text):
|
202 |
gptzero_model = GPT2PPL()
|
203 |
result = gptzero_model(text)
|
204 |
-
print(result)
|
205 |
return result, None
|
206 |
|
207 |
|
208 |
def highlighter_polygraf(text, model="Polygraf AI (Base Model)"):
|
209 |
-
|
|
|
|
|
|
|
210 |
|
211 |
|
212 |
def ai_check(text: str, option: str):
|
@@ -223,7 +243,6 @@ def ai_check(text: str, option: str):
|
|
223 |
def generate_prompt(settings: Dict[str, str]) -> str:
|
224 |
prompt = f"""
|
225 |
I am a {settings['role']}
|
226 |
-
|
227 |
Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.
|
228 |
|
229 |
Style and Tone:
|
@@ -239,10 +258,11 @@ def generate_prompt(settings: Dict[str, str]) -> str:
|
|
239 |
{', '.join(settings['keywords'])}
|
240 |
|
241 |
Additional requirements:
|
|
|
242 |
- Include {settings['num_examples']} relevant examples or case studies
|
243 |
- Incorporate data or statistics from {', '.join(settings['references'])}
|
244 |
- End with a {settings['conclusion_type']} conclusion
|
245 |
-
- Add a "References" section at the end with at least 3 credible sources, formatted as [1], [2], etc.
|
246 |
- Do not make any headline, title bold.
|
247 |
{settings['sources']}
|
248 |
|
@@ -255,12 +275,11 @@ def generate_prompt(settings: Dict[str, str]) -> str:
|
|
255 |
def regenerate_prompt(settings: Dict[str, str]) -> str:
|
256 |
prompt = f"""
|
257 |
I am a {settings['role']}
|
258 |
-
|
259 |
"{settings['generated_article']}"
|
260 |
-
|
261 |
Edit the given text based on user comments.
|
262 |
|
263 |
Comments:
|
|
|
264 |
- {settings['user_comments']}
|
265 |
- The original content should not be changed. Make minor modifications based on user comments above.
|
266 |
- Keep the references the same as the given text in the same format.
|
@@ -318,30 +337,12 @@ def generate_article(
|
|
318 |
else:
|
319 |
prompt = generate_prompt(settings)
|
320 |
|
321 |
-
print(prompt)
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
{
|
328 |
-
"role": "system",
|
329 |
-
"content": "You are a professional content writer with expertise in various fields.",
|
330 |
-
},
|
331 |
-
{"role": "user", "content": prompt},
|
332 |
-
],
|
333 |
-
max_tokens=3000,
|
334 |
-
n=1,
|
335 |
-
stop=None,
|
336 |
-
temperature=0.7,
|
337 |
-
)
|
338 |
-
article = response.choices[0].message.content.strip()
|
339 |
-
else:
|
340 |
-
article = generate(
|
341 |
-
prompt,
|
342 |
-
ai_model,
|
343 |
-
pdf_file_input, # api_key
|
344 |
-
)
|
345 |
|
346 |
return clean_text(article)
|
347 |
|
@@ -354,14 +355,16 @@ def humanize(
|
|
354 |
top_k: int = 50,
|
355 |
length_penalty: float = 1,
|
356 |
) -> str:
|
|
|
357 |
result = paraphrase_text(
|
358 |
-
text=
|
359 |
model_name=model,
|
360 |
temperature=temperature,
|
361 |
repetition_penalty=repetition_penalty,
|
362 |
top_k=top_k,
|
363 |
length_penalty=length_penalty,
|
364 |
)
|
|
|
365 |
return format_and_correct_language_check(result)
|
366 |
|
367 |
|
@@ -376,18 +379,20 @@ def format_references(text: str) -> str:
|
|
376 |
lines = text.split("\n")
|
377 |
references = []
|
378 |
article_text = []
|
|
|
379 |
in_references = False
|
380 |
|
381 |
for line in lines:
|
382 |
-
if (
|
383 |
-
line.strip().lower() == "references"
|
384 |
-
or line.strip().lower() == "references:"
|
385 |
-
or line.strip().lower().startswith("references:")
|
386 |
-
):
|
387 |
in_references = True
|
388 |
continue
|
|
|
|
|
389 |
if in_references:
|
390 |
-
|
|
|
|
|
|
|
391 |
else:
|
392 |
article_text.append(line)
|
393 |
|
@@ -429,25 +434,26 @@ def generate_and_format(
|
|
429 |
generated_article: str = None,
|
430 |
user_comments: str = None,
|
431 |
):
|
432 |
-
date_from = build_date(year_from, month_from, day_from)
|
433 |
-
date_to = build_date(year_to, month_to, day_to)
|
434 |
-
sorted_date = f"date:r:{date_from}:{date_to}"
|
435 |
content_string = ""
|
436 |
-
final_query = topic
|
437 |
-
if include_sites:
|
438 |
-
site_queries = [f"site:{site.strip()}" for site in include_sites.split(",")]
|
439 |
-
final_query += " " + " OR ".join(site_queries)
|
440 |
-
if exclude_sites:
|
441 |
-
exclude_queries = [f"-site:{site.strip()}" for site in exclude_sites.split(",")]
|
442 |
-
final_query += " " + " ".join(exclude_queries)
|
443 |
-
print(f"Final Query: {final_query}")
|
444 |
-
|
445 |
if google_search_check:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
446 |
url_content = google_search(final_query, sorted_date, domains_to_include)
|
447 |
content_string = "\n".join(
|
448 |
f"{url.strip()}: \n{content.strip()[:2000]}" for url, content in url_content.items()
|
449 |
)
|
450 |
-
content_string =
|
|
|
|
|
451 |
article = generate_article(
|
452 |
input_role,
|
453 |
topic,
|
@@ -469,6 +475,10 @@ def generate_and_format(
|
|
469 |
generated_article,
|
470 |
user_comments,
|
471 |
)
|
|
|
|
|
|
|
|
|
472 |
return format_references(article)
|
473 |
|
474 |
|
|
|
1 |
"""
|
2 |
nohup python3 app.py &
|
3 |
"""
|
4 |
+
|
5 |
import openai
|
6 |
import gradio as gr
|
7 |
from typing import Dict, List
|
|
|
63 |
return "\n".join(cleaned_paragraphs)
|
64 |
|
65 |
|
66 |
+
def split_text_from_refs(text: str, sep="\n"):
    """Split an article into its body and a renumbered references section.

    Lines before the references heading form the body. A line equal to
    "references" or "references:" (case-insensitive, surrounding whitespace
    ignored) starts the references section and is dropped. A line that only
    starts with "references:" also starts the section, and entries on that
    same line are kept. Inside the section every line is split on "[n]"
    markers and each non-empty, non-numeric piece becomes one reference.

    Args:
        text: Article text, possibly ending with a references section.
        sep: Separator used when rendering the references section
            (for example "<br>" for HTML output).

    Returns:
        A ``(body, references)`` tuple. ``body`` is the body lines joined by
        blank lines. ``references`` is the rendered section starting with a
        "References:" heading, or an empty string when no reference entries
        were found.
    """
    index_pattern = re.compile(r"\[(\d+)\]")
    heading_markers = ("references", "references:")
    article_text = []
    references = []
    in_references = False

    for line in text.split("\n"):
        normalized = line.strip().lower()
        if normalized in heading_markers:
            in_references = True
            continue
        if normalized.startswith("references:"):
            in_references = True
        if not in_references:
            article_text.append(line)
            continue
        # The capturing group makes split() return the index digits as well;
        # those, blank pieces and the heading text itself are discarded.
        for piece in index_pattern.split(line):
            if (
                piece.strip()
                and not piece.isdigit()
                and not piece.strip().lower().startswith("references:")
            ):
                references.append(piece.strip())

    body = "\n\n".join(article_text)
    if not references:
        # Do not fabricate an empty "References:" heading for text that has
        # no reference entries.
        return body, ""

    formatted_refs = [
        f"[{i}] {remove_bracketed_numbers(ref)}{sep}"
        for i, ref in enumerate(references, 1)
    ]
    return body, f"{sep}{sep}References:{sep}" + sep.join(formatted_refs)
|
93 |
+
|
94 |
+
|
95 |
+
def ends_with_references(text):
    """Return True when *text* ends with a bare "References:" heading.

    Surrounding whitespace is ignored and the match is case-insensitive.
    A heading that is followed by reference entries does not count.

    Args:
        text: Text to inspect.

    Returns:
        True if the last non-whitespace content of *text* is "References:".
    """
    # re.MULTILINE is deliberately not used: with it, `$` matches at the end
    # of every line, so any text containing a "References:" line would be
    # reported as ending with it even when entries follow.
    pattern = re.compile(r"\breferences:\s*\Z", re.IGNORECASE)
    return bool(pattern.search(text.strip()))
|
101 |
|
102 |
|
103 |
def format_and_correct_language_check(text: str) -> str:
|
|
|
127 |
return predict(models[model], tokenizers[model], text)
|
128 |
|
129 |
|
130 |
+
def detection_polygraf(text, model="BC Original"):
|
131 |
# sentences = split_into_sentences(text)
|
132 |
sentences = nltk.sent_tokenize(text)
|
133 |
num_sentences = len(sentences)
|
|
|
164 |
overall_scores.append(avg_score)
|
165 |
i = i + 1
|
166 |
combined_sentences = " ".join(colored_sentences)
|
|
|
167 |
colored_paragraphs.append(combined_sentences)
|
168 |
|
169 |
overall_score = sum(overall_scores) / len(overall_scores)
|
170 |
overall_score = {"HUMAN": 1 - overall_score, "AI": overall_score}
|
171 |
+
return overall_score, "<br><br>".join(colored_paragraphs)
|
172 |
|
173 |
|
174 |
ai_check_options = [
|
|
|
219 |
def ai_generated_test_gptzero(text):
    """Run the GPT2PPL detector on *text*.

    Returns:
        A ``(result, None)`` pair: the detector's output, with ``None`` in
        the second slot.
    """
    # A fresh GPT2PPL instance is built on every call.
    detector = GPT2PPL()
    return detector(text), None
|
223 |
|
224 |
|
225 |
def highlighter_polygraf(text, model="Polygraf AI (Base Model)"):
    """Score the article body and re-attach its references section.

    The references are split off first so that only the body is passed to
    ``detection_polygraf``. They are rendered with "<br>" separators and
    appended to the detector's text output.

    Returns:
        A ``(score, text)`` pair: the detector's score and its text output
        followed by "<br>" and the references section.
    """
    article_body, refs_html = split_text_from_refs(text, "<br>")
    score, highlighted = detection_polygraf(text=article_body, model=model)
    return score, highlighted + "<br>" + refs_html
|
230 |
|
231 |
|
232 |
def ai_check(text: str, option: str):
|
|
|
243 |
def generate_prompt(settings: Dict[str, str]) -> str:
|
244 |
prompt = f"""
|
245 |
I am a {settings['role']}
|
|
|
246 |
Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.
|
247 |
|
248 |
Style and Tone:
|
|
|
258 |
{', '.join(settings['keywords'])}
|
259 |
|
260 |
Additional requirements:
|
261 |
+
- Don't start with "Here is a...", start with the requested text directly
|
262 |
- Include {settings['num_examples']} relevant examples or case studies
|
263 |
- Incorporate data or statistics from {', '.join(settings['references'])}
|
264 |
- End with a {settings['conclusion_type']} conclusion
|
265 |
+
- Add a "References" section in the format "References:\n" at the end with at least 3 credible sources, formatted as [1], [2], etc. with each source on their own line
|
266 |
- Do not make any headline, title bold.
|
267 |
{settings['sources']}
|
268 |
|
|
|
275 |
def regenerate_prompt(settings: Dict[str, str]) -> str:
|
276 |
prompt = f"""
|
277 |
I am a {settings['role']}
|
|
|
278 |
"{settings['generated_article']}"
|
|
|
279 |
Edit the given text based on user comments.
|
280 |
|
281 |
Comments:
|
282 |
+
- Don't start with "Here is a...", start with the requested text directly
|
283 |
- {settings['user_comments']}
|
284 |
- The original content should not be changed. Make minor modifications based on user comments above.
|
285 |
- Keep the references the same as the given text in the same format.
|
|
|
337 |
else:
|
338 |
prompt = generate_prompt(settings)
|
339 |
|
340 |
+
print("Generated Prompt...\n", prompt)
|
341 |
+
article = generate(
|
342 |
+
prompt,
|
343 |
+
ai_model,
|
344 |
+
pdf_file_input, # api_key
|
345 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
346 |
|
347 |
return clean_text(article)
|
348 |
|
|
|
355 |
top_k: int = 50,
|
356 |
length_penalty: float = 1,
|
357 |
) -> str:
|
358 |
+
body, references = split_text_from_refs(text)
|
359 |
result = paraphrase_text(
|
360 |
+
text=body,
|
361 |
model_name=model,
|
362 |
temperature=temperature,
|
363 |
repetition_penalty=repetition_penalty,
|
364 |
top_k=top_k,
|
365 |
length_penalty=length_penalty,
|
366 |
)
|
367 |
+
result = result + "\n\n" + references
|
368 |
return format_and_correct_language_check(result)
|
369 |
|
370 |
|
|
|
379 |
lines = text.split("\n")
|
380 |
references = []
|
381 |
article_text = []
|
382 |
+
index_pattern = re.compile(r"\[(\d+)\]")
|
383 |
in_references = False
|
384 |
|
385 |
for line in lines:
|
386 |
+
if line.strip().lower() == "references" or line.strip().lower() == "references:":
|
|
|
|
|
|
|
|
|
387 |
in_references = True
|
388 |
continue
|
389 |
+
if line.strip().lower().startswith("references:"):
|
390 |
+
in_references = True
|
391 |
if in_references:
|
392 |
+
matches = index_pattern.split(line)
|
393 |
+
for match in matches:
|
394 |
+
if match.strip() and not match.isdigit() and not match.strip().lower().startswith("references:"):
|
395 |
+
references.append(match.strip())
|
396 |
else:
|
397 |
article_text.append(line)
|
398 |
|
|
|
434 |
generated_article: str = None,
|
435 |
user_comments: str = None,
|
436 |
):
|
|
|
|
|
|
|
437 |
content_string = ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
438 |
if google_search_check:
|
439 |
+
date_from = build_date(year_from, month_from, day_from)
|
440 |
+
date_to = build_date(year_to, month_to, day_to)
|
441 |
+
sorted_date = f"date:r:{date_from}:{date_to}"
|
442 |
+
final_query = topic
|
443 |
+
if include_sites:
|
444 |
+
site_queries = [f"site:{site.strip()}" for site in include_sites.split(",")]
|
445 |
+
final_query += " " + " OR ".join(site_queries)
|
446 |
+
if exclude_sites:
|
447 |
+
exclude_queries = [f"-site:{site.strip()}" for site in exclude_sites.split(",")]
|
448 |
+
final_query += " " + " ".join(exclude_queries)
|
449 |
+
print(f"Google Search Query: {final_query}")
|
450 |
url_content = google_search(final_query, sorted_date, domains_to_include)
|
451 |
content_string = "\n".join(
|
452 |
f"{url.strip()}: \n{content.strip()[:2000]}" for url, content in url_content.items()
|
453 |
)
|
454 |
+
content_string = (
|
455 |
+
"Use the trusted information here from the URLs and add them as References:\n" + content_string
|
456 |
+
)
|
457 |
article = generate_article(
|
458 |
input_role,
|
459 |
topic,
|
|
|
475 |
generated_article,
|
476 |
user_comments,
|
477 |
)
|
478 |
+
if ends_with_references(article) and url_content is not None:
|
479 |
+
for url in url_content.keys():
|
480 |
+
article += f"\n{url}"
|
481 |
+
|
482 |
return format_references(article)
|
483 |
|
484 |
|
plagiarism.py
CHANGED
@@ -15,7 +15,8 @@ def clean_html(text):
|
|
15 |
result += article.title + "\n"
|
16 |
paragraphs = justext.justext(text, justext.get_stoplist("English"))
|
17 |
for paragraph in paragraphs:
|
18 |
-
|
|
|
19 |
return result
|
20 |
|
21 |
|
@@ -128,6 +129,7 @@ def google_search(
|
|
128 |
break
|
129 |
if soup:
|
130 |
text = clean_html(soup.text)
|
131 |
-
|
132 |
-
|
|
|
133 |
return result_content
|
|
|
15 |
result += article.title + "\n"
|
16 |
paragraphs = justext.justext(text, justext.get_stoplist("English"))
|
17 |
for paragraph in paragraphs:
|
18 |
+
if not paragraph.is_boilerplate:
|
19 |
+
result += paragraph.text
|
20 |
return result
|
21 |
|
22 |
|
|
|
129 |
break
|
130 |
if soup:
|
131 |
text = clean_html(soup.text)
|
132 |
+
if len(text) > 500:
|
133 |
+
result_content[url] = text
|
134 |
+
count += 1
|
135 |
return result_content
|
requirements.txt
CHANGED
@@ -22,4 +22,7 @@ chromadb
|
|
22 |
language-tool-python
|
23 |
anthropic
|
24 |
google-generativeai
|
|
|
|
|
|
|
25 |
vertexai
|
|
|
22 |
language-tool-python
|
23 |
anthropic
|
24 |
google-generativeai
|
25 |
+
langchain-google-genai
|
26 |
+
langchain-anthropic
|
27 |
+
langchain-openai
|
28 |
vertexai
|