minko186 commited on
Commit
f6b1cb0
·
1 Parent(s): e76dfe8

updates on prompt + better error handling

Browse files
Files changed (2) hide show
  1. ai_generate.py +61 -43
  2. app.py +8 -4
ai_generate.py CHANGED
@@ -17,6 +17,7 @@ from langchain_anthropic import ChatAnthropic
17
  from dotenv import load_dotenv
18
  from langchain_core.output_parsers import XMLOutputParser
19
  from langchain.prompts import ChatPromptTemplate
 
20
 
21
  load_dotenv()
22
 
@@ -51,10 +52,19 @@ llm_classes = {
51
 
52
  xml_system = """You're a helpful AI assistant. Given a user prompt and some related sources, fulfill all the requirements \
53
  of the prompt and provide citations. If a chunk of the generated text does not use any of the sources (for example, \
54
- introductions or general text), don't put a citation for that chunk and just leave citations empty. Otherwise, \
55
- list all sources used for that chunk of the text. Don't add inline citations in the text itself. Add all citations to the separated \
56
- citations section. Use explicit new lines in the text to show paragraph splits. \
57
- Return a citation for every quote across all articles that justify the text. Use the following format for your final output:
 
 
 
 
 
 
 
 
 
58
  <cited_text>
59
  <chunk>
60
  <text></text>
@@ -95,51 +105,59 @@ def get_doc_content(docs, id):
95
  return docs[id].page_content
96
 
97
 
 
 
 
 
 
 
98
  def process_cited_text(data, docs):
99
  # Initialize variables for the combined text and a dictionary for citations
100
  combined_text = ""
101
  citations = {}
102
  # Iterate through the cited_text list
103
- for item in data['cited_text']:
104
- chunk_text = item['chunk'][0]['text']
105
- combined_text += chunk_text
106
- citation_ids = []
107
- # Process the citations for the chunk
108
- if item['chunk'][1]['citations']:
109
- for c in item['chunk'][1]['citations']:
110
- if c and 'citation' in c:
111
- citation = c['citation']
112
- if isinstance(citation, dict) and "source_id" in citation:
113
- citation = citation['source_id']
114
- if isinstance(citation, str):
115
- try:
116
- citation_ids.append(int(citation))
117
- except ValueError:
118
- pass # Handle cases where the string is not a valid integer
119
- if citation_ids:
120
- citation_texts = [f"<{cid}-{docs[cid].metadata['source']}>" for cid in citation_ids]
121
- combined_text += " " + " ".join(citation_texts)
122
- combined_text += "\n\n"
123
- # Store unique citations in a dictionary
124
- for citation_id in citation_ids:
125
- if citation_id not in citations:
126
- citations[citation_id] = {'source': docs[citation_id].metadata['source'], 'content': docs[citation_id].page_content}
 
127
 
128
  return combined_text.strip(), citations
129
 
130
 
131
  def citations_to_html(citations):
132
- # Generate the HTML for the unique citations
133
- html_content = ""
134
- for citation_id, citation_info in citations.items():
135
- html_content += (
136
- f"<li><strong>Source ID:</strong> {citation_id}<br>"
137
- f"<strong>Path:</strong> {citation_info['source']}<br>"
138
- f"<strong>Page Content:</strong> {citation_info['content']}</li>"
139
- )
140
- html_content += "</ul></body></html>"
141
-
142
- return html_content
 
143
 
144
 
145
  def load_llm(model: str, api_key: str, temperature: float = 1.0, max_length: int = 2048):
@@ -227,13 +245,13 @@ def generate_base(
227
  llm = load_llm(model, api_key, temperature, max_length)
228
  if llm is None:
229
  print("Failed to load LLM. Aborting operation.")
230
- return None
231
  try:
232
  output = llm.invoke(prompt).content
233
- return output
234
  except Exception as e:
235
  print(f"An error occurred while running the model: {e}")
236
- return None
237
 
238
 
239
  def generate(
@@ -250,4 +268,4 @@ def generate(
250
  if path or url_content:
251
  return generate_rag(prompt, topic, model, url_content, path, temperature, max_length, api_key, sys_message)
252
  else:
253
- return generate_base(prompt, topic, model, temperature, max_length, api_key, sys_message)
 
17
  from dotenv import load_dotenv
18
  from langchain_core.output_parsers import XMLOutputParser
19
  from langchain.prompts import ChatPromptTemplate
20
+ import re
21
 
22
  load_dotenv()
23
 
 
52
 
53
  xml_system = """You're a helpful AI assistant. Given a user prompt and some related sources, fulfill all the requirements \
54
  of the prompt and provide citations. If a chunk of the generated text does not use any of the sources (for example, \
55
+ introductions or general text), don't put a citation for that chunk and just leave "citations" section empty. Otherwise, \
56
+ list all sources used for that chunk of the text. Remember, don't add inline citations in the text itself in any circumstant.
57
+ Add all citations to the separate citations section. Use explicit new lines in the text to show paragraph splits. For each chunk use this example format:
58
+ <chunk>
59
+ <text>This is a sample text chunk....</text>
60
+ <citations>
61
+ <citation>1</citation>
62
+ <citation>3</citation>
63
+ ...
64
+ </citations>
65
+ </chunk>
66
+ If the prompt asks for a reference section, add it in a chunk without any citations
67
+ Return a citation for every quote across all articles that justify the text. Remember use the following format for your final output:
68
  <cited_text>
69
  <chunk>
70
  <text></text>
 
105
  return docs[id].page_content
106
 
107
 
108
def remove_citations(text):
    """Strip numeric citation markers (e.g. "<3>" and "[3]") from generated text.

    Args:
        text: Generated article text that may contain citation markers.

    Returns:
        The text with angle-bracket and square-bracket citation markers
        removed; all other characters (including plain digits) are kept.
    """
    # Remove angle-bracket markers such as "<12>".
    text = re.sub(r'<\d+>', '', text)
    # Remove square-bracket markers such as "[12]". The brackets must be
    # escaped: the previous pattern r'[\d+]' was a character class that
    # deleted every digit and '+' anywhere in the text.
    text = re.sub(r'\[\d+\]', '', text)
    return text
112
+
113
+
114
def process_cited_text(data, docs):
    """Flatten parsed XML citation output into display text plus a citation map.

    Args:
        data: Parsed XML output; expected shape is
            ``{'cited_text': [{'chunk': [{'text': str}, {'citations': [...]}]}]}``.
            A missing ``'cited_text'`` key yields an empty result.
        docs: Sequence of source documents indexed by citation id; each is
            assumed to expose ``metadata['source']`` and ``page_content``
            (langchain ``Document``-like — TODO confirm against caller).

    Returns:
        Tuple ``(combined_text, citations)``: the chunk texts joined with
        blank lines, each followed by its inline ``<id>`` markers, and a dict
        mapping each cited id to its source path and page content.
    """
    combined_text = ""
    citations = {}
    if 'cited_text' in data:
        for item in data['cited_text']:
            chunk = item['chunk']
            combined_text += chunk[0]['text']
            citation_ids = []
            # Collect the integer citation ids referenced by this chunk.
            if chunk[1]['citations']:
                for c in chunk[1]['citations']:
                    if c and 'citation' in c:
                        citation = c['citation']
                        if isinstance(citation, dict) and "source_id" in citation:
                            citation = citation['source_id']
                        if isinstance(citation, str):
                            try:
                                cid = int(citation)
                            except ValueError:
                                continue  # skip non-numeric citation strings
                            # Guard against hallucinated ids: an out-of-range
                            # id would raise IndexError below, and a negative
                            # one would silently cite the wrong document.
                            if 0 <= cid < len(docs):
                                citation_ids.append(cid)
            if citation_ids:
                combined_text += " " + "".join(f"<{cid}>" for cid in citation_ids)
            combined_text += "\n\n"
            # Record each cited source once, keyed by id.
            for cid in citation_ids:
                if cid not in citations:
                    citations[cid] = {
                        'source': docs[cid].metadata['source'],
                        'content': docs[cid].page_content,
                    }
    return combined_text.strip(), citations
146
 
147
 
148
def citations_to_html(citations):
    """Render the unique citations as an HTML list.

    Args:
        citations: Mapping of citation id -> ``{'source': path, 'content': text}``,
            as produced by ``process_cited_text``. May be empty or None.

    Returns:
        An HTML string listing every citation, or "" when there are none.
    """
    if not citations:
        return ""
    # Open the tags explicitly so the markup is balanced; the previous
    # version emitted the closing "</ul></body></html>" with no openers.
    html_content = "<html><body><ul>"
    for citation_id, citation_info in citations.items():
        html_content += (
            f"<li><strong>Source ID:</strong> {citation_id}<br>"
            f"<strong>Path:</strong> {citation_info['source']}<br>"
            f"<strong>Page Content:</strong> {citation_info['content']}</li>"
        )
    html_content += "</ul></body></html>"
    return html_content
161
 
162
 
163
  def load_llm(model: str, api_key: str, temperature: float = 1.0, max_length: int = 2048):
 
245
  llm = load_llm(model, api_key, temperature, max_length)
246
  if llm is None:
247
  print("Failed to load LLM. Aborting operation.")
248
+ return None, None
249
  try:
250
  output = llm.invoke(prompt).content
251
+ return output, None
252
  except Exception as e:
253
  print(f"An error occurred while running the model: {e}")
254
+ return None, None
255
 
256
 
257
  def generate(
 
268
  if path or url_content:
269
  return generate_rag(prompt, topic, model, url_content, path, temperature, max_length, api_key, sys_message)
270
  else:
271
+ return generate_base(prompt, topic, model, temperature, max_length, api_key, sys_message)
app.py CHANGED
@@ -19,7 +19,9 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipe
19
  from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
20
  from google_search import google_search, months, domain_list, build_date
21
  from humanize import humanize_text, device
22
- from ai_generate import generate, citations_to_html
 
 
23
 
24
  print(f"Using device: {device}")
25
 
@@ -244,6 +246,7 @@ def predict_mc_scores(input, bc_score):
244
 
245
 
246
  def highlighter_polygraf(text, model="Polygraf AI (Base Model)"):
 
247
  body, references = split_text_from_refs(text)
248
  score, text = detection_polygraf(text=body, model=model)
249
  mc_score = predict_mc_scores(body, score) # mc score
@@ -260,6 +263,7 @@ def ai_check(text: str, option: str):
260
 
261
  def generate_prompt(settings: Dict[str, str]) -> str:
262
  settings['keywords'] = [item for item in settings['keywords'] if item.strip()]
 
263
  prompt = f"""
264
  Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.\n
265
  """
@@ -273,7 +277,7 @@ Write a {settings['article_length']} words (around) {settings['format']} on {set
273
  - Writing style: {settings['writing_style']}
274
  - Tone: {settings['tone']}
275
  - Target audience: {settings['user_category']}
276
-
277
  Content:
278
  - Depth: {settings['depth_of_content']}
279
  - Structure: {', '.join(settings['structure'])}
@@ -302,7 +306,7 @@ def regenerate_prompt(settings: Dict[str, str]) -> str:
302
  Edit the given text based on user comments.
303
  User Comments:
304
  - {settings['user_comments']}
305
-
306
  Requirements:
307
  - Don't start with "Here is a...", start with the requested text directly
308
  - The original content should not be changed. Make minor modifications based on user comments above.
@@ -310,7 +314,7 @@ def regenerate_prompt(settings: Dict[str, str]) -> str:
310
  - Do not make any headline, title bold.
311
  Context:
312
  - {settings['context']}
313
-
314
  Ensure proper paragraph breaks for better readability.
315
  Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
316
  """
 
19
  from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
20
  from google_search import google_search, months, domain_list, build_date
21
  from humanize import humanize_text, device
22
+ from ai_generate import generate, citations_to_html, remove_citations
23
+ import nltk
24
+ nltk.download('punkt_tab')
25
 
26
  print(f"Using device: {device}")
27
 
 
246
 
247
 
248
  def highlighter_polygraf(text, model="Polygraf AI (Base Model)"):
249
+ text = remove_citations(text)
250
  body, references = split_text_from_refs(text)
251
  score, text = detection_polygraf(text=body, model=model)
252
  mc_score = predict_mc_scores(body, score) # mc score
 
263
 
264
  def generate_prompt(settings: Dict[str, str]) -> str:
265
  settings['keywords'] = [item for item in settings['keywords'] if item.strip()]
266
+ # - Add a "References" section in the format "References:" on a new line after the requested text, formatted as [1], [2], etc. with each source on their own line
267
  prompt = f"""
268
  Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.\n
269
  """
 
277
  - Writing style: {settings['writing_style']}
278
  - Tone: {settings['tone']}
279
  - Target audience: {settings['user_category']}
280
+
281
  Content:
282
  - Depth: {settings['depth_of_content']}
283
  - Structure: {', '.join(settings['structure'])}
 
306
  Edit the given text based on user comments.
307
  User Comments:
308
  - {settings['user_comments']}
309
+
310
  Requirements:
311
  - Don't start with "Here is a...", start with the requested text directly
312
  - The original content should not be changed. Make minor modifications based on user comments above.
 
314
  - Do not make any headline, title bold.
315
  Context:
316
  - {settings['context']}
317
+
318
  Ensure proper paragraph breaks for better readability.
319
  Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
320
  """