updates on prompt + better error handling
ai_generate.py +61 -43
app.py +8 -4
ai_generate.py
CHANGED
@@ -17,6 +17,7 @@ from langchain_anthropic import ChatAnthropic
 from dotenv import load_dotenv
 from langchain_core.output_parsers import XMLOutputParser
 from langchain.prompts import ChatPromptTemplate
+import re
 
 load_dotenv()
 
@@ -51,10 +52,19 @@ llm_classes = {
 
 xml_system = """You're a helpful AI assistant. Given a user prompt and some related sources, fulfill all the requirements \
 of the prompt and provide citations. If a chunk of the generated text does not use any of the sources (for example, \
-introductions or general text), don't put a citation for that chunk and just leave citations empty. Otherwise, \
-list all sources used for that chunk of the text.
-citations section. Use explicit new lines in the text to show paragraph splits.
-
+introductions or general text), don't put a citation for that chunk and just leave the "citations" section empty. Otherwise, \
+list all sources used for that chunk of the text. Remember, don't add inline citations in the text itself under any circumstance.
+Add all citations to the separate citations section. Use explicit new lines in the text to show paragraph splits. For each chunk use this example format:
+<chunk>
+    <text>This is a sample text chunk....</text>
+    <citations>
+        <citation>1</citation>
+        <citation>3</citation>
+        ...
+    </citations>
+</chunk>
+If the prompt asks for a reference section, add it in a chunk without any citations.
+Return a citation for every quote across all articles that justifies the text. Remember to use the following format for your final output:
 <cited_text>
 <chunk>
 <text></text>
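The tightened prompt pins down the exact chunk format, which is what makes the model's reply mechanically parseable. As a minimal sketch (not part of the commit), this is the nested shape LangChain's XMLOutputParser should produce for one conforming chunk, matching how process_cited_text below indexes the result via item['chunk'][0]['text'] and item['chunk'][1]['citations']:

```python
from langchain_core.output_parsers import XMLOutputParser

# Hypothetical model reply following the format requested in xml_system.
sample = """<cited_text>
<chunk>
<text>Self-attention replaced recurrence in sequence models.</text>
<citations>
<citation>1</citation>
<citation>3</citation>
</citations>
</chunk>
</cited_text>"""

data = XMLOutputParser().parse(sample)
# data == {'cited_text': [{'chunk': [
#     {'text': 'Self-attention replaced recurrence in sequence models.'},
#     {'citations': [{'citation': '1'}, {'citation': '3'}]}]}]}
```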
@@ -95,51 +105,59 @@ def get_doc_content(docs, id):
     return docs[id].page_content
 
 
+def remove_citations(text):
+    text = re.sub(r'<\d+>', '', text)
+    text = re.sub(r'\[\d+\]', '', text)
+    return text
+
+
 def process_cited_text(data, docs):
     # Initialize variables for the combined text and a dictionary for citations
     combined_text = ""
     citations = {}
     # Iterate through the cited_text list
-[removed lines not rendered on the page]
+    if 'cited_text' in data:
+        for item in data['cited_text']:
+            chunk_text = item['chunk'][0]['text']
+            combined_text += chunk_text
+            citation_ids = []
+            # Process the citations for the chunk
+            if item['chunk'][1]['citations']:
+                for c in item['chunk'][1]['citations']:
+                    if c and 'citation' in c:
+                        citation = c['citation']
+                        if isinstance(citation, dict) and "source_id" in citation:
+                            citation = citation['source_id']
+                        if isinstance(citation, str):
+                            try:
+                                citation_ids.append(int(citation))
+                            except ValueError:
+                                pass  # Handle cases where the string is not a valid integer
+            if citation_ids:
+                citation_texts = [f"<{cid}>" for cid in citation_ids]
+                combined_text += " " + "".join(citation_texts)
+            combined_text += "\n\n"
+            # Store unique citations in a dictionary
+            for citation_id in citation_ids:
+                if citation_id not in citations:
+                    citations[citation_id] = {'source': docs[citation_id].metadata['source'], 'content': docs[citation_id].page_content}
 
     return combined_text.strip(), citations
 
 
 def citations_to_html(citations):
-[removed lines not rendered on the page]
+    if citations:
+        # Generate the HTML for the unique citations
+        html_content = ""
+        for citation_id, citation_info in citations.items():
+            html_content += (
+                f"<li><strong>Source ID:</strong> {citation_id}<br>"
+                f"<strong>Path:</strong> {citation_info['source']}<br>"
+                f"<strong>Page Content:</strong> {citation_info['content']}</li>"
+            )
+        html_content += "</ul></body></html>"
+        return html_content
+    return ""
 
 
 def load_llm(model: str, api_key: str, temperature: float = 1.0, max_length: int = 2048):
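A quick round trip through the three new helpers (illustrative only, with a hypothetical two-document docs list; process_cited_text only touches .metadata['source'] and .page_content). Note the bracket pattern in remove_citations is shown escaped as r'\[\d+\]': an unescaped [\d+] is a character class that would strip every digit from the article, not just [1]-style markers.

```python
from langchain_core.documents import Document

# Hypothetical retrieved sources; the list index doubles as the source ID.
docs = [
    Document(page_content="Chunk about attention.", metadata={"source": "paper.pdf"}),
    Document(page_content="Chunk about RNNs.", metadata={"source": "notes.txt"}),
]
# Parsed model output in the shape sketched after the prompt hunk above.
data = {"cited_text": [{"chunk": [
    {"text": "Attention replaced recurrence."},
    {"citations": [{"citation": "0"}]},
]}]}

text, citations = process_cited_text(data, docs)
# text      == "Attention replaced recurrence. <0>"
# citations == {0: {'source': 'paper.pdf', 'content': 'Chunk about attention.'}}
html = citations_to_html(citations)   # "<li><strong>Source ID:</strong> 0<br>..."
clean = remove_citations(text)        # "<0>" marker stripped before AI detection
```

citations_to_html emits only the <li> items plus closing </ul></body></html> tags, so the opening <html><body><ul> presumably comes from the caller's template.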
@@ -227,13 +245,13 @@ def generate_base(
     llm = load_llm(model, api_key, temperature, max_length)
     if llm is None:
         print("Failed to load LLM. Aborting operation.")
-        return None
+        return None, None
     try:
         output = llm.invoke(prompt).content
-        return output
+        return output, None
     except Exception as e:
         print(f"An error occurred while running the model: {e}")
-        return None
+        return None, None
 
 
 def generate(
@@ -250,4 +268,4 @@ def generate(
     if path or url_content:
         return generate_rag(prompt, topic, model, url_content, path, temperature, max_length, api_key, sys_message)
     else:
-        return generate_base(prompt, topic, model, temperature, max_length, api_key, sys_message)
+        return generate_base(prompt, topic, model, temperature, max_length, api_key, sys_message)
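Since generate_base now returns a pair, both branches of generate can presumably be unpacked identically. A hedged caller sketch: the argument order is assumed from the call sites above, and generate_rag is assumed to return text plus citation data as well:

```python
# Illustrative caller, not part of the commit.
article, citations = generate(prompt, topic, model, url_content, path,
                              temperature, max_length, api_key, sys_message)
if article is None:
    print("Generation failed; nothing to render.")
else:
    references_html = citations_to_html(citations) if citations else ""
```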
app.py
CHANGED
@@ -19,7 +19,9 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipe
 from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
 from google_search import google_search, months, domain_list, build_date
 from humanize import humanize_text, device
-from ai_generate import generate, citations_to_html
+from ai_generate import generate, citations_to_html, remove_citations
+import nltk
+nltk.download('punkt_tab')
 
 print(f"Using device: {device}")
 
|
@@ -244,6 +246,7 @@ def predict_mc_scores(input, bc_score):
|
|
244 |
|
245 |
|
246 |
def highlighter_polygraf(text, model="Polygraf AI (Base Model)"):
|
|
|
247 |
body, references = split_text_from_refs(text)
|
248 |
score, text = detection_polygraf(text=body, model=model)
|
249 |
mc_score = predict_mc_scores(body, score) # mc score
|
@@ -260,6 +263,7 @@ def ai_check(text: str, option: str):
 
 def generate_prompt(settings: Dict[str, str]) -> str:
     settings['keywords'] = [item for item in settings['keywords'] if item.strip()]
+    # - Add a "References" section in the format "References:" on a new line after the requested text, formatted as [1], [2], etc. with each source on their own line
     prompt = f"""
 Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.\n
 """
@@ -273,7 +277,7 @@ Write a {settings['article_length']} words (around) {settings['format']} on {set
 - Writing style: {settings['writing_style']}
 - Tone: {settings['tone']}
 - Target audience: {settings['user_category']}
-
+
 Content:
 - Depth: {settings['depth_of_content']}
 - Structure: {', '.join(settings['structure'])}
@@ -302,7 +306,7 @@ def regenerate_prompt(settings: Dict[str, str]) -> str:
 Edit the given text based on user comments.
 User Comments:
 - {settings['user_comments']}
-
+
 Requirements:
 - Don't start with "Here is a...", start with the requested text directly
 - The original content should not be changed. Make minor modifications based on user comments above.
@@ -310,7 +314,7 @@ def regenerate_prompt(settings: Dict[str, str]) -> str:
 - Do not make any headline, title bold.
 Context:
 - {settings['context']}
-
+
 Ensure proper paragraph breaks for better readability.
 Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
 """
|