deeme committed on
Commit
734d84d
·
verified ·
1 Parent(s): a90e364

Upload free_ask_internet.py

Browse files
Files changed (1) hide show
  1. free_ask_internet.py +104 -56
free_ask_internet.py CHANGED
@@ -8,6 +8,7 @@ import trafilatura
8
  from trafilatura import bare_extraction
9
  from concurrent.futures import ThreadPoolExecutor
10
  import concurrent
 
11
  import openai
12
  import time
13
  from datetime import datetime
@@ -18,13 +19,13 @@ import urllib.parse
18
 
19
 
20
  def extract_url_content(url):
21
- downloaded = requests.get(url, headers={
22
- 'Accept-Language': 'en-US,en;q=0.9' # 强制英文内容
23
- }).content
24
- content = trafilatura.extract(downloaded)
25
 
26
- return {"url": url, "content": content}
 
27
 
 
28
 
29
  def search_web_ref(query:str, debug=False):
30
 
@@ -32,7 +33,14 @@ def search_web_ref(query:str, debug=False):
32
 
33
  try:
34
 
35
- safe_string = urllib.parse.quote_plus(":all !general " + query)
 
 
 
 
 
 
 
36
 
37
  searxng_url = os.environ.get('SEARXNG_URL')
38
  response = requests.get(searxng_url + '?q=' + safe_string + '&format=json')
@@ -99,87 +107,116 @@ def search_web_ref(query:str, debug=False):
99
  raise ex
100
 
101
 
102
- def gen_prompt(question, content_list, lang="zh-CN", context_length_limit=11000, debug=False):
 
103
  limit_len = (context_length_limit - 2000)
104
  if len(question) > limit_len:
105
  question = question[0:limit_len]
106
 
107
- ref_content = [item.get("content") for item in content_list]
108
 
109
- answer_language = 'Simplified Chinese'
110
  if lang == "zh-CN":
111
- answer_language = 'Simplified Chinese'
112
  if lang == "zh-TW":
113
- answer_language = 'Traditional Chinese'
114
  if lang == "en-US":
115
- answer_language = 'Professional English'
 
116
 
117
  if len(ref_content) > 0:
118
- prompts = '''
119
- You are a large language AI assistant developed by nash_su. You are given a user question, and please write a clean, concise and accurate answer to the question. You will be given a set of related contexts to the question, each starting with a reference number like [[citation:x]], where x is a number. Please use the context and cite the context at the end of each sentence if applicable.
120
- Your answer must be correct, accurate and written by an expert using an unbiased and professional tone. Please limit to 1024 tokens. Do not give any information that is not related to the question, and do not repeat. Say "information is missing on" followed by the related topic, if the given context does not provide sufficient information.
121
-
122
- Please cite the contexts with the reference numbers, in the format [citation:x]. If a sentence comes from multiple contexts, please list all applicable citations, like [citation:3][citation:5]. Other than code and specific names and citations, your answer must be written in the same language as the question.
123
- Here are the set of contexts:
124
- ''' + "\n\n" + "```"
125
- ref_index = 1
126
-
127
- for ref_text in ref_content:
128
- prompts = prompts + "\n\n" + " [citation:{}] ".format(str(ref_index)) + ref_text
129
- ref_index += 1
130
-
131
- if len(prompts) >= limit_len:
132
- prompts = prompts[0:limit_len]
133
- prompts = prompts + '''
134
- ```
135
- Above is the reference contexts. Remember, don't repeat the context word for word. Answer in ''' + answer_language + '''. If the response is lengthy, structure it in paragraphs and summarize where possible. Cite the context using the format [citation:x] where x is the reference number. If a sentence originates from multiple contexts, list all relevant citation numbers, like [citation:3][citation:5]. Don't cluster the citations at the end but include them in the answer where they correspond.
136
- Remember, don't blindly repeat the contexts verbatim. And here is the user question:
137
- ''' + question
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  else:
139
  prompts = question
140
 
141
  if debug:
142
  print(prompts)
143
- print("总长度:" + str(len(prompts)))
144
  return prompts
145
 
146
 
147
- def defaultchat(message, model: str, stream=True, debug=False):
148
  openai.base_url = os.environ.get('OPENAI_BASE_URL')
149
  openai.api_key = os.environ.get('OPENAI_API_KEY')
150
  total_content = ""
 
151
  for chunk in openai.chat.completions.create(
152
  model=model,
153
  messages=message,
154
  stream=True,
155
- max_tokens=3072, temperature=0.2
156
  ):
157
  stream_resp = chunk.dict()
 
158
  token = stream_resp["choices"][0]["delta"].get("content", "")
 
159
  if token:
160
  total_content += token
161
  yield token
162
 
163
  def ask_gpt(message, model_id, debug=False):
 
164
  total_token = ""
165
  for token in defaultchat(message, model_id):
166
  if token:
167
  total_token += token
168
  yield token
169
 
170
- def summary_gpt(message, model: str, debug=False):
 
171
  msgs = []
172
- msgs.append({"role": "system", "content": '作为一位专业的问题审核专家,你的任务是确保每一个提问都是清晰、具体并且没有模糊歧义的,不需要在根据额外的内容就可以理解你的提问。在审阅提问时,请遵循以下规则进行优化:替换模糊的代名词,确保所有的人称和名词都有明确的指代,不允许出现"你我他这那"等这种类似的代名词;如果提问中包含泛指的名词,请根据上下文明确的定语,补充具体的细节以提供完整的信息;最后,只允许输出经过你精确优化的问题,不要有任何多余的文字。'})
173
- msgs.append({"role": "user", "content": str(message)})
174
  json_data = {
175
- "model": model,
176
- "messages": msgs,
177
- "temperature": 0.8,
178
- "max_tokens": 2560,
179
- "top_p": 1,
180
- "frequency_penalty": 0,
181
- "presence_penalty": 0,
182
- "stop": None
183
  }
184
  apiurl = os.environ.get('OPENAI_BASE_URL')
185
  pooltoken = os.environ.get('OPENAI_API_KEY')
@@ -187,11 +224,12 @@ def summary_gpt(message, model: str, debug=False):
187
  'Content-Type': 'application/json',
188
  'Authorization': 'Bearer {}'.format(pooltoken),
189
  }
190
- response = requests.post(apiurl + 'chat/completions', headers=headers, json=json_data)
191
  res = json.loads(response.text)['choices'][0]['message']['content']
 
192
  return res
193
 
194
- def chat(prompt, model: str, stream=True, debug=False):
195
  openai.base_url = os.environ.get('OPENAI_BASE_URL')
196
  openai.api_key = os.environ.get('OPENAI_API_KEY')
197
  total_content = ""
@@ -202,21 +240,26 @@ def chat(prompt, model: str, stream=True, debug=False):
202
  "content": prompt
203
  }],
204
  stream=True,
205
- max_tokens=3072, temperature=0.2
206
  ):
207
  stream_resp = chunk.dict()
208
  token = stream_resp["choices"][0]["delta"].get("content", "")
209
  if token:
 
210
  total_content += token
211
  yield token
212
  if debug:
213
  print(total_content)
 
214
 
215
- def ask_internet(query: str, model: str, debug=False):
 
 
 
216
  content_list = search_web_ref(query,debug=debug)
217
  if debug:
218
  print(content_list)
219
- prompt = gen_prompt(query, content_list, context_length_limit=6000, debug=debug)
220
  total_token = ""
221
 
222
  # 收集所有回答内容
@@ -225,9 +268,11 @@ def ask_internet(query: str, model: str, debug=False):
225
  if token:
226
  total_token += token
227
  response_content += token
228
-
 
229
  # 处理引用链接
230
  if content_list:
 
231
  citation_map = {f"citation:{i+1}": content_list[i].get('url') for i in range(len(content_list))}
232
 
233
  # 替换所有引用为链接
@@ -236,6 +281,8 @@ def ask_internet(query: str, model: str, debug=False):
236
  if url:
237
  modified_content = modified_content.replace(citation, f"[{citation}]({url})")
238
 
 
 
239
  yield modified_content
240
 
241
  yield "\n\n"
@@ -246,7 +293,8 @@ def ask_internet(query: str, model: str, debug=False):
246
  yield "参考资料:\n"
247
  count = 1
248
  for url_content in content_list:
249
- url = url_content.get('url')
250
- yield "*[{}. {}]({})*".format(str(count), url, url)
251
- yield "\n"
252
- count += 1
 
 
8
  from trafilatura import bare_extraction
9
  from concurrent.futures import ThreadPoolExecutor
10
  import concurrent
11
+ import requests
12
  import openai
13
  import time
14
  from datetime import datetime
 
19
 
20
 
21
def extract_url_content(url):
    """Fetch *url* and extract its main text content.

    Uses trafilatura's own fetcher (sets appropriate headers and handles
    download details) rather than a raw HTTP GET.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        dict with keys "url" (the input URL) and "content" (the extracted
        main text, or None when the download or extraction failed).
    """
    # fetch_url returns None on any network/HTTP failure; guard so that
    # trafilatura.extract is never handed a None document (which would raise).
    downloaded = trafilatura.fetch_url(url)
    content = trafilatura.extract(downloaded) if downloaded else None
    return {"url": url, "content": content}
30
  def search_web_ref(query:str, debug=False):
31
 
 
33
 
34
  try:
35
 
36
+ #safe_string = urllib.parse.quote_plus(":all !general " + query)
37
+ # 简单的语言检测:如果查询中包含中文字符,则认为是中文
38
+ if any('\u4e00' <= char <= '\u9fff' for char in query):
39
+ # 中文查询,添加前缀
40
+ safe_string = urllib.parse.quote_plus(":all !general " + query)
41
+ else:
42
+ # 英文或其他语言查询,直接使用查询
43
+ safe_string = urllib.parse.quote_plus(query)
44
 
45
  searxng_url = os.environ.get('SEARXNG_URL')
46
  response = requests.get(searxng_url + '?q=' + safe_string + '&format=json')
 
107
  raise ex
108
 
109
 
110
def gen_prompt(question, content_list, lang="zh-CN", context_length_limit=11000, debug=False):
    """Build the final LLM prompt from the user question and search results.

    Args:
        question: The user's question; truncated to the context budget.
        content_list: List of dicts with a "content" key (extracted page text).
        lang: UI language code; selects the language the answer must be in.
        context_length_limit: Rough character budget for the whole prompt;
            2000 characters are reserved for instructions/answer headroom.
        debug: When True, print the assembled prompt and its length.

    Returns:
        The assembled prompt string, or the (possibly truncated) question
        itself when there is no reference content.
    """
    limit_len = (context_length_limit - 2000)
    if len(question) > limit_len:
        question = question[0:limit_len]

    ref_content = [item.get("content") for item in content_list]

    # Map the UI language code to the answer language (default: Simplified
    # Chinese, matching the original if-chain's fallthrough behavior).
    answer_language = {
        "zh-CN": ' Simplified Chinese ',
        "zh-TW": ' Traditional Chinese ',
        "en-US": ' English ',
    }.get(lang, ' Simplified Chinese ')

    if ref_content:
        # NOTE(review): an alternative Chinese-language preamble previously
        # lived here behind an `if False:` guard; that dead branch is removed.
        prompts = '''
You are a large language AI assistant develop by nash_su. You are given a user question, and please write clean, concise and accurate answer to the question. You will be given a set of related contexts to the question, each starting with a reference number like [[citation:x]], where x is a number. Please use the context and cite the context at the end of each sentence if applicable.
Your answer must be correct, accurate and written by an expert using an unbiased and professional tone. Please limit to 1024 tokens. Do not give any information that is not related to the question, and do not repeat. Say "information is missing on" followed by the related topic, if the given context do not provide sufficient information.

Please cite the contexts with the reference numbers, in the format [citation:x]. If a sentence comes from multiple contexts, please list all applicable citations, like [citation:3][citation:5]. Other than code and specific names and citations, your answer must be written in the same language as the question.
Here are the set of contexts:
''' + "\n\n" + "```"
        # Number references from 1 so the [citation:x] markers line up with
        # the citation_map built from content_list order in ask_internet.
        for ref_index, ref_text in enumerate(ref_content, start=1):
            # content may be None when extraction failed; coerce to "" so the
            # concatenation below cannot raise and numbering stays aligned.
            prompts = prompts + "\n\n" + " [citation:{}] ".format(str(ref_index)) + (ref_text or "")
        # Hard-truncate over-long context BEFORE appending the closing
        # instructions, so the instructions are never cut off.
        if len(prompts) >= limit_len:
            prompts = prompts[0:limit_len]
        prompts = prompts + '''
```
Above is the reference contexts. Remember, don't repeat the context word for word. Answer in ''' + answer_language + '''. If the response is lengthy, structure it in paragraphs and summarize where possible. Cite the context using the format [citation:x] where x is the reference number. If a sentence originates from multiple contexts, list all relevant citation numbers, like [citation:3][citation:5]. Don't cluster the citations at the end but include them in the answer where they correspond.
Remember, don't blindly repeat the contexts verbatim. And here is the user question:
''' + question
    else:
        prompts = question

    if debug:
        print(prompts)
        print("总长度:" + str(len(prompts)))
    return prompts
177
 
178
 
179
def defaultchat(message, model: str, stream=True, debug=False):
    """Stream chat-completion tokens for *message* from the OpenAI endpoint.

    Reads the endpoint URL and credentials from the OPENAI_BASE_URL and
    OPENAI_API_KEY environment variables, requests a streamed completion,
    and yields each non-empty content delta as it arrives.

    NOTE(review): the `stream` and `debug` parameters are accepted but
    unused — the request is always streamed.
    """
    openai.base_url = os.environ.get('OPENAI_BASE_URL')
    openai.api_key = os.environ.get('OPENAI_API_KEY')

    response = openai.chat.completions.create(
        model=model,
        messages=message,
        stream=True,
        max_tokens=3072, temperature=0.2
    )

    collected = ""
    for part in response:
        payload = part.dict()
        delta_text = payload["choices"][0]["delta"].get("content", "")
        if not delta_text:
            continue
        collected += delta_text
        yield delta_text
197
 
198
def ask_gpt(message, model_id, debug=False):
    """Thin generator wrapper around defaultchat.

    Forwards *message* and *model_id* to defaultchat and re-yields every
    non-empty token. The accumulated answer is kept in a local only (it is
    never returned); `debug` is accepted for interface parity but unused.
    """
    answer_so_far = ""
    for piece in defaultchat(message, model_id):
        if not piece:
            continue
        answer_so_far += piece
        yield piece
205
 
206
+ def summary_gpt(message, model:str, debug=False):
207
+ #message = '\n'.join([msg.content for msg in message])
208
  msgs = []
209
+ msgs.append({"role": "system", "content": '作为一位专业的问题审核专家,你的任务是确保每一个提问都是清晰、具体并且没有模糊歧义的,不需要在根据额外的内容就可以理解你的提问。在审阅提问时,请遵循以下规则进行优化:替换模糊的代名词,确保所有的人称和名词都有明确的指代,不允许出现"你我他这那"等这种类似的代名词;如果提问中包含泛指的名词,请根据上下文明确的定语,补充具体的细节以提供完整的信息;最后,只允许输出经过你精确优化的问题,不要有任何多余的文字。举例说明,1-当提问者问:他在做什么?,你根据上下文你可以得知他是"小明",那么你优化问题后输出"小明在干什么?"2-当提问者问:他们乐队都有谁?,你根据上下文可以得知乐队是"小强乐队",那么你优化问题后输出"小强乐队都有谁?"'})
210
+ msgs.append({"role": "user", "content":str(message)})
211
  json_data = {
212
+ "model":model,
213
+ "messages":msgs,
214
+ "temperature":0.8,
215
+ "max_tokens":2560,
216
+ "top_p":1,
217
+ "frequency_penalty":0,
218
+ "presence_penalty":0,
219
+ "stop":None
220
  }
221
  apiurl = os.environ.get('OPENAI_BASE_URL')
222
  pooltoken = os.environ.get('OPENAI_API_KEY')
 
224
  'Content-Type': 'application/json',
225
  'Authorization': 'Bearer {}'.format(pooltoken),
226
  }
227
+ response = requests.post( apiurl + 'chat/completions', headers=headers, json=json_data )
228
  res = json.loads(response.text)['choices'][0]['message']['content']
229
+ #print(res)
230
  return res
231
 
232
+ def chat(prompt, model:str, stream=True, debug=False):
233
  openai.base_url = os.environ.get('OPENAI_BASE_URL')
234
  openai.api_key = os.environ.get('OPENAI_API_KEY')
235
  total_content = ""
 
240
  "content": prompt
241
  }],
242
  stream=True,
243
+ max_tokens=3072,temperature=0.2
244
  ):
245
  stream_resp = chunk.dict()
246
  token = stream_resp["choices"][0]["delta"].get("content", "")
247
  if token:
248
+
249
  total_content += token
250
  yield token
251
  if debug:
252
  print(total_content)
253
+
254
 
255
+
256
+
257
+ def ask_internet(query:str, model:str, debug=False):
258
+
259
  content_list = search_web_ref(query,debug=debug)
260
  if debug:
261
  print(content_list)
262
+ prompt = gen_prompt(query,content_list,context_length_limit=6000,debug=debug)
263
  total_token = ""
264
 
265
  # 收集所有回答内容
 
268
  if token:
269
  total_token += token
270
  response_content += token
271
+ #yield token #原始回答
272
+
273
  # 处理引用链接
274
  if content_list:
275
+ # 创建引用到URL的映射
276
  citation_map = {f"citation:{i+1}": content_list[i].get('url') for i in range(len(content_list))}
277
 
278
  # 替换所有引用为链接
 
281
  if url:
282
  modified_content = modified_content.replace(citation, f"[{citation}]({url})")
283
 
284
+ # 输出修改后的内容
285
+ #yield "\n\n修改引用链接后的内容:\n"
286
  yield modified_content
287
 
288
  yield "\n\n"
 
293
  yield "参考资料:\n"
294
  count = 1
295
  for url_content in content_list:
296
+ url = url_content.get('url')
297
+ yield "*[{}. {}]({})*".format(str(count),url,url )
298
+ yield "\n"
299
+ count += 1
300
+