deeme committed on
Commit
0bf3413
·
verified ·
1 Parent(s): d97614e

Upload free_ask_internet.py

Browse files
Files changed (1) hide show
  1. free_ask_internet.py +73 -118
free_ask_internet.py CHANGED
@@ -8,7 +8,6 @@ import trafilatura
8
  from trafilatura import bare_extraction
9
  from concurrent.futures import ThreadPoolExecutor
10
  import concurrent
11
- import requests
12
  import openai
13
  import time
14
  from datetime import datetime
@@ -19,24 +18,27 @@ import urllib.parse
19
 
20
 
21
  def extract_url_content(url):
22
- downloaded = trafilatura.fetch_url(url)
23
- content = trafilatura.extract(downloaded)
 
 
24
 
25
- return {"url":url, "content":content}
26
-
27
 
28
-
29
 
30
- def search_web_ref(query:str, debug=False):
31
-
32
  content_list = []
33
 
34
  try:
35
-
36
  safe_string = urllib.parse.quote_plus(":all !general " + query)
37
 
38
  searxng_url = os.environ.get('SEARXNG_URL')
39
- response = requests.get(searxng_url + '?q=' + safe_string + '&format=json')
 
 
 
 
 
40
  response.raise_for_status()
41
  search_results = response.json()
42
 
@@ -44,7 +46,6 @@ def search_web_ref(query:str, debug=False):
44
  print("JSON Response:")
45
  pprint(search_results)
46
  pedding_urls = []
47
-
48
  conv_links = []
49
 
50
  if search_results.get('results'):
@@ -57,15 +58,15 @@ def search_web_ref(query:str, debug=False):
57
  if url:
58
  url_parsed = urlparse(url)
59
  domain = url_parsed.netloc
60
- icon_url = url_parsed.scheme + '://' + url_parsed.netloc + '/favicon.ico'
61
  site_name = tldextract.extract(url).domain
62
 
63
  conv_links.append({
64
- 'site_name':site_name,
65
- 'icon_url':icon_url,
66
- 'title':name,
67
- 'url':url,
68
- 'snippet':snippet
69
  })
70
 
71
  results = []
@@ -73,143 +74,113 @@ def search_web_ref(query:str, debug=False):
73
 
74
  executor = ThreadPoolExecutor(max_workers=10)
75
  for url in pedding_urls:
76
- futures.append(executor.submit(extract_url_content,url))
77
  try:
78
  for future in futures:
79
  res = future.result(timeout=5)
80
  results.append(res)
81
  except concurrent.futures.TimeoutError:
82
  print("任务执行超时")
83
- executor.shutdown(wait=False,cancel_futures=True)
84
 
85
  for content in results:
86
  if content and content.get('content'):
87
-
88
  item_dict = {
89
- "url":content.get('url'),
90
  "content": content.get('content'),
91
- "length":len(content.get('content'))
92
  }
93
  content_list.append(item_dict)
94
  if debug:
95
  print("URL: {}".format(url))
96
  print("=================")
97
 
98
- return content_list
99
  except Exception as ex:
100
  raise ex
101
 
102
 
103
- def gen_prompt(question,content_list, lang="zh-CN", context_length_limit=11000,debug=False):
104
-
105
  limit_len = (context_length_limit - 2000)
106
  if len(question) > limit_len:
107
  question = question[0:limit_len]
108
 
109
- ref_content = [ item.get("content") for item in content_list]
110
 
111
- answer_language = ' Simplified Chinese '
112
  if lang == "zh-CN":
113
- answer_language = ' Simplified Chinese '
114
  if lang == "zh-TW":
115
- answer_language = ' Traditional Chinese '
116
  if lang == "en-US":
117
- answer_language = ' English '
118
-
119
 
120
  if len(ref_content) > 0:
121
-
122
- if False:
123
- prompts = '''
124
- 您是一位由 nash_su 开发的大型语言人工智能助手。您将被提供一个用户问题,并需要撰写一个清晰、简���且准确的答案。提供了一组与问题相关的上下文,每个都以 [[citation:x]] 这样的编号开头,x 代表一个数字。请在适当的情况下在句子末尾引用上下文。答案必须正确、精确,并以专家的中立和职业语气撰写。请将答案限制在 2000 个标记内。不要提供与问题无关的信息,也不要重复。如果给出的上下文信息不足,请在相关主题后写上“信息缺失:”。请按照引用编号 [citation:x] 的格式在答案中对应部分引用上下文。如果一句话源自多个上下文,请列出所有相关的引用编号,例如 [citation:3][citation:5],不要将引用集中在最后返回,而是在答案对应部分列出。除非是代码、特定的名称或引用编号,答案的语言应与问题相同。以下是上下文的内容集:
125
- ''' + "\n\n" + "```"
126
- ref_index = 1
127
-
128
- for ref_text in ref_content:
129
-
130
- prompts = prompts + "\n\n" + " [citation:{}] ".format(str(ref_index)) + ref_text
131
- ref_index += 1
132
-
133
- if len(prompts) >= limit_len:
134
- prompts = prompts[0:limit_len]
135
- prompts = prompts + '''
136
- ```
137
- 记住,不要一字不差的重复上下文内容。回答必须使用简体中文,如果回答很长,请尽量结构化、分段落总结。请按照引用编号 [citation:x] 的格式在答案中对应部分引用上下文。如果一句话源自多个上下文,请列出所有相关的引用编号,例如 [citation:3][citation:5],不要将引用集中在最后返回,而是在答案对应部分列出。下面是用户问题:
138
- ''' + question
139
- else:
140
- prompts = '''
141
- You are a large language AI assistant develop by nash_su. You are given a user question, and please write clean, concise and accurate answer to the question. You will be given a set of related contexts to the question, each starting with a reference number like [[citation:x]], where x is a number. Please use the context and cite the context at the end of each sentence if applicable.
142
- Your answer must be correct, accurate and written by an expert using an unbiased and professional tone. Please limit to 1024 tokens. Do not give any information that is not related to the question, and do not repeat. Say "information is missing on" followed by the related topic, if the given context do not provide sufficient information.
143
-
144
- Please cite the contexts with the reference numbers, in the format [citation:x]. If a sentence comes from multiple contexts, please list all applicable citations, like [citation:3][citation:5]. Other than code and specific names and citations, your answer must be written in the same language as the question.
145
- Here are the set of contexts:
146
- ''' + "\n\n" + "```"
147
- ref_index = 1
148
-
149
- for ref_text in ref_content:
150
-
151
- prompts = prompts + "\n\n" + " [citation:{}] ".format(str(ref_index)) + ref_text
152
- ref_index += 1
153
-
154
- if len(prompts) >= limit_len:
155
- prompts = prompts[0:limit_len]
156
- prompts = prompts + '''
157
- ```
158
- Above is the reference contexts. Remember, don't repeat the context word for word. Answer in ''' + answer_language + '''. If the response is lengthy, structure it in paragraphs and summarize where possible. Cite the context using the format [citation:x] where x is the reference number. If a sentence originates from multiple contexts, list all relevant citation numbers, like [citation:3][citation:5]. Don't cluster the citations at the end but include them in the answer where they correspond.
159
- Remember, don't blindly repeat the contexts verbatim. And here is the user question:
160
- ''' + question
161
-
162
-
163
  else:
164
  prompts = question
165
 
166
  if debug:
167
  print(prompts)
168
- print("总长度:"+ str(len(prompts)))
169
  return prompts
170
 
171
 
172
- def defaultchat(message, model:str, stream=True, debug=False):
173
  openai.base_url = os.environ.get('OPENAI_BASE_URL')
174
  openai.api_key = os.environ.get('OPENAI_API_KEY')
175
  total_content = ""
176
- #print(message)
177
  for chunk in openai.chat.completions.create(
178
  model=model,
179
  messages=message,
180
  stream=True,
181
- max_tokens=3072,temperature=0.2
182
  ):
183
  stream_resp = chunk.dict()
184
- #print(stream_resp)
185
  token = stream_resp["choices"][0]["delta"].get("content", "")
186
- #print(token)
187
  if token:
188
  total_content += token
189
  yield token
190
 
191
  def ask_gpt(message, model_id, debug=False):
192
- #print(message)
193
  total_token = ""
194
  for token in defaultchat(message, model_id):
195
  if token:
196
  total_token += token
197
  yield token
198
 
199
- def summary_gpt(message, model:str, debug=False):
200
- #message = '\n'.join([msg.content for msg in message])
201
  msgs = []
202
- msgs.append({"role": "system", "content": '作为一位专业的问题审核专家,你的任务是确保每一个提问都是清晰、具体并且没有模糊歧义的,不需要在根据额外的内容就可以理解你的提问。在审阅提问时,请遵循以下规则进行优化:替换模糊的代名词,确保所有的人称和名词都有明确的指代,不允许出现"你我他这那"等这种类似的代名词;如果提问中包含泛指的名词,请根据上下文明确的定语,补充具体的细节以提供完整的信息;最后,只允许输出经过你精确优化的问题,不要有任何多余的文字。举例说明,1-当提问者问:他在做什么?,你根据上下文你可以得知他是"小明",那么你优化问题后输出"小明在干什么?"2-当提问者问:他们乐队都有谁?,你根据上下文可以得知乐队是"小强乐队",那么你优化问题后输出"小强乐队都有谁?"'})
203
- msgs.append({"role": "user", "content":str(message)})
204
  json_data = {
205
- "model":model,
206
- "messages":msgs,
207
- "temperature":0.8,
208
- "max_tokens":2560,
209
- "top_p":1,
210
- "frequency_penalty":0,
211
- "presence_penalty":0,
212
- "stop":None
213
  }
214
  apiurl = os.environ.get('OPENAI_BASE_URL')
215
  pooltoken = os.environ.get('OPENAI_API_KEY')
@@ -217,12 +188,11 @@ def summary_gpt(message, model:str, debug=False):
217
  'Content-Type': 'application/json',
218
  'Authorization': 'Bearer {}'.format(pooltoken),
219
  }
220
- response = requests.post( apiurl + 'chat/completions', headers=headers, json=json_data )
221
  res = json.loads(response.text)['choices'][0]['message']['content']
222
- #print(res)
223
  return res
224
 
225
- def chat(prompt, model:str, stream=True, debug=False):
226
  openai.base_url = os.environ.get('OPENAI_BASE_URL')
227
  openai.api_key = os.environ.get('OPENAI_API_KEY')
228
  total_content = ""
@@ -233,26 +203,21 @@ def chat(prompt, model:str, stream=True, debug=False):
233
  "content": prompt
234
  }],
235
  stream=True,
236
- max_tokens=3072,temperature=0.2
237
  ):
238
  stream_resp = chunk.dict()
239
  token = stream_resp["choices"][0]["delta"].get("content", "")
240
  if token:
241
-
242
  total_content += token
243
  yield token
244
  if debug:
245
  print(total_content)
246
-
247
 
248
-
249
-
250
- def ask_internet(query:str, model:str, debug=False):
251
-
252
- content_list = search_web_ref(query,debug=debug)
253
  if debug:
254
  print(content_list)
255
- prompt = gen_prompt(query,content_list,context_length_limit=6000,debug=debug)
256
  total_token = ""
257
 
258
  # 收集所有回答内容
@@ -261,16 +226,9 @@ def ask_internet(query:str, model:str, debug=False):
261
  if token:
262
  total_token += token
263
  response_content += token
264
- #yield token #原始回答
265
 
266
  # 处理引用链接
267
  if content_list:
268
- # 使用更灵活的引用识别方式
269
- import re
270
-
271
- # 处理引用链接
272
- if content_list:
273
- # 创建引用到URL的映射
274
  citation_map = {f"citation:{i+1}": content_list[i].get('url') for i in range(len(content_list))}
275
 
276
  # 替换所有引用为链接
@@ -279,8 +237,6 @@ def ask_internet(query:str, model:str, debug=False):
279
  if url:
280
  modified_content = modified_content.replace(citation, f"[{citation}]({url})")
281
 
282
- # 输出修改后的内容
283
- #yield "\n\n修改引用链接后的内容:\n"
284
  yield modified_content
285
 
286
  yield "\n\n"
@@ -291,8 +247,7 @@ def ask_internet(query:str, model:str, debug=False):
291
  yield "参考资料:\n"
292
  count = 1
293
  for url_content in content_list:
294
- url = url_content.get('url')
295
- yield "*[{}. {}]({})*".format(str(count),url,url )
296
- yield "\n"
297
- count += 1
298
-
 
8
  from trafilatura import bare_extraction
9
  from concurrent.futures import ThreadPoolExecutor
10
  import concurrent
 
11
  import openai
12
  import time
13
  from datetime import datetime
 
18
 
19
 
20
  def extract_url_content(url):
21
+ downloaded = requests.get(url, headers={
22
+ 'Accept-Language': 'en-US,en;q=0.9' # 强制英文内容
23
+ }).content
24
+ content = trafilatura.extract(downloaded)
25
 
26
+ return {"url": url, "content": content}
 
27
 
 
28
 
29
+ def search_web_ref(query: str, lang="zh-CN", debug=False):
 
30
  content_list = []
31
 
32
  try:
 
33
  safe_string = urllib.parse.quote_plus(":all !general " + query)
34
 
35
  searxng_url = os.environ.get('SEARXNG_URL')
36
+ params = {
37
+ "q": safe_string,
38
+ "language": "zh-CN" if lang.startswith("zh") else "en-US",
39
+ "time_range": "day" # 限制当天结果
40
+ }
41
+ response = requests.get(searxng_url, params=params)
42
  response.raise_for_status()
43
  search_results = response.json()
44
 
 
46
  print("JSON Response:")
47
  pprint(search_results)
48
  pedding_urls = []
 
49
  conv_links = []
50
 
51
  if search_results.get('results'):
 
58
  if url:
59
  url_parsed = urlparse(url)
60
  domain = url_parsed.netloc
61
+ icon_url = url_parsed.scheme + '://' + url_parsed.netloc + '/favicon.ico'
62
  site_name = tldextract.extract(url).domain
63
 
64
  conv_links.append({
65
+ 'site_name': site_name,
66
+ 'icon_url': icon_url,
67
+ 'title': name,
68
+ 'url': url,
69
+ 'snippet': snippet
70
  })
71
 
72
  results = []
 
74
 
75
  executor = ThreadPoolExecutor(max_workers=10)
76
  for url in pedding_urls:
77
+ futures.append(executor.submit(extract_url_content, url))
78
  try:
79
  for future in futures:
80
  res = future.result(timeout=5)
81
  results.append(res)
82
  except concurrent.futures.TimeoutError:
83
  print("任务执行超时")
84
+ executor.shutdown(wait=False, cancel_futures=True)
85
 
86
  for content in results:
87
  if content and content.get('content'):
 
88
  item_dict = {
89
+ "url": content.get('url'),
90
  "content": content.get('content'),
91
+ "length": len(content.get('content'))
92
  }
93
  content_list.append(item_dict)
94
  if debug:
95
  print("URL: {}".format(url))
96
  print("=================")
97
 
98
+ return content_list
99
  except Exception as ex:
100
  raise ex
101
 
102
 
103
+ def gen_prompt(question, content_list, lang="zh-CN", context_length_limit=11000, debug=False):
 
104
  limit_len = (context_length_limit - 2000)
105
  if len(question) > limit_len:
106
  question = question[0:limit_len]
107
 
108
+ ref_content = [item.get("content") for item in content_list]
109
 
110
+ answer_language = 'Simplified Chinese'
111
  if lang == "zh-CN":
112
+ answer_language = 'Simplified Chinese'
113
  if lang == "zh-TW":
114
+ answer_language = 'Traditional Chinese'
115
  if lang == "en-US":
116
+ answer_language = 'Professional English'
 
117
 
118
  if len(ref_content) > 0:
119
+ prompts = '''
120
+ You are a large language AI assistant developed by nash_su. You are given a user question, and please write a clean, concise and accurate answer to the question. You will be given a set of related contexts to the question, each starting with a reference number like [[citation:x]], where x is a number. Please use the context and cite the context at the end of each sentence if applicable.
121
+ Your answer must be correct, accurate and written by an expert using an unbiased and professional tone. Please limit to 1024 tokens. Do not give any information that is not related to the question, and do not repeat. Say "information is missing on" followed by the related topic, if the given context does not provide sufficient information.
122
+
123
+ Please cite the contexts with the reference numbers, in the format [citation:x]. If a sentence comes from multiple contexts, please list all applicable citations, like [citation:3][citation:5]. Other than code and specific names and citations, your answer must be written in the same language as the question.
124
+ Here are the set of contexts:
125
+ ''' + "\n\n" + "```"
126
+ ref_index = 1
127
+
128
+ for ref_text in ref_content:
129
+ prompts = prompts + "\n\n" + " [citation:{}] ".format(str(ref_index)) + ref_text
130
+ ref_index += 1
131
+
132
+ if len(prompts) >= limit_len:
133
+ prompts = prompts[0:limit_len]
134
+ prompts = prompts + '''
135
+ ```
136
+ Above is the reference contexts. Remember, don't repeat the context word for word. Answer in ''' + answer_language + '''. If the response is lengthy, structure it in paragraphs and summarize where possible. Cite the context using the format [citation:x] where x is the reference number. If a sentence originates from multiple contexts, list all relevant citation numbers, like [citation:3][citation:5]. Don't cluster the citations at the end but include them in the answer where they correspond.
137
+ Remember, don't blindly repeat the contexts verbatim. And here is the user question:
138
+ ''' + question
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  else:
140
  prompts = question
141
 
142
  if debug:
143
  print(prompts)
144
+ print("总长度:" + str(len(prompts)))
145
  return prompts
146
 
147
 
148
+ def defaultchat(message, model: str, stream=True, debug=False):
149
  openai.base_url = os.environ.get('OPENAI_BASE_URL')
150
  openai.api_key = os.environ.get('OPENAI_API_KEY')
151
  total_content = ""
 
152
  for chunk in openai.chat.completions.create(
153
  model=model,
154
  messages=message,
155
  stream=True,
156
+ max_tokens=3072, temperature=0.2
157
  ):
158
  stream_resp = chunk.dict()
 
159
  token = stream_resp["choices"][0]["delta"].get("content", "")
 
160
  if token:
161
  total_content += token
162
  yield token
163
 
164
  def ask_gpt(message, model_id, debug=False):
 
165
  total_token = ""
166
  for token in defaultchat(message, model_id):
167
  if token:
168
  total_token += token
169
  yield token
170
 
171
+ def summary_gpt(message, model: str, debug=False):
 
172
  msgs = []
173
+ msgs.append({"role": "system", "content": '作为一位专业的问题审核专家,你的任务是确保每一个提问都是清晰、具体并且没有模糊歧义的,不需要在根据额外的内容就可以理解你的提问。在审阅提问时,请遵循以下规则进行优化:替换模糊的代名词,确保所有的人称和名词都有明确的指代,不允许出现"你我他这那"等这种类似的代名词;如果提问中包含泛指的名词,请根据上下文明确的定语,补充具体的细节以提供完整的信息;最后,只允许输出经过你精确优化的问题,不要有任何多余的文字。'})
174
+ msgs.append({"role": "user", "content": str(message)})
175
  json_data = {
176
+ "model": model,
177
+ "messages": msgs,
178
+ "temperature": 0.8,
179
+ "max_tokens": 2560,
180
+ "top_p": 1,
181
+ "frequency_penalty": 0,
182
+ "presence_penalty": 0,
183
+ "stop": None
184
  }
185
  apiurl = os.environ.get('OPENAI_BASE_URL')
186
  pooltoken = os.environ.get('OPENAI_API_KEY')
 
188
  'Content-Type': 'application/json',
189
  'Authorization': 'Bearer {}'.format(pooltoken),
190
  }
191
+ response = requests.post(apiurl + 'chat/completions', headers=headers, json=json_data)
192
  res = json.loads(response.text)['choices'][0]['message']['content']
 
193
  return res
194
 
195
+ def chat(prompt, model: str, stream=True, debug=False):
196
  openai.base_url = os.environ.get('OPENAI_BASE_URL')
197
  openai.api_key = os.environ.get('OPENAI_API_KEY')
198
  total_content = ""
 
203
  "content": prompt
204
  }],
205
  stream=True,
206
+ max_tokens=3072, temperature=0.2
207
  ):
208
  stream_resp = chunk.dict()
209
  token = stream_resp["choices"][0]["delta"].get("content", "")
210
  if token:
 
211
  total_content += token
212
  yield token
213
  if debug:
214
  print(total_content)
 
215
 
216
+ def ask_internet(query: str, model: str, debug=False):
217
+ content_list = search_web_ref(query, lang="en-US", debug=debug) # 确保使用英文
 
 
 
218
  if debug:
219
  print(content_list)
220
+ prompt = gen_prompt(query, content_list, context_length_limit=6000, debug=debug)
221
  total_token = ""
222
 
223
  # 收集所有回答内容
 
226
  if token:
227
  total_token += token
228
  response_content += token
 
229
 
230
  # 处理引用链接
231
  if content_list:
 
 
 
 
 
 
232
  citation_map = {f"citation:{i+1}": content_list[i].get('url') for i in range(len(content_list))}
233
 
234
  # 替换所有引用为链接
 
237
  if url:
238
  modified_content = modified_content.replace(citation, f"[{citation}]({url})")
239
 
 
 
240
  yield modified_content
241
 
242
  yield "\n\n"
 
247
  yield "参考资料:\n"
248
  count = 1
249
  for url_content in content_list:
250
+ url = url_content.get('url')
251
+ yield "*[{}. {}]({})*".format(str(count), url, url)
252
+ yield "\n"
253
+ count += 1