Upload free_ask_internet.py
Browse files

free_ask_internet.py  CHANGED  (+73 -118)
@@ -8,7 +8,6 @@ import trafilatura
 from trafilatura import bare_extraction
 from concurrent.futures import ThreadPoolExecutor
 import concurrent
-import requests
 import openai
 import time
 from datetime import datetime
@@ -19,24 +18,27 @@ import urllib.parse
 
 
 def extract_url_content(url):
-    downloaded =
-
+    downloaded = requests.get(url, headers={
+        'Accept-Language': 'en-US,en;q=0.9'  # force English content
+    }).content
+    content = trafilatura.extract(downloaded)
 
-    return {"url":url, "content":content}
-
+    return {"url": url, "content": content}
 
-
 
-def search_web_ref(query:str, debug=False):
-
+def search_web_ref(query: str, lang="zh-CN", debug=False):
     content_list = []
 
     try:
-
         safe_string = urllib.parse.quote_plus(":all !general " + query)
 
         searxng_url = os.environ.get('SEARXNG_URL')
-
+        params = {
+            "q": safe_string,
+            "language": "zh-CN" if lang.startswith("zh") else "en-US",
+            "time_range": "day"  # limit results to the current day
+        }
+        response = requests.get(searxng_url, params=params)
         response.raise_for_status()
         search_results = response.json()
 
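The new params dict assumes the SearXNG endpoint answers with JSON. On a stock SearXNG install that only happens when the json format is enabled under search.formats in settings.yml and requested explicitly, so a standalone probe of this search step usually looks like the following sketch (the local URL and query are assumptions, not from the diff):

    import os
    import requests

    searxng_url = os.environ.get('SEARXNG_URL', 'http://127.0.0.1:8080/search')  # assumed local instance
    params = {
        "q": ":all !general what is trafilatura",  # requests URL-encodes parameters itself
        "language": "en-US",
        "time_range": "day",
        "format": "json",  # needed for response.json() to work on a default setup
    }
    response = requests.get(searxng_url, params=params)
    response.raise_for_status()
    for hit in response.json().get("results", [])[:3]:
        print(hit.get("title"), "->", hit.get("url"))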
@@ -44,7 +46,6 @@ def search_web_ref(query:str, debug=False):
             print("JSON Response:")
             pprint(search_results)
         pedding_urls = []
-
         conv_links = []
 
         if search_results.get('results'):
@@ -57,15 +58,15 @@
                 if url:
                     url_parsed = urlparse(url)
                     domain = url_parsed.netloc
-                    icon_url =
+                    icon_url = url_parsed.scheme + '://' + url_parsed.netloc + '/favicon.ico'
                     site_name = tldextract.extract(url).domain
 
                     conv_links.append({
-                        'site_name':site_name,
-                        'icon_url':icon_url,
-                        'title':name,
-                        'url':url,
-                        'snippet':snippet
+                        'site_name': site_name,
+                        'icon_url': icon_url,
+                        'title': name,
+                        'url': url,
+                        'snippet': snippet
                     })
 
         results = []
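The added icon_url line simply points at the site root's favicon.ico rather than calling a favicon service. A quick illustration of the two derived fields (the URL is made up):

    from urllib.parse import urlparse
    import tldextract

    url = "https://news.example.com/articles/42"
    parsed = urlparse(url)
    icon_url = parsed.scheme + '://' + parsed.netloc + '/favicon.ico'
    site_name = tldextract.extract(url).domain
    print(icon_url)   # https://news.example.com/favicon.ico
    print(site_name)  # example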
@@ -73,143 +74,113 @@ def search_web_ref(query:str, debug=False):
 
         executor = ThreadPoolExecutor(max_workers=10)
         for url in pedding_urls:
-            futures.append(executor.submit(extract_url_content,url))
+            futures.append(executor.submit(extract_url_content, url))
         try:
             for future in futures:
                 res = future.result(timeout=5)
                 results.append(res)
         except concurrent.futures.TimeoutError:
             print("任务执行超时")
-            executor.shutdown(wait=False,cancel_futures=True)
+            executor.shutdown(wait=False, cancel_futures=True)
 
         for content in results:
             if content and content.get('content'):
-
                 item_dict = {
-                    "url":content.get('url'),
+                    "url": content.get('url'),
                     "content": content.get('content'),
-                    "length":len(content.get('content'))
+                    "length": len(content.get('content'))
                 }
                 content_list.append(item_dict)
                 if debug:
                     print("URL: {}".format(url))
                     print("=================")
 
-        return
+        return content_list
     except Exception as ex:
         raise ex
 
 
-def gen_prompt(question,content_list, lang="zh-CN", context_length_limit=11000,debug=False):
-
+def gen_prompt(question, content_list, lang="zh-CN", context_length_limit=11000, debug=False):
     limit_len = (context_length_limit - 2000)
     if len(question) > limit_len:
         question = question[0:limit_len]
 
-    ref_content = [
+    ref_content = [item.get("content") for item in content_list]
 
-    answer_language = '
+    answer_language = 'Simplified Chinese'
     if lang == "zh-CN":
-        answer_language = '
+        answer_language = 'Simplified Chinese'
     if lang == "zh-TW":
-        answer_language = '
+        answer_language = 'Traditional Chinese'
     if lang == "en-US":
-        answer_language = ' English
-
+        answer_language = 'Professional English'
 
     if len(ref_content) > 0:
-
-        if
-
-
-
-
-
-
-
-
-
-
-
-        prompts = prompts
-
-
-
-
-
-You are a large language AI assistant develop by nash_su. You are given a user question, and please write clean, concise and accurate answer to the question. You will be given a set of related contexts to the question, each starting with a reference number like [[citation:x]], where x is a number. Please use the context and cite the context at the end of each sentence if applicable.
-Your answer must be correct, accurate and written by an expert using an unbiased and professional tone. Please limit to 1024 tokens. Do not give any information that is not related to the question, and do not repeat. Say "information is missing on" followed by the related topic, if the given context do not provide sufficient information.
-
-Please cite the contexts with the reference numbers, in the format [citation:x]. If a sentence comes from multiple contexts, please list all applicable citations, like [citation:3][citation:5]. Other than code and specific names and citations, your answer must be written in the same language as the question.
-Here are the set of contexts:
-''' + "\n\n" + "```"
-        ref_index = 1
-
-        for ref_text in ref_content:
-
-            prompts = prompts + "\n\n" + " [citation:{}] ".format(str(ref_index)) + ref_text
-            ref_index += 1
-
-        if len(prompts) >= limit_len:
-            prompts = prompts[0:limit_len]
-        prompts = prompts + '''
-```
-Above is the reference contexts. Remember, don't repeat the context word for word. Answer in ''' + answer_language + '''. If the response is lengthy, structure it in paragraphs and summarize where possible. Cite the context using the format [citation:x] where x is the reference number. If a sentence originates from multiple contexts, list all relevant citation numbers, like [citation:3][citation:5]. Don't cluster the citations at the end but include them in the answer where they correspond.
-Remember, don't blindly repeat the contexts verbatim. And here is the user question:
-''' + question
-
-
+        prompts = '''
+You are a large language AI assistant developed by nash_su. You are given a user question, and please write a clean, concise and accurate answer to the question. You will be given a set of related contexts to the question, each starting with a reference number like [[citation:x]], where x is a number. Please use the context and cite the context at the end of each sentence if applicable.
+Your answer must be correct, accurate and written by an expert using an unbiased and professional tone. Please limit to 1024 tokens. Do not give any information that is not related to the question, and do not repeat. Say "information is missing on" followed by the related topic, if the given context does not provide sufficient information.
+
+Please cite the contexts with the reference numbers, in the format [citation:x]. If a sentence comes from multiple contexts, please list all applicable citations, like [citation:3][citation:5]. Other than code and specific names and citations, your answer must be written in the same language as the question.
+Here are the set of contexts:
+''' + "\n\n" + "```"
+        ref_index = 1
+
+        for ref_text in ref_content:
+            prompts = prompts + "\n\n" + " [citation:{}] ".format(str(ref_index)) + ref_text
+            ref_index += 1
+
+        if len(prompts) >= limit_len:
+            prompts = prompts[0:limit_len]
+        prompts = prompts + '''
+```
+Above is the reference contexts. Remember, don't repeat the context word for word. Answer in ''' + answer_language + '''. If the response is lengthy, structure it in paragraphs and summarize where possible. Cite the context using the format [citation:x] where x is the reference number. If a sentence originates from multiple contexts, list all relevant citation numbers, like [citation:3][citation:5]. Don't cluster the citations at the end but include them in the answer where they correspond.
+Remember, don't blindly repeat the contexts verbatim. And here is the user question:
+''' + question
     else:
         prompts = question
 
     if debug:
         print(prompts)
-        print("总长度:"+ str(len(prompts)))
+        print("总长度:" + str(len(prompts)))
     return prompts
 
 
-def defaultchat(message, model:str, stream=True, debug=False):
+def defaultchat(message, model: str, stream=True, debug=False):
     openai.base_url = os.environ.get('OPENAI_BASE_URL')
     openai.api_key = os.environ.get('OPENAI_API_KEY')
     total_content = ""
-    #print(message)
     for chunk in openai.chat.completions.create(
         model=model,
         messages=message,
         stream=True,
-        max_tokens=3072,temperature=0.2
+        max_tokens=3072, temperature=0.2
     ):
         stream_resp = chunk.dict()
-        #print(stream_resp)
         token = stream_resp["choices"][0]["delta"].get("content", "")
-        #print(token)
         if token:
             total_content += token
             yield token
 
 def ask_gpt(message, model_id, debug=False):
-    #print(message)
     total_token = ""
     for token in defaultchat(message, model_id):
         if token:
             total_token += token
             yield token
 
-def summary_gpt(message,
-    #message = '\n'.join([msg.content for msg in message])
+def summary_gpt(message, model: str, debug=False):
     msgs = []
-    msgs.append({"role": "system", "content": '作为一位专业的问题审核专家,你的任务是确保每一个提问都是清晰、具体并且没有模糊歧义的,不需要在根据额外的内容就可以理解你的提问。在审阅提问时,请遵循以下规则进行优化:替换模糊的代名词,确保所有的人称和名词都有明确的指代,不允许出现"你我他这那"
-    msgs.append({"role": "user", "content":str(message)})
+    msgs.append({"role": "system", "content": '作为一位专业的问题审核专家,你的任务是确保每一个提问都是清晰、具体并且没有模糊歧义的,不需要在根据额外的内容就可以理解你的提问。在审阅提问时,请遵循以下规则进行优化:替换模糊的代名词,确保所有的人称和名词都有明确的指代,不允许出现"你我他这那"等这种类似的代名词;如果提问中包含泛指的名词,请根据上下文明确的定语,补充具体的细节以提供完整的信息;最后,只允许输出经过你精确优化的问题,不要有任何多余的文字。'})
+    msgs.append({"role": "user", "content": str(message)})
     json_data = {
-
-
-
-
-
-
-
-
+        "model": model,
+        "messages": msgs,
+        "temperature": 0.8,
+        "max_tokens": 2560,
+        "top_p": 1,
+        "frequency_penalty": 0,
+        "presence_penalty": 0,
+        "stop": None
     }
     apiurl = os.environ.get('OPENAI_BASE_URL')
     pooltoken = os.environ.get('OPENAI_API_KEY')
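To make the prompt assembly concrete, this is the citation scaffolding gen_prompt builds between its two instruction blocks, reduced to a two-item reference list (instruction text abbreviated; only the markup is the point):

    # Minimal rerun of the reference loop from gen_prompt above.
    ref_content = ["First source text.", "Second source text."]
    prompts = "<instructions>" + "\n\n" + "```"
    ref_index = 1
    for ref_text in ref_content:
        prompts = prompts + "\n\n" + " [citation:{}] ".format(str(ref_index)) + ref_text
        ref_index += 1
    print(prompts)
    # <instructions>
    #
    # ```
    #
    #  [citation:1] First source text.
    #
    #  [citation:2] Second source text.

The truncation runs before the closing instruction block and the question are appended, which is presumably why limit_len reserves 2000 characters of headroom below context_length_limit.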
@@ -217,12 +188,11 @@ def summary_gpt(message, model:str, debug=False):
         'Content-Type': 'application/json',
         'Authorization': 'Bearer {}'.format(pooltoken),
     }
-    response = requests.post(
+    response = requests.post(apiurl + 'chat/completions', headers=headers, json=json_data)
     res = json.loads(response.text)['choices'][0]['message']['content']
-    #print(res)
     return res
 
-def chat(prompt, model:str, stream=True, debug=False):
+def chat(prompt, model: str, stream=True, debug=False):
     openai.base_url = os.environ.get('OPENAI_BASE_URL')
     openai.api_key = os.environ.get('OPENAI_API_KEY')
     total_content = ""
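summary_gpt builds the endpoint with plain string concatenation, apiurl + 'chat/completions', so OPENAI_BASE_URL has to end in a slash (for example https://api.openai.com/v1/). The same call as a self-contained sketch, with placeholder values:

    import json
    import os

    import requests

    apiurl = os.environ.get('OPENAI_BASE_URL', 'https://api.openai.com/v1/')  # trailing slash matters
    headers = {
        'Content-Type': 'application/json',
        'Authorization': 'Bearer {}'.format(os.environ.get('OPENAI_API_KEY', '')),
    }
    json_data = {
        "model": "gpt-3.5-turbo",  # placeholder model name
        "messages": [{"role": "user", "content": "Rewrite this question so it is self-contained."}],
        "temperature": 0.8,
        "max_tokens": 2560,
    }
    response = requests.post(apiurl + 'chat/completions', headers=headers, json=json_data)
    print(json.loads(response.text)['choices'][0]['message']['content'])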
@@ -233,26 +203,21 @@ def chat(prompt, model:str, stream=True, debug=False):
             "content": prompt
         }],
         stream=True,
-        max_tokens=3072,temperature=0.2
+        max_tokens=3072, temperature=0.2
     ):
         stream_resp = chunk.dict()
         token = stream_resp["choices"][0]["delta"].get("content", "")
         if token:
-
             total_content += token
             yield token
     if debug:
         print(total_content)
-
 
-
-
-def ask_internet(query:str, model:str, debug=False):
-
-    content_list = search_web_ref(query,debug=debug)
+def ask_internet(query: str, model: str, debug=False):
+    content_list = search_web_ref(query, lang="en-US", debug=debug)  # make sure English is used
     if debug:
         print(content_list)
-    prompt = gen_prompt(query,content_list,context_length_limit=6000,debug=debug)
+    prompt = gen_prompt(query, content_list, context_length_limit=6000, debug=debug)
     total_token = ""
 
     # collect the entire answer content
@@ -261,16 +226,9 @@ def ask_internet(query:str, model:str, debug=False):
         if token:
             total_token += token
             response_content += token
-            #yield token  # the raw answer
 
     # process citation links
     if content_list:
-        # use a more flexible way of recognizing citations
-        import re
-
-        # process citation links
-        if content_list:
-            # build a map from citations to URLs
         citation_map = {f"citation:{i+1}": content_list[i].get('url') for i in range(len(content_list))}
 
         # replace all citations with links
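Because the replace loop wraps each marker in link syntax while keeping the original surrounding brackets, [citation:1] comes out as [[citation:1](url)]. A canned run of the exact loop from the diff (sample strings made up):

    content_list = [{"url": "https://example.com/a"}, {"url": "https://example.com/b"}]
    response_content = "Paris is the capital of France [citation:1][citation:2]."

    citation_map = {f"citation:{i+1}": content_list[i].get('url') for i in range(len(content_list))}
    modified_content = response_content
    for citation, url in citation_map.items():
        if url:
            modified_content = modified_content.replace(citation, f"[{citation}]({url})")
    print(modified_content)
    # Paris is the capital of France [[citation:1](https://example.com/a)][[citation:2](https://example.com/b)].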
@@ -279,8 +237,6 @@
             if url:
                 modified_content = modified_content.replace(citation, f"[{citation}]({url})")
 
-        # output the modified content
-        #yield "\n\nContent with rewritten citation links:\n"
         yield modified_content
 
     yield "\n\n"
@@ -291,8 +247,7 @@
     yield "参考资料:\n"
     count = 1
     for url_content in content_list:
-
-
-
-
-
+        url = url_content.get('url')
+        yield "*[{}. {}]({})*".format(str(count), url, url)
+        yield "\n"
+        count += 1
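Taken together, a minimal driver for the updated module could look like the following, assuming free_ask_internet.py is importable and the three environment variables point at live services (every value below is a placeholder):

    import os

    os.environ['SEARXNG_URL'] = 'http://127.0.0.1:8080/search'   # placeholder
    os.environ['OPENAI_BASE_URL'] = 'http://127.0.0.1:8000/v1/'  # placeholder, note the trailing slash
    os.environ['OPENAI_API_KEY'] = 'sk-placeholder'

    from free_ask_internet import ask_internet

    # ask_internet streams markdown: the answer with linked [citation:x]
    # markers, then a "参考资料" (references) list of source URLs.
    for chunk in ask_internet("What is SearXNG?", model="gpt-3.5-turbo", debug=False):
        print(chunk, end="")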