JohnSmith9982 committed on
Commit
11eb8f3
·
1 Parent(s): 7e611fd

Upload 57 files

Browse files
ChuanhuChatbot.py CHANGED
@@ -15,7 +15,6 @@ from modules.models.models import get_model
15
 
16
  gr.Chatbot._postprocess_chat_messages = postprocess_chat_messages
17
  gr.Chatbot.postprocess = postprocess
18
- PromptHelper.compact_text_chunks = compact_text_chunks
19
 
20
  with open("assets/custom.css", "r", encoding="utf-8") as f:
21
  customCSS = f.read()
@@ -244,7 +243,7 @@ with gr.Blocks(css=customCSS, theme=small_and_beautiful_theme) as demo:
244
  lines=1,
245
  )
246
 
247
- with gr.Accordion(i18n("网络设置"), open=False, visible=False):
248
  # 优先展示自定义的api_host
249
  apihostTxt = gr.Textbox(
250
  show_label=True,
@@ -333,7 +332,7 @@ with gr.Blocks(css=customCSS, theme=small_and_beautiful_theme) as demo:
333
  submitBtn.click(**transfer_input_args).then(**chatgpt_predict_args, api_name="predict").then(**end_outputing_args)
334
  submitBtn.click(**get_usage_args)
335
 
336
- index_files.change(handle_file_upload, [current_model, index_files, chatbot], [index_files, chatbot, status_display])
337
 
338
  emptyBtn.click(
339
  reset,
@@ -467,7 +466,12 @@ demo.title = i18n("川虎Chat 🚀")
467
  if __name__ == "__main__":
468
  reload_javascript()
469
  demo.queue(concurrency_count=CONCURRENT_COUNT).launch(
 
 
 
 
470
  favicon_path="./assets/favicon.ico",
 
471
  )
472
  # demo.queue(concurrency_count=CONCURRENT_COUNT).launch(server_name="0.0.0.0", server_port=7860, share=False) # 可自定义端口
473
  # demo.queue(concurrency_count=CONCURRENT_COUNT).launch(server_name="0.0.0.0", server_port=7860,auth=("在这里填写用户名", "在这里填写密码")) # 可设置用户名与密码
 
15
 
16
  gr.Chatbot._postprocess_chat_messages = postprocess_chat_messages
17
  gr.Chatbot.postprocess = postprocess
 
18
 
19
  with open("assets/custom.css", "r", encoding="utf-8") as f:
20
  customCSS = f.read()
 
243
  lines=1,
244
  )
245
 
246
+ with gr.Accordion(i18n("网络设置"), open=False):
247
  # 优先展示自定义的api_host
248
  apihostTxt = gr.Textbox(
249
  show_label=True,
 
332
  submitBtn.click(**transfer_input_args).then(**chatgpt_predict_args, api_name="predict").then(**end_outputing_args)
333
  submitBtn.click(**get_usage_args)
334
 
335
+ index_files.change(handle_file_upload, [current_model, index_files, chatbot, language_select_dropdown], [index_files, chatbot, status_display])
336
 
337
  emptyBtn.click(
338
  reset,
 
466
  if __name__ == "__main__":
467
  reload_javascript()
468
  demo.queue(concurrency_count=CONCURRENT_COUNT).launch(
469
+ server_name=server_name,
470
+ server_port=server_port,
471
+ share=share,
472
+ auth=auth_list if authflag else None,
473
  favicon_path="./assets/favicon.ico",
474
+ inbrowser=not dockerflag, # 禁止在docker下开启inbrowser
475
  )
476
  # demo.queue(concurrency_count=CONCURRENT_COUNT).launch(server_name="0.0.0.0", server_port=7860, share=False) # 可自定义端口
477
  # demo.queue(concurrency_count=CONCURRENT_COUNT).launch(server_name="0.0.0.0", server_port=7860,auth=("在这里填写用户名", "在这里填写密码")) # 可设置用户名与密码
modules/__pycache__/config.cpython-39.pyc CHANGED
Binary files a/modules/__pycache__/config.cpython-39.pyc and b/modules/__pycache__/config.cpython-39.pyc differ
 
modules/__pycache__/index_func.cpython-39.pyc CHANGED
Binary files a/modules/__pycache__/index_func.cpython-39.pyc and b/modules/__pycache__/index_func.cpython-39.pyc differ
 
modules/__pycache__/overwrites.cpython-39.pyc CHANGED
Binary files a/modules/__pycache__/overwrites.cpython-39.pyc and b/modules/__pycache__/overwrites.cpython-39.pyc differ
 
modules/__pycache__/presets.cpython-39.pyc CHANGED
Binary files a/modules/__pycache__/presets.cpython-39.pyc and b/modules/__pycache__/presets.cpython-39.pyc differ
 
modules/config.py CHANGED
@@ -24,7 +24,8 @@ __all__ = [
24
  "server_name",
25
  "server_port",
26
  "share",
27
- "hide_history_when_not_logged_in"
 
28
  ]
29
 
30
  # 添加一个统一的config文件,避免文件过多造成的疑惑(优先级最低)
@@ -76,6 +77,9 @@ my_api_key = os.environ.get("OPENAI_API_KEY", my_api_key)
76
  xmchat_api_key = config.get("xmchat_api_key", "")
77
  os.environ["XMCHAT_API_KEY"] = xmchat_api_key
78
 
 
 
 
79
  render_latex = config.get("render_latex", True)
80
 
81
  if render_latex:
@@ -102,6 +106,12 @@ api_host = os.environ.get("api_host", config.get("api_host", ""))
102
  if api_host:
103
  shared.state.set_api_host(api_host)
104
 
 
 
 
 
 
 
105
  @contextmanager
106
  def retrieve_openai_api(api_key = None):
107
  old_api_key = os.environ.get("OPENAI_API_KEY", "")
 
24
  "server_name",
25
  "server_port",
26
  "share",
27
+ "hide_history_when_not_logged_in",
28
+ "default_chuanhu_assistant_model"
29
  ]
30
 
31
  # 添加一个统一的config文件,避免文件过多造成的疑惑(优先级最低)
 
77
  xmchat_api_key = config.get("xmchat_api_key", "")
78
  os.environ["XMCHAT_API_KEY"] = xmchat_api_key
79
 
80
+ google_palm_api_key = config.get("google_palm_api_key", "")
81
+ os.environ["GOOGLE_PALM_API_KEY"] = google_palm_api_key
82
+
83
  render_latex = config.get("render_latex", True)
84
 
85
  if render_latex:
 
106
  if api_host:
107
  shared.state.set_api_host(api_host)
108
 
109
+ default_chuanhu_assistant_model = config.get("default_chuanhu_assistant_model", "gpt-3.5-turbo")
110
+ os.environ["GOOGLE_CSE_ID"] = config.get("GOOGLE_CSE_ID", "")
111
+ os.environ["GOOGLE_API_KEY"] = config.get("GOOGLE_API_KEY", "")
112
+ os.environ["WOLFRAM_ALPHA_APPID"] = config.get("WOLFRAM_ALPHA_APPID", "")
113
+ os.environ["SERPAPI_API_KEY"] = config.get("SERPAPI_API_KEY", "")
114
+
115
  @contextmanager
116
  def retrieve_openai_api(api_key = None):
117
  old_api_key = os.environ.get("OPENAI_API_KEY", "")
modules/index_func.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+
4
+ import colorama
5
+ import PyPDF2
6
+ from tqdm import tqdm
7
+
8
+ from modules.presets import *
9
+ from modules.utils import *
10
+ from modules.config import local_embedding
11
+
12
+
13
def get_index_name(file_src):
    """Return an MD5 hex digest identifying a set of uploaded files.

    The paths (the ``name`` attribute of every item in ``file_src``) are
    ordered by base name, and the raw bytes of each file are fed, one file
    after another, into a single MD5 hash.

    Args:
        file_src: Iterable of objects exposing a ``name`` attribute that holds
            a readable filesystem path.

    Returns:
        The hexadecimal MD5 digest of the concatenated file contents.
    """
    ordered_paths = sorted((item.name for item in file_src), key=os.path.basename)

    digest = hashlib.md5()
    for path in ordered_paths:
        with open(path, "rb") as stream:
            # Read in 8 KiB blocks until read() returns the empty bytes object.
            for block in iter(lambda: stream.read(8192), b""):
                digest.update(block)

    return digest.hexdigest()
24
+
25
+
26
def get_documents(file_src):
    """Load uploaded files and split them into token-sized chunks.

    Args:
        file_src: Iterable of uploaded-file objects; only their ``name``
            attribute (a filesystem path) is read.

    Returns:
        A list of langchain ``Document`` objects. PDF, Word, PowerPoint, EPUB
        and plain-text content is split with a ``TokenTextSplitter``
        (chunk size 500, overlap 30). For ``.xlsx`` files every element
        returned by ``excel_to_string`` becomes one unsplit ``Document``.
        A file that fails to load is logged and skipped.
    """
    from langchain.schema import Document
    from langchain.text_splitter import TokenTextSplitter

    text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=30)

    documents = []
    logging.debug("Loading documents...")
    logging.debug(f"file_src: {file_src}")
    for file in file_src:
        filepath = file.name
        filename = os.path.basename(filepath)
        # Compare extensions case-insensitively so that e.g. "REPORT.PDF" is
        # not routed to the plain-text loader.
        file_type = os.path.splitext(filename)[1].lower()
        logging.info(f"loading file: {filename}")
        try:
            if file_type == ".pdf":
                logging.debug("Loading PDF...")
                try:
                    from modules.pdf_func import parse_pdf
                    from modules.config import advance_docs

                    two_column = advance_docs["pdf"].get("two_column", False)
                    pdftext = parse_pdf(filepath, two_column).text
                except Exception:
                    # The structured parser is best-effort; fall back to a
                    # plain page-by-page extraction with PyPDF2.
                    with open(filepath, "rb") as pdf_file:
                        pdf_reader = PyPDF2.PdfReader(pdf_file)
                        pdftext = "".join(
                            page.extract_text() or ""
                            for page in tqdm(pdf_reader.pages)
                        )
                # split_documents() iterates over a list of documents, so the
                # single PDF document must be wrapped in a list.
                texts = [Document(page_content=pdftext, metadata={"source": filepath})]
            elif file_type == ".docx":
                logging.debug("Loading Word...")
                from langchain.document_loaders import UnstructuredWordDocumentLoader

                loader = UnstructuredWordDocumentLoader(filepath)
                texts = loader.load()
            elif file_type == ".pptx":
                logging.debug("Loading PowerPoint...")
                from langchain.document_loaders import UnstructuredPowerPointLoader

                loader = UnstructuredPowerPointLoader(filepath)
                texts = loader.load()
            elif file_type == ".epub":
                logging.debug("Loading EPUB...")
                from langchain.document_loaders import UnstructuredEPubLoader

                loader = UnstructuredEPubLoader(filepath)
                texts = loader.load()
            elif file_type == ".xlsx":
                logging.debug("Loading Excel...")
                text_list = excel_to_string(filepath)
                documents.extend(
                    Document(page_content=elem, metadata={"source": filepath})
                    for elem in text_list
                )
                # Excel content is stored as-is and is not token-split.
                continue
            else:
                logging.debug("Loading text file...")
                from langchain.document_loaders import TextLoader

                loader = TextLoader(filepath, "utf8")
                texts = loader.load()
        except Exception:
            import traceback

            logging.error(f"Error loading file: {filename}")
            traceback.print_exc()
            # Without this, `texts` would be unbound (first file) or still
            # hold the previous file's documents, which would then be split
            # and appended a second time.
            continue

        documents.extend(text_splitter.split_documents(texts))
    logging.debug("Documents loaded.")
    return documents
90
+
91
+
92
def construct_index(
    api_key,
    file_src,
    max_input_size=4096,
    num_outputs=5,
    max_chunk_overlap=20,
    chunk_size_limit=600,
    embedding_limit=None,
    separator=" ",
):
    """Build, or load from the on-disk cache, a FAISS index for ``file_src``.

    The index is stored under ``./index/<name>`` where ``<name>`` is the value
    returned by ``get_index_name(file_src)``; when that path already exists
    the cached index is loaded instead of being rebuilt.

    Args:
        api_key: OpenAI API key written to ``OPENAI_API_KEY``. When falsy a
            placeholder key is written instead.
        file_src: Uploaded-file objects, passed to ``get_index_name`` and
            ``get_documents``.
        max_input_size, num_outputs, max_chunk_overlap, chunk_size_limit,
        embedding_limit, separator: Accepted for backward compatibility with
            existing callers; this implementation does not use them.

    Returns:
        The FAISS vector store, or ``None`` when building the index failed
        (the error is logged and the traceback printed).
    """
    from langchain.vectorstores import FAISS

    if api_key:
        os.environ["OPENAI_API_KEY"] = api_key
    else:
        # Because of a design flaw in a dependency, an API key must always be
        # present here, even if it is only a placeholder.
        os.environ["OPENAI_API_KEY"] = "sk-xxxxxxx"

    index_name = get_index_name(file_src)
    index_path = f"./index/{index_name}"
    if local_embedding:
        from langchain.embeddings.huggingface import HuggingFaceEmbeddings

        embeddings = HuggingFaceEmbeddings(model_name = "sentence-transformers/distiluse-base-multilingual-cased-v2")
    else:
        from langchain.embeddings import OpenAIEmbeddings

        embeddings = OpenAIEmbeddings()

    if os.path.exists(index_path):
        logging.info("找到了缓存的索引文件,加载中……")
        return FAISS.load_local(index_path, embeddings)

    try:
        documents = get_documents(file_src)
        logging.info("构建索引中……")
        with retrieve_proxy():
            index = FAISS.from_documents(documents, embeddings)
        logging.debug("索引构建完成!")
        os.makedirs("./index", exist_ok=True)
        index.save_local(index_path)
        logging.debug("索引已保存至本地!")
        return index
    except Exception as e:
        import traceback

        # The exception needs a %s placeholder: passing it as a bare extra
        # argument makes the logging module report a formatting error
        # instead of emitting the message.
        logging.error("索引构建失败!%s", e)
        traceback.print_exc()
        return None
modules/models/ChuanhuAgent.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.chains.summarize import load_summarize_chain
2
+ from langchain import PromptTemplate, LLMChain
3
+ from langchain.chat_models import ChatOpenAI
4
+ from langchain.prompts import PromptTemplate
5
+ from langchain.text_splitter import TokenTextSplitter
6
+ from langchain.embeddings import OpenAIEmbeddings
7
+ from langchain.vectorstores import FAISS
8
+ from langchain.chains import RetrievalQA
9
+ from langchain.agents import load_tools
10
+ from langchain.agents import initialize_agent
11
+ from langchain.agents import AgentType
12
+ from langchain.docstore.document import Document
13
+ from langchain.tools import BaseTool, StructuredTool, Tool, tool
14
+ from langchain.callbacks.stdout import StdOutCallbackHandler
15
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
16
+ from langchain.callbacks.manager import BaseCallbackManager
17
+
18
+ from typing import Any, Dict, List, Optional, Union
19
+
20
+ from langchain.callbacks.base import BaseCallbackHandler
21
+ from langchain.input import print_text
22
+ from langchain.schema import AgentAction, AgentFinish, LLMResult
23
+
24
+ from pydantic import BaseModel, Field
25
+
26
+ import requests
27
+ from bs4 import BeautifulSoup
28
+ from threading import Thread, Condition
29
+ from collections import deque
30
+
31
+ from .base_model import BaseLLMModel, CallbackToIterator, ChuanhuCallbackHandler
32
+ from ..config import default_chuanhu_assistant_model
33
+ from ..presets import SUMMARIZE_PROMPT, i18n
34
+ from ..index_func import construct_index
35
+
36
+ from langchain.callbacks import get_openai_callback
37
+ import os
38
+ import gradio as gr
39
+ import logging
40
+
41
# Argument schema for agent tools that take a single webpage URL. In this file
# it is passed as ``args_schema`` to the "Summary Webpage" and
# "Query Knowledge Base" tools.
class WebBrowsingInput(BaseModel):
    # Address of the page the tool should operate on.
    url: str = Field(description="URL of a webpage")
43
+
44
# Argument schema for the "Ask Webpage" tool: a page URL plus the question to
# answer from that page's content.
class WebAskingInput(BaseModel):
    # Address of the page to read.
    url: str = Field(description="URL of a webpage")
    # Question to be answered using the page's content.
    question: str = Field(description="Question that you want to know the answer to, based on the webpage's content.")
47
+
48
+
49
class ChuanhuAgent_Client(BaseLLMModel):
    """Agent-style chat model built on langchain tools.

    The agent can search the web, do math, query arXiv/Wikipedia, summarise
    or question a webpage and, once files have been uploaded, query a FAISS
    index built from those files.
    """

    # Seconds to wait for a webpage before giving up; without a timeout
    # requests.get() may block the agent thread indefinitely.
    URL_FETCH_TIMEOUT = 30

    def __init__(self, model_name, openai_api_key, user_name="") -> None:
        """Create the LLMs, the summarisation chain and the tool list.

        Args:
            model_name: Display name of the model; a name containing "Pro"
                selects the Google-search / Wolfram-Alpha tool set.
            openai_api_key: API key used for the chat and embedding models.
            user_name: Forwarded to the base class as ``user``.
        """
        super().__init__(model_name=model_name, user=user_name)
        self.text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=30)
        self.api_key = openai_api_key
        self.llm = ChatOpenAI(openai_api_key=openai_api_key, temperature=0, model_name=default_chuanhu_assistant_model)
        self.cheap_llm = ChatOpenAI(openai_api_key=openai_api_key, temperature=0, model_name="gpt-3.5-turbo")
        PROMPT = PromptTemplate(template=SUMMARIZE_PROMPT, input_variables=["text"])
        self.summarize_chain = load_summarize_chain(self.cheap_llm, chain_type="map_reduce", return_intermediate_steps=True, map_prompt=PROMPT, combine_prompt=PROMPT)
        self.index_summary = None
        self.index = None
        if "Pro" in self.model_name:
            self.tools = load_tools(["google-search-results-json", "llm-math", "arxiv", "wikipedia", "wolfram-alpha"], llm=self.llm)
        else:
            self.tools = load_tools(["ddg-search", "llm-math", "arxiv", "wikipedia"], llm=self.llm)

        self.tools.append(
            Tool.from_function(
                func=self.summary_url,
                name="Summary Webpage",
                description="useful when you need to know the overall content of a webpage.",
                args_schema=WebBrowsingInput
            )
        )

        self.tools.append(
            StructuredTool.from_function(
                func=self.ask_url,
                name="Ask Webpage",
                description="useful when you need to ask detailed questions about a webpage.",
                args_schema=WebAskingInput
            )
        )

    def handle_file_upload(self, files, chatbot, language):
        """Index the uploaded files and store a summary of their content.

        Args:
            files: Uploaded-file objects; nothing is indexed when falsy.
            chatbot: Chat display state, returned unchanged.
            language: Language name appended to the summary prompt.

        Returns:
            ``(gr.Files.update(), chatbot, status)`` where ``status`` is the
            "index built" message after a successful build and an empty
            Markdown update otherwise.

        Raises:
            AssertionError: If the index could not be constructed.
        """
        status = gr.Markdown.update()
        if files:
            index = construct_index(self.api_key, file_src=files)
            assert index is not None, "获取索引失败"
            self.index = index
            status = i18n("索引构建完成")
            # Summarize the document
            logging.info(i18n("生成内容总结中……"))
            with get_openai_callback() as cb:
                # ChatOpenAI() below reads its key from the environment.
                os.environ["OPENAI_API_KEY"] = self.api_key
                prompt_template = "Write a concise summary of the following:\n\n{text}\n\nCONCISE SUMMARY IN " + language + ":"
                PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])
                llm = ChatOpenAI()
                chain = load_summarize_chain(llm, chain_type="map_reduce", return_intermediate_steps=True, map_prompt=PROMPT, combine_prompt=PROMPT)
                summary = chain({"input_documents": list(index.docstore.__dict__["_dict"].values())}, return_only_outputs=True)["output_text"]
                logging.info(f"Summary: {summary}")
                self.index_summary = summary
                logging.info(cb)
        return gr.Files.update(), chatbot, status

    def query_index(self, query):
        """Answer ``query`` from the uploaded-file index.

        Returns:
            The retrieval-QA answer, or the string ``"Error during query."``
            when no index has been built yet.
        """
        if self.index is None:
            # The original had this string as a bare expression, so the tool
            # silently returned None.
            return "Error during query."
        retriever = self.index.as_retriever()
        qa = RetrievalQA.from_chain_type(llm=self.llm, chain_type="stuff", retriever=retriever)
        return qa.run(query)

    def summary(self, text):
        """Summarise ``text`` with the map-reduce summarisation chain."""
        texts = Document(page_content=text)
        texts = self.text_splitter.split_documents([texts])
        return self.summarize_chain({"input_documents": texts}, return_only_outputs=True)["output_text"]

    def fetch_url_content(self, url):
        """Download ``url`` and return the concatenated text of its ``<p>`` tags.

        Raises:
            requests.RequestException: On network errors, including a timeout
                after ``URL_FETCH_TIMEOUT`` seconds.
        """
        response = requests.get(url, timeout=self.URL_FETCH_TIMEOUT)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract all paragraph text
        text = ''.join(s.getText() for s in soup.find_all('p'))
        logging.info(f"Extracted text from {url}")
        return text

    def summary_url(self, url):
        """Return a summary of the webpage at ``url`` (tool entry point)."""
        text = self.fetch_url_content(url)
        text_summary = self.summary(text)
        url_content = "webpage content summary:\n" + text_summary

        return url_content

    def ask_url(self, url, question):
        """Answer ``question`` from the content of the webpage at ``url``.

        The page text is split, embedded into a temporary FAISS store and
        queried with the cheaper LLM.
        """
        text = self.fetch_url_content(url)
        texts = Document(page_content=text)
        texts = self.text_splitter.split_documents([texts])
        # use embedding
        embeddings = OpenAIEmbeddings(openai_api_key=self.api_key)

        # create vectorstore
        db = FAISS.from_documents(texts, embeddings)
        retriever = db.as_retriever()
        qa = RetrievalQA.from_chain_type(llm=self.cheap_llm, chain_type="stuff", retriever=retriever)
        return qa.run(f"{question} Reply in 中文")

    def get_answer_at_once(self):
        """Run the agent on the latest history entry and return ``(reply, -1)``."""
        question = self.history[-1]["content"]
        agent = initialize_agent(self.tools, self.llm, agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION, verbose=True)
        reply = agent.run(input=f"{question} Reply in 简体中文")
        return reply, -1

    def get_answer_stream_iter(self):
        """Run the agent in a worker thread and yield the growing reply text.

        Callback output from the agent is pushed through a
        ``CallbackToIterator``; every yielded value is the text accumulated
        so far.
        """
        question = self.history[-1]["content"]
        it = CallbackToIterator()
        manager = BaseCallbackManager(handlers=[ChuanhuCallbackHandler(it.callback)])

        def thread_func():
            try:
                # Work on a copy: appending to self.tools would add another
                # "Query Knowledge Base" tool on every call.
                tools = list(self.tools)
                if self.index is not None:
                    # NOTE(review): WebBrowsingInput describes its field as a
                    # URL although query_index takes a free-text query;
                    # a dedicated schema may be clearer - confirm.
                    tools.append(
                        Tool.from_function(
                            func=self.query_index,
                            name="Query Knowledge Base",
                            description=f"useful when you need to know about: {self.index_summary}",
                            args_schema=WebBrowsingInput
                        )
                    )
                agent = initialize_agent(tools, self.llm, agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION, verbose=True, callback_manager=manager)
                reply = agent.run(input=f"{question} Reply in 简体中文")
                it.callback(reply)
            except Exception:
                logging.exception("Agent run failed")
            finally:
                # Always release the consumer; otherwise a failing agent
                # would leave the generator below blocked forever.
                it.finish()

        t = Thread(target=thread_func)
        t.start()
        partial_text = ""
        for value in it:
            partial_text += value
            yield partial_text
modules/models/PaLM.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .base_model import BaseLLMModel, CallbackToIterator, ChuanhuCallbackHandler
2
+ from langchain.chat_models import ChatGooglePalm
3
+ import os
4
+
5
class PaLM_Client(BaseLLMModel):
    """Chat client backed by langchain's ``ChatGooglePalm``."""

    def __init__(self, model_name, user="", user_name="") -> None:
        """Create the client.

        Args:
            model_name: Name of the model, forwarded to the base class.
            user: User identifier (original parameter, kept for
                compatibility).
            user_name: Alias of ``user``; ``get_model`` constructs this class
                with ``user_name=...``, which the original signature rejected.

        Raises:
            KeyError: If ``GOOGLE_PALM_API_KEY`` is not set in the environment.
        """
        # Pass by keyword, as ChuanhuAgent_Client does, so the user value
        # cannot land in a different positional parameter of the base class.
        super().__init__(model_name=model_name, user=user_name or user)
        self.llm = ChatGooglePalm(google_api_key=os.environ["GOOGLE_PALM_API_KEY"])

    def get_answer_at_once(self):
        """Send the conversation to PaLM and return ``(reply_text, -1)``.

        The original called ``generate`` with the raw history dicts and
        returned nothing. The second element mirrors what
        ``ChuanhuAgent_Client.get_answer_at_once`` returns.
        """
        from langchain.schema import AIMessage, HumanMessage, SystemMessage

        # NOTE(review): assumes history entries are dicts with OpenAI-style
        # "role" values ("user" / "system" / anything else = assistant) -
        # confirm against BaseLLMModel.
        message_types = {"user": HumanMessage, "system": SystemMessage}
        messages = [
            message_types.get(item.get("role"), AIMessage)(content=item["content"])
            for item in self.history
        ]
        reply = self.llm(messages)
        return reply.content, -1
modules/models/__pycache__/ChuanhuAgent.cpython-39.pyc CHANGED
Binary files a/modules/models/__pycache__/ChuanhuAgent.cpython-39.pyc and b/modules/models/__pycache__/ChuanhuAgent.cpython-39.pyc differ
 
modules/models/__pycache__/base_model.cpython-39.pyc CHANGED
Binary files a/modules/models/__pycache__/base_model.cpython-39.pyc and b/modules/models/__pycache__/base_model.cpython-39.pyc differ
 
modules/models/__pycache__/models.cpython-39.pyc CHANGED
Binary files a/modules/models/__pycache__/models.cpython-39.pyc and b/modules/models/__pycache__/models.cpython-39.pyc differ
 
modules/models/base_model.py CHANGED
@@ -18,12 +18,85 @@ import asyncio
18
  import aiohttp
19
  from enum import Enum
20
 
 
 
 
 
 
 
 
 
 
 
 
21
  from ..presets import *
22
- from ..llama_func import *
23
  from ..utils import *
24
  from .. import shared
25
  from ..config import retrieve_proxy
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  class ModelType(Enum):
29
  Unknown = -1
@@ -34,6 +107,8 @@ class ModelType(Enum):
34
  StableLM = 4
35
  MOSS = 5
36
  YuanAI = 6
 
 
37
 
38
  @classmethod
39
  def get_type(cls, model_name: str):
@@ -53,6 +128,10 @@ class ModelType(Enum):
53
  model_type = ModelType.MOSS
54
  elif "yuanai" in model_name_lower:
55
  model_type = ModelType.YuanAI
 
 
 
 
56
  else:
57
  model_type = ModelType.Unknown
58
  return model_type
@@ -178,12 +257,12 @@ class BaseLLMModel:
178
  status_text = self.token_message()
179
  return chatbot, status_text
180
 
181
- def handle_file_upload(self, files, chatbot):
182
  """if the model accepts multi modal input, implement this function"""
183
  status = gr.Markdown.update()
184
  if files:
185
- construct_index(self.api_key, file_src=files)
186
- status = "索引构建完成"
187
  return gr.Files.update(), chatbot, status
188
 
189
  def prepare_inputs(self, real_inputs, use_websearch, files, reply_language, chatbot):
@@ -192,53 +271,20 @@ class BaseLLMModel:
192
  limited_context = False
193
  fake_inputs = real_inputs
194
  if files:
195
- from llama_index.indices.vector_store.base_query import GPTVectorStoreIndexQuery
196
- from llama_index.indices.query.schema import QueryBundle
197
  from langchain.embeddings.huggingface import HuggingFaceEmbeddings
198
- from langchain.chat_models import ChatOpenAI
199
- from llama_index import (
200
- GPTSimpleVectorIndex,
201
- ServiceContext,
202
- LangchainEmbedding,
203
- OpenAIEmbedding,
204
- )
205
  limited_context = True
206
  msg = "加载索引中……"
207
  logging.info(msg)
208
- # yield chatbot + [(inputs, "")], msg
209
  index = construct_index(self.api_key, file_src=files)
210
  assert index is not None, "获取索引失败"
211
  msg = "索引获取成功,���成回答中……"
212
  logging.info(msg)
213
- if local_embedding or self.model_type != ModelType.OpenAI:
214
- embed_model = LangchainEmbedding(HuggingFaceEmbeddings(model_name = "sentence-transformers/distiluse-base-multilingual-cased-v2"))
215
- else:
216
- embed_model = OpenAIEmbedding()
217
- # yield chatbot + [(inputs, "")], msg
218
  with retrieve_proxy():
219
- prompt_helper = PromptHelper(
220
- max_input_size=4096,
221
- num_output=5,
222
- max_chunk_overlap=20,
223
- chunk_size_limit=600,
224
- )
225
- from llama_index import ServiceContext
226
-
227
- service_context = ServiceContext.from_defaults(
228
- prompt_helper=prompt_helper, embed_model=embed_model
229
- )
230
- query_object = GPTVectorStoreIndexQuery(
231
- index.index_struct,
232
- service_context=service_context,
233
- similarity_top_k=5,
234
- vector_store=index._vector_store,
235
- docstore=index._docstore,
236
- response_synthesizer=None
237
- )
238
- query_bundle = QueryBundle(real_inputs)
239
- nodes = query_object.retrieve(query_bundle)
240
- reference_results = [n.node.text for n in nodes]
241
- reference_results = add_source_numbers(reference_results, use_source=False)
242
  display_append = add_details(reference_results)
243
  display_append = "\n\n" + "".join(display_append)
244
  real_inputs = (
 
18
  import aiohttp
19
  from enum import Enum
20
 
21
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
22
+ from langchain.callbacks.manager import BaseCallbackManager
23
+
24
+ from typing import Any, Dict, List, Optional, Union
25
+
26
+ from langchain.callbacks.base import BaseCallbackHandler
27
+ from langchain.input import print_text
28
+ from langchain.schema import AgentAction, AgentFinish, LLMResult
29
+ from threading import Thread, Condition
30
+ from collections import deque
31
+
32
  from ..presets import *
33
+ from ..index_func import *
34
  from ..utils import *
35
  from .. import shared
36
  from ..config import retrieve_proxy
37
 
38
class CallbackToIterator:
    """Turn values pushed through a callback into a blocking iterator.

    A producer calls ``callback(value)`` for every value and ``finish()``
    once it is done. Iterating the object yields the values in the order
    they were pushed, blocks while none are available, and stops after
    ``finish()`` has been called and the queue is empty.
    """

    def __init__(self):
        self.queue = deque()      # values pushed but not yet consumed
        self.cond = Condition()   # guards queue / finished and signals changes
        self.finished = False     # set by finish(); no more values will come

    def callback(self, result):
        """Enqueue ``result`` and wake a waiting consumer."""
        with self.cond:
            self.queue.append(result)
            self.cond.notify()  # Wake up the generator.

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next value, blocking until one arrives or finish() is called.

        Raises:
            StopIteration: When finished and no queued values remain.
        """
        with self.cond:
            # Loop guards against wake-ups that find nothing to consume.
            while not self.queue and not self.finished:
                self.cond.wait()
            if not self.queue:
                raise StopIteration()
            return self.queue.popleft()

    def finish(self):
        """Mark the stream as complete and release every waiting consumer."""
        with self.cond:
            self.finished = True
            # "finished" is a state change relevant to all waiters; notify()
            # would wake only one and leave any others blocked forever.
            self.cond.notify_all()
64
+
65
class ChuanhuCallbackHandler(BaseCallbackHandler):
    """Callback handler that forwards agent output to a plain callable.

    Agent action logs, tool observations, the final agent log and streamed
    LLM tokens are all passed, as strings, to the callable given at
    construction time.
    """

    def __init__(self, callback) -> None:
        """Store the callable that receives every piece of output."""
        self.callback = callback

    def on_agent_action(
        self, action: AgentAction, color: Optional[str] = None, **kwargs: Any
    ) -> Any:
        """Forward the log text of the action the agent decided on."""
        action_log = action.log
        self.callback(action_log)

    def on_tool_end(
        self,
        output: str,
        color: Optional[str] = None,
        observation_prefix: Optional[str] = None,
        llm_prefix: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """Forward a tool's output, wrapped in the optional prefixes.

        The pieces are emitted in order: observation prefix (if given), the
        tool output, then the LLM prefix (if given).
        """
        pieces = []
        if observation_prefix is not None:
            pieces.append(f"\n\n{observation_prefix}")
        pieces.append(output)
        if llm_prefix is not None:
            pieces.append(f"\n\n{llm_prefix}")
        for piece in pieces:
            self.callback(piece)

    def on_agent_finish(
        self, finish: AgentFinish, color: Optional[str] = None, **kwargs: Any
    ) -> None:
        """Forward the agent's final log followed by a blank line."""
        closing_text = f"{finish.log}\n\n"
        self.callback(closing_text)

    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
        """Forward a streamed LLM token (only fires when streaming is enabled)."""
        self.callback(token)
99
+
100
 
101
  class ModelType(Enum):
102
  Unknown = -1
 
107
  StableLM = 4
108
  MOSS = 5
109
  YuanAI = 6
110
+ ChuanhuAgent = 7
111
+ PaLM = 8
112
 
113
  @classmethod
114
  def get_type(cls, model_name: str):
 
128
  model_type = ModelType.MOSS
129
  elif "yuanai" in model_name_lower:
130
  model_type = ModelType.YuanAI
131
+ elif "川虎助理" in model_name_lower:
132
+ model_type = ModelType.ChuanhuAgent
133
+ elif "palm" in model_name_lower:
134
+ model_type = ModelType.PaLM
135
  else:
136
  model_type = ModelType.Unknown
137
  return model_type
 
257
  status_text = self.token_message()
258
  return chatbot, status_text
259
 
260
+ def handle_file_upload(self, files, chatbot, language):
261
  """if the model accepts multi modal input, implement this function"""
262
  status = gr.Markdown.update()
263
  if files:
264
+ index = construct_index(self.api_key, file_src=files)
265
+ status = i18n("索引构建完成")
266
  return gr.Files.update(), chatbot, status
267
 
268
  def prepare_inputs(self, real_inputs, use_websearch, files, reply_language, chatbot):
 
271
  limited_context = False
272
  fake_inputs = real_inputs
273
  if files:
 
 
274
  from langchain.embeddings.huggingface import HuggingFaceEmbeddings
275
+ from langchain.vectorstores.base import VectorStoreRetriever
 
 
 
 
 
 
276
  limited_context = True
277
  msg = "加载索引中……"
278
  logging.info(msg)
 
279
  index = construct_index(self.api_key, file_src=files)
280
  assert index is not None, "获取索引失败"
281
  msg = "索引获取成功,���成回答中……"
282
  logging.info(msg)
 
 
 
 
 
283
  with retrieve_proxy():
284
+ retriever = VectorStoreRetriever(vectorstore=index, search_type="similarity_score_threshold",search_kwargs={"k":6, "score_threshold": 0.5})
285
+ relevant_documents = retriever.get_relevant_documents(real_inputs)
286
+ reference_results = [[d.page_content.strip("�"), os.path.basename(d.metadata["source"])] for d in relevant_documents]
287
+ reference_results = add_source_numbers(reference_results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  display_append = add_details(reference_results)
289
  display_append = "\n\n" + "".join(display_append)
290
  real_inputs = (
modules/models/models.py CHANGED
@@ -22,7 +22,7 @@ from enum import Enum
22
  import uuid
23
 
24
  from ..presets import *
25
- from ..llama_func import *
26
  from ..utils import *
27
  from .. import shared
28
  from ..config import retrieve_proxy, usage_limit
@@ -494,7 +494,7 @@ class XMChat(BaseLLMModel):
494
  limited_context = False
495
  return limited_context, fake_inputs, display_append, real_inputs, chatbot
496
 
497
- def handle_file_upload(self, files, chatbot):
498
  """if the model accepts multi modal input, implement this function"""
499
  if files:
500
  for file in files:
@@ -557,6 +557,7 @@ def get_model(
557
  config.local_embedding = True
558
  # del current_model.model
559
  model = None
 
560
  try:
561
  if model_type == ModelType.OpenAI:
562
  logging.info(f"正在加载OpenAI模型: {model_name}")
@@ -602,10 +603,15 @@ def get_model(
602
  elif model_type == ModelType.YuanAI:
603
  from .inspurai import Yuan_Client
604
  model = Yuan_Client(model_name, api_key=access_key, user_name=user_name, system_prompt=system_prompt)
 
 
 
 
 
 
605
  elif model_type == ModelType.Unknown:
606
  raise ValueError(f"未知模型: {model_name}")
607
  logging.info(msg)
608
- chatbot = gr.Chatbot.update(label=model_name)
609
  except Exception as e:
610
  logging.error(e)
611
  msg = f"{STANDARD_ERROR_MSG}: {e}"
 
22
  import uuid
23
 
24
  from ..presets import *
25
+ from ..index_func import *
26
  from ..utils import *
27
  from .. import shared
28
  from ..config import retrieve_proxy, usage_limit
 
494
  limited_context = False
495
  return limited_context, fake_inputs, display_append, real_inputs, chatbot
496
 
497
+ def handle_file_upload(self, files, chatbot, language):
498
  """if the model accepts multi modal input, implement this function"""
499
  if files:
500
  for file in files:
 
557
  config.local_embedding = True
558
  # del current_model.model
559
  model = None
560
+ chatbot = gr.Chatbot.update(label=model_name)
561
  try:
562
  if model_type == ModelType.OpenAI:
563
  logging.info(f"正在加载OpenAI模型: {model_name}")
 
603
  elif model_type == ModelType.YuanAI:
604
  from .inspurai import Yuan_Client
605
  model = Yuan_Client(model_name, api_key=access_key, user_name=user_name, system_prompt=system_prompt)
606
+ elif model_type == ModelType.ChuanhuAgent:
607
+ from .ChuanhuAgent import ChuanhuAgent_Client
608
+ model = ChuanhuAgent_Client(model_name, access_key, user_name=user_name)
609
+ elif model_type == ModelType.PaLM:
610
+ from .PaLM import PaLM_Client
611
+ model = PaLM_Client(model_name, user_name=user_name)
612
  elif model_type == ModelType.Unknown:
613
  raise ValueError(f"未知模型: {model_name}")
614
  logging.info(msg)
 
615
  except Exception as e:
616
  logging.error(e)
617
  msg = f"{STANDARD_ERROR_MSG}: {e}"
modules/overwrites.py CHANGED
@@ -1,24 +1,14 @@
1
  from __future__ import annotations
2
  import logging
3
 
4
- from llama_index import Prompt
5
  from typing import List, Tuple
6
  import mdtex2html
7
  from gradio_client import utils as client_utils
8
 
9
  from modules.presets import *
10
- from modules.llama_func import *
11
  from modules.config import render_latex
12
 
13
- def compact_text_chunks(self, prompt: Prompt, text_chunks: List[str]) -> List[str]:
14
- logging.debug("Compacting text chunks...🚀🚀🚀")
15
- combined_str = [c.strip() for c in text_chunks if c.strip()]
16
- combined_str = [f"[{index+1}] {c}" for index, c in enumerate(combined_str)]
17
- combined_str = "\n\n".join(combined_str)
18
- # resplit based on self.max_chunk_overlap
19
- text_splitter = self.get_text_splitter_given_prompt(prompt, 1, padding=1)
20
- return text_splitter.split_text(combined_str)
21
-
22
 
23
  def postprocess(
24
  self,
 
1
  from __future__ import annotations
2
  import logging
3
 
 
4
  from typing import List, Tuple
5
  import mdtex2html
6
  from gradio_client import utils as client_utils
7
 
8
  from modules.presets import *
9
+ from modules.index_func import *
10
  from modules.config import render_latex
11
 
 
 
 
 
 
 
 
 
 
12
 
13
  def postprocess(
14
  self,
modules/pdf_func.py CHANGED
@@ -1,11 +1,11 @@
1
  from types import SimpleNamespace
2
  import pdfplumber
3
  import logging
4
- from llama_index import Document
5
 
6
  def prepare_table_config(crop_page):
7
  """Prepare table查找边界, 要求page为原始page
8
-
9
  From https://github.com/jsvine/pdfplumber/issues/242
10
  """
11
  page = crop_page.root_page # root/parent
@@ -60,7 +60,7 @@ def get_title_with_cropped_page(first_page):
60
  title_bottom = word.bottom
61
  elif word.text == "Abstract": # 获取页面abstract
62
  top = word.top
63
-
64
  user_info = [i["text"] for i in extract_words(first_page.within_bbox((x0,title_bottom,x1,top)))]
65
  # 裁剪掉上半部分, within_bbox: full_included; crop: partial_included
66
  return title, user_info, first_page.within_bbox((x0,top,x1,bottom))
@@ -75,7 +75,7 @@ def get_column_cropped_pages(pages, two_column=True):
75
  new_pages.append(right)
76
  else:
77
  new_pages.append(page)
78
-
79
  return new_pages
80
 
81
  def parse_pdf(filename, two_column = True):
@@ -94,7 +94,7 @@ def parse_pdf(filename, two_column = True):
94
  name_top=name_top,
95
  name_bottom=name_bottom,
96
  record_chapter_name = True,
97
-
98
  page_start=page_start,
99
  page_stop=None,
100
 
@@ -114,7 +114,7 @@ def parse_pdf(filename, two_column = True):
114
  if word.size >= 11: # 出现chapter name
115
  if cur_chapter is None:
116
  cur_chapter = create_chapter(page.page_number, word.top, word.bottom)
117
- elif not cur_chapter.record_chapter_name or (cur_chapter.name_bottom != cur_chapter.name_bottom and cur_chapter.name_top != cur_chapter.name_top):
118
  # 不再继续写chapter name
119
  cur_chapter.page_stop = page.page_number # stop id
120
  chapters.append(cur_chapter)
@@ -143,7 +143,7 @@ def parse_pdf(filename, two_column = True):
143
  text += f"The {idx}th Chapter {chapter.name}: " + " ".join(chapter.text) + "\n"
144
 
145
  logging.getLogger().setLevel(level)
146
- return Document(text=text, extra_info={"title": title})
147
 
148
  BASE_POINTS = """
149
  1. Who are the authors?
 
1
  from types import SimpleNamespace
2
  import pdfplumber
3
  import logging
4
+ from langchain.docstore.document import Document
5
 
6
  def prepare_table_config(crop_page):
7
  """Prepare table查找边界, 要求page为原始page
8
+
9
  From https://github.com/jsvine/pdfplumber/issues/242
10
  """
11
  page = crop_page.root_page # root/parent
 
60
  title_bottom = word.bottom
61
  elif word.text == "Abstract": # 获取页面abstract
62
  top = word.top
63
+
64
  user_info = [i["text"] for i in extract_words(first_page.within_bbox((x0,title_bottom,x1,top)))]
65
  # 裁剪掉上半部分, within_bbox: full_included; crop: partial_included
66
  return title, user_info, first_page.within_bbox((x0,top,x1,bottom))
 
75
  new_pages.append(right)
76
  else:
77
  new_pages.append(page)
78
+
79
  return new_pages
80
 
81
  def parse_pdf(filename, two_column = True):
 
94
  name_top=name_top,
95
  name_bottom=name_bottom,
96
  record_chapter_name = True,
97
+
98
  page_start=page_start,
99
  page_stop=None,
100
 
 
114
  if word.size >= 11: # 出现chapter name
115
  if cur_chapter is None:
116
  cur_chapter = create_chapter(page.page_number, word.top, word.bottom)
117
+ elif not cur_chapter.record_chapter_name or (cur_chapter.name_bottom != cur_chapter.name_bottom and cur_chapter.name_top != cur_chapter.name_top):
118
  # 不再继续写chapter name
119
  cur_chapter.page_stop = page.page_number # stop id
120
  chapters.append(cur_chapter)
 
143
  text += f"The {idx}th Chapter {chapter.name}: " + " ".join(chapter.text) + "\n"
144
 
145
  logging.getLogger().setLevel(level)
146
+ return Document(page_content=text, metadata={"title": title})
147
 
148
  BASE_POINTS = """
149
  1. Who are the authors?
modules/presets.py CHANGED
@@ -58,9 +58,9 @@ APPEARANCE_SWITCHER = """
58
  </div>
59
  """
60
 
61
- SUMMARIZE_PROMPT = "你是谁?我们刚才聊了什么?" # 总结对话时的 prompt
62
-
63
  ONLINE_MODELS = [
 
 
64
  "gpt-3.5-turbo",
65
  "gpt-3.5-turbo-0301",
66
  "gpt-4",
@@ -68,6 +68,7 @@ ONLINE_MODELS = [
68
  "gpt-4-32k",
69
  "gpt-4-32k-0314",
70
  "xmchat",
 
71
  "yuanai-1.0-base_10B",
72
  "yuanai-1.0-translate",
73
  "yuanai-1.0-dialog",
@@ -164,6 +165,12 @@ Reply in {reply_language}
164
  If the context isn't useful, return the original answer.
165
  """
166
 
 
 
 
 
 
 
167
  ALREADY_CONVERTED_MARK = "<!-- ALREADY CONVERTED BY PARSER. -->"
168
 
169
  small_and_beautiful_theme = gr.themes.Soft(
 
58
  </div>
59
  """
60
 
 
 
61
  ONLINE_MODELS = [
62
+ "川虎助理",
63
+ "川虎助理 Pro",
64
  "gpt-3.5-turbo",
65
  "gpt-3.5-turbo-0301",
66
  "gpt-4",
 
68
  "gpt-4-32k",
69
  "gpt-4-32k-0314",
70
  "xmchat",
71
+ "Google PaLM",
72
  "yuanai-1.0-base_10B",
73
  "yuanai-1.0-translate",
74
  "yuanai-1.0-dialog",
 
165
  If the context isn't useful, return the original answer.
166
  """
167
 
168
+ SUMMARIZE_PROMPT = """Write a concise summary of the following:
169
+
170
+ {text}
171
+
172
+ CONCISE SUMMARY IN 中文:"""
173
+
174
  ALREADY_CONVERTED_MARK = "<!-- ALREADY CONVERTED BY PARSER. -->"
175
 
176
  small_and_beautiful_theme = gr.themes.Soft(
requirements.txt CHANGED
@@ -8,11 +8,20 @@ tqdm
8
  colorama
9
  duckduckgo_search==2.9.5
10
  Pygments
11
- llama_index==0.5.25
12
- langchain<0.0.150
13
  markdown
14
  PyPDF2
15
  pdfplumber
16
  pandas
17
  commentjson
18
  openpyxl
 
 
 
 
 
 
 
 
 
 
 
8
  colorama
9
  duckduckgo_search==2.9.5
10
  Pygments
11
+ langchain==0.0.170
 
12
  markdown
13
  PyPDF2
14
  pdfplumber
15
  pandas
16
  commentjson
17
  openpyxl
18
+ pandoc
19
+ wolframalpha
20
+ faiss-cpu
21
+ google-search-results
22
+ arxiv
23
+ wikipedia
24
+ google.generativeai
25
+ openai
26
+ unstructured
27
+ google-api-python-client